随机森林RandomForest
基本思想
- 随机森林的结果是依赖于多棵决策树的结果,这是一种集成学习的思想
- 随机思想:随机选取一定数量的特征(需调参),再从中选取最优的几组特征(GINI、熵、OOB)
- 票选的思想:最终的结果是多棵决策树票选的结果
基本步骤
- 随机抽样训练决策树(选取最优的几组决策树)
- 随机选取属性做节点分裂属性(选取最优的几组属性)
- 重复步骤2,直到不能再分裂
- 建立大量决策树,形成森林
GINI指数(评估特征的重要性)
- 用随机森林进行特征重要性评估的思想其实很简单,说白了就是看看每个特征在随机森林中的每棵树上做了多大的贡献,然后取个平均值,最后比一比特征之间的贡献大小。
- Gini指数越小表示集合中被选中的样本被分错的概率越小,也就是说集合的纯度越高;反之,集合越不纯。
- Gini = 1 - Σ pᵢ² (i = 1..C,C 为类别数;pᵢ 为样本属于第 i 类的概率)
# Gini index: measures impurity; the smaller the value, the purer the set.
def calGini(dataSet):
    """Gini impurity of dataSet, using the last column of each row as the label."""
    label_counts = calculateDiffCount(dataSet)
    total = len(dataSet)
    gini = 1.0
    for count in label_counts.values():
        gini -= (count / total) ** 2
    return gini
实现(基于GINI指数,Pima数据集)
- Pima印第安人数据集
【1】Pregnancies:怀孕次数
【2】Glucose:葡萄糖
【3】BloodPressure:血压 (mm Hg)
【4】SkinThickness:皮层厚度 (mm)
【5】Insulin:胰岛素 2小时血清胰岛素(mu U/ml)
【6】BMI:体重指数 体重(kg)/身高(m)^2
【7】DiabetesPedigreeFunction:糖尿病谱系功能
【8】Age:年龄 (岁)
【9】Outcome:类标变量 (0或1)
6 148 72 35 0 33.6 0.627 50 1
1 85 66 29 0 26.6 0.351 31 0
8 183 64 0 0 23.3 0.672 32 1
1 89 66 23 94 28.1 0.167 21 0
0 137 40 35 168 43.1 2.288 33 1
5 116 74 0 0 25.6 0.201 30 0
3 78 50 32 88 31 0.248 26 1
......
import numpy as np
import random
# 一个是随机选取样本,一个是随机选取特征
# 决策树的个数、特征属性的个数、递归次数(即决策树的深度)
# 决策树部分
# Gini impurity of a data set: 1 - sum(p_i^2) over the class labels found
# in the last column. 0.0 means perfectly pure; larger means more mixed.
def calGini(dataSet):
    """Return the Gini impurity of dataSet (labels taken from the last column)."""
    counts = calculateDiffCount(dataSet)
    n = len(dataSet)
    impurity = 1.0
    for cnt in counts.values():
        impurity -= (cnt / n) ** 2
    return impurity
# Binary split of dataSet on column `col`: rows whose value is >= `value`
# go to the first list, the remaining rows to the second.
def splitData(dataSet, col, value):
    """Split dataSet into (rows where row[col] >= value, rows where row[col] < value)."""
    greater_equal = [row for row in dataSet if row[col] >= value]
    below = [row for row in dataSet if row[col] < value]
    return greater_equal, below
# Count how many rows carry each class label (last element of the row).
def calculateDiffCount(datas):
    """Return {label: occurrence count} over the last column of every row."""
    counts = {}
    for row in datas:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts
# Recursively grow a CART tree: at each node, try every distinct value of
# every candidate feature as a threshold and keep the split with the
# largest Gini gain.
def BuildCartDecisionTree(dataSet, features, maxDepth, depth):
    """Build a CART decision tree on dataSet.

    dataSet  -- list of rows; the last element of each row is the class label
    features -- candidate column indices to split on
    maxDepth -- maximum tree depth; reaching it turns this node into a leaf
    depth    -- current recursion depth (callers start at 1)

    Returns either a leaf ({label: count} dict) or an internal node, encoded
    as the tuple (split column, split value, false branch, true branch).
    """
    # Depth cut-off: return a leaf holding the label distribution.
    if depth >= maxDepth:
        return calculateDiffCount(dataSet)
    depth += 1
    currentgini = calGini(dataSet)  # impurity before splitting
    rows_length = len(dataSet)
    best_gini_gain = 0.0
    best_value = None   # (col, value) of the best split found so far
    best_set = None     # (rows >= value, rows < value) for that split
    for col in features:
        # Every distinct value of this column is a candidate threshold.
        values = set(row[col] for row in dataSet)
        for value in values:
            data1, data2 = splitData(dataSet, col, value)
            p = len(data1) / rows_length
            # Size-weighted impurity of the two children.
            gini = p * calGini(data1) + (1 - p) * calGini(data2)
            gain = currentgini - gini
            if gain > best_gini_gain:
                best_gini_gain = gain
                best_value = (col, value)
                best_set = (data1, data2)
    if best_gini_gain > 0.0:
        # Recurse on both halves; ">= value" rows form the true branch.
        trueBranch = BuildCartDecisionTree(best_set[0], features, maxDepth, depth)
        falseBranch = BuildCartDecisionTree(best_set[1], features, maxDepth, depth)
        return (best_value[0],
                best_value[1],
                falseBranch,
                trueBranch)
    # No split improves purity: make this node a leaf.
    # (Removed the unused `column_length` local from the original.)
    return calculateDiffCount(dataSet)
def getFeatures(dataSet, n_features):
    """Randomly pick n_features distinct column indices (the label column excluded)."""
    feature_columns = range(len(dataSet[0]) - 1)
    return random.sample(feature_columns, n_features)
# Load the Pima data set: comma-separated rows, all columns parsed as
# floats except the last one (class label), which is stored as an int.
def loadData():
    """Read ../Pima.csv and return a list of rows [float features..., int label]."""
    alldataMat = []
    # `with` guarantees the file handle is closed (the original leaked it).
    with open("../Pima.csv") as pima:
        for line in pima:
            lineArr = line.strip().split(',')
            float_list = [float(v) for v in lineArr]
            # Replace the label with its integer form for exact comparison.
            float_list[-1] = int(lineArr[-1])
            alldataMat.append(float_list)
    return alldataMat
# Shuffle the data set and cut it into `bag_nums` equally sized bags
# (used as resampled subsets for the individual trees).
# input(数据集,个数)
def spiltDataSet(dataSet, bag_nums):
    """Return a list of `bag_nums` bags, each len(dataSet)//bag_nums rows.

    Bug fixed: the original dropped one sample at every bag boundary (the
    row seen when a bag was already full was discarded) and never appended
    the last bag, so it produced fewer bags than requested. Every shuffled
    row is now kept until the requested number of bags is filled; any
    remainder smaller than a bag is discarded, as before.
    """
    bag_size = len(dataSet) // bag_nums
    shuffled = random.sample(range(len(dataSet)), len(dataSet))
    bags = []
    bag = []
    for index in shuffled:
        bag.append(dataSet[index])
        if len(bag) == bag_size:
            bags.append(bag)
            bag = []
            if len(bags) == bag_nums:
                break
    return bags
# Assemble the forest: each tree trains on a randomly chosen bag of the
# data with a randomly chosen subset of the features.
def buildRandomForest(n_trees, n_features, max_tree_depth, n_dataSets):
    """Build and return a list of n_trees CART trees.

    n_trees        -- number of trees in the forest
    n_features     -- features randomly drawn per tree
    max_tree_depth -- depth limit for every tree
    n_dataSets     -- number of bags the data is split into
    """
    dataMat = loadData()
    bags = spiltDataSet(dataMat, n_dataSets)
    forest = []
    for _ in range(n_trees):
        features = getFeatures(dataMat, n_features)
        bag = bags[random.randint(0, len(bags) - 1)]
        forest.append(BuildCartDecisionTree(bag, features, max_tree_depth, 1))
    return forest
# Route one sample down a tree until a leaf is reached.
def classifyByTree(tree, data):
    """Classify the feature row `data` with one tree.

    Internal nodes are tuples (col, value, falseBranch, trueBranch);
    leaves are {label: count} dicts. Bug fixed: the original detected
    internal nodes with len(tree) == 4, which misreads a leaf dict that
    happens to hold four distinct labels; checking the node's type is
    unambiguous.
    """
    if isinstance(tree, tuple):
        col, value, falseBranch, trueBranch = tree
        if data[col] >= value:
            return classifyByTree(trueBranch, data)
        return classifyByTree(falseBranch, data)
    # Leaf: majority label of the training rows that reached it.
    return max(tree, key=tree.get)
# Majority vote over every tree in the forest.
def classifyByForest(forest, data):
    """Return the label predicted by the most trees for the feature row `data`."""
    votes = {}
    for tree in forest:
        label = classifyByTree(tree, data)
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)
# Train a forest and measure its accuracy on the full data set.
def acc():
    """Build a 200-tree forest and return its accuracy on the Pima data.

    Bug fixed: the original scored a random len(dataMat) - 1 rows
    (arbitrarily skipping one) yet divided by len(dataMat), slightly
    under-reporting accuracy. Every row is now scored and the divisor
    matches the number of rows actually evaluated.

    NOTE(review): this is training-set accuracy — the forest is built
    from (bags of) the same data it is scored on.
    """
    forest = buildRandomForest(200, 3, 10, 5)
    dataMat = loadData()
    correct = 0
    for row in dataMat:
        if classifyByForest(forest, row) == row[-1]:
            correct += 1
    return correct / len(dataMat)