def chooseBestFeatureToSplit(dataSet):
    '''
    :param dataSet: the dataset
    :return: index of the feature with the largest information gain
    '''
    numFeatures = len(dataSet[0]) - 1       # number of feature columns; the last column is the label
    baseEntropy = calcShannonEnt(dataSet)   # Shannon entropy of the whole dataset
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]   # the i-th feature value of every sample
        uniqueVals = set(featList)                       # the distinct values of this feature
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value) # split the dataset once per feature value
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy after the split
        infoGain = baseEntropy - newEntropy              # information gain = entropy reduction
        if infoGain > bestInfoGain:                      # keep the feature with the largest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature                                   # index of the best feature
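This function relies on two helpers, calcShannonEnt and splitDataSet, which are presumably defined earlier in the chapter. For reference, here is a minimal sketch consistent with how they are called above (not necessarily the book's exact code):

from math import log

def calcShannonEnt(dataSet):
    # Shannon entropy H = -sum(p * log2(p)) over the class-label distribution
    labelCounts = {}
    for example in dataSet:
        label = example[-1]                          # the last column is the class label
        labelCounts[label] = labelCounts.get(label, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(len(dataSet))
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def splitDataSet(dataSet, axis, value):
    # Return the samples whose feature `axis` equals `value`,
    # with that feature column removed
    retDataSet = []
    for example in dataSet:
        if example[axis] == value:
            retDataSet.append(example[:axis] + example[axis + 1:])
    return retDataSet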
Recursively building the decision tree
The overall procedure: 1. get the dataset; 2. split it on the best feature; 3. recursively split each resulting subset.
When all features have been consumed but the class labels are still not unique, majority voting decides the classification of the child node.
import operator

def majorityCnt(classList):
    # Count each class label and return the most frequent one
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
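For example, majorityCnt(['yes', 'no', 'yes']) returns 'yes', since 'yes' occurs most often in the label list.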
Building the tree with recursion
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]   # all class labels in this dataset
    if classList.count(classList[0]) == len(classList):
        return classList[0]                            # stop when every label is identical
    if len(dataSet[0]) == 1:                           # all features consumed but labels still mixed:
        return majorityCnt(classList)                  # fall back to majority voting
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]                   # the best feature for the current dataset
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])                              # remove the used feature name
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]                          # copy, so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),
                                                  subLabels)   # recursively build each subtree
    return myTree
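To see the three pieces work together, here is a small end-to-end run on a toy fish-identification dataset in the style of the book's example (the data values are illustrative):

# Columns: [no surfacing, flippers, label]
dataSet = [[1, 1, 'yes'],
           [1, 1, 'yes'],
           [1, 0, 'no'],
           [0, 1, 'no'],
           [0, 1, 'no']]
labels = ['no surfacing', 'flippers']
print(createTree(dataSet, labels[:]))   # pass a copy, since createTree mutates labels
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

The nested dict encodes the tree: each outer key is a feature name, each inner key a feature value, and each leaf a class label. Note that labels is passed as a copy because createTree deletes the chosen feature name in place.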