Coursework has been heavy lately QAQ, so I'll skip the chit-chat and go straight to the code ~
A simple decision tree example. Running result: [figure: the tree rendered by createPlot]

```python
from math import log
import operator
import matplotlib.pyplot as plt

# Box and arrow styles for the tree plot
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

# Plotting the tree with text annotations:
# draw a single node with an arrow pointing from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords="axes fraction",
                            xytext=centerPt, textcoords="axes fraction",
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)

# Label the edge between a parent and a child node
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == "dict":
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))   # width: number of leaves
    plotTree.totalD = float(getTreeDepth(inTree))  # depth: number of levels
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), "")
    plt.show()

# Build the toy dataset: two binary features plus a class label
def createDataSet():
    dataSet = [[1, 1, "yes"],
               [1, 1, "yes"],
               [1, 0, "no"],
               [0, 1, "no"],
               [0, 1, "no"]]
    labels = ["no surfacing", "flippers"]
    return dataSet, labels

# Shannon entropy of the dataset.
# The higher the entropy, the more mixed (disordered) the data is;
# adding more classes to the dataset raises it.
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}  # counts keyed by the last column ("yes" or "no")
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt

# Split the dataset on a given feature.
# axis is the column index of the feature, value is the value to match.
# Returns every row whose column `axis` equals `value`,
# with that column removed.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:             # e.g. featVec = [1, 1, "yes"]
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]            # data before the column
            reducedFeatVec.extend(featVec[axis + 1:])  # data after the column
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the feature whose split yields the largest information gain
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1          # here: 2
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # column i
        uniqueVals = set(featList)             # distinct values in the column
        newEntropy = 0.0
        for value in uniqueVals:
            # split on column i with this value
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:            # keep the best split so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # column index of the best split

# Given a list of class labels, return the most frequent class
def majorityCnt(classList):
    # dict keyed by the unique labels in classList,
    # valued by how often each label occurs
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # sort by count, descending
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the tree recursively from the dataset and the feature-label list
def createTree(dataSet, labels):
    # class labels, e.g. ["yes", "yes", "no", "no", "no"]
    classList = [example[-1] for example in dataSet]
    # Stop condition 1: all class labels are identical
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop condition 2: all features are used up but the subset is still
    # impure; we cannot return a single class label, so fall back to the
    # majority class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # column index of the best split
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # this feature is consumed
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so the recursion does not clobber labels
        # e.g. myTree["no surfacing"][0], myTree["no surfacing"][1], ...
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Count the leaf nodes
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == "dict":
            # a dict child is another decision node: recurse
            numLeafs += getNumLeafs(secondDict[key])
        else:
            # a leaf node: count it
            numLeafs += 1
    return numLeafs

# Depth of the tree
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == "dict":
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def main():
    dataSet, labels = createDataSet()
    # {"no surfacing": {0: "no", 1: {"flippers": {0: "no", 1: "yes"}}}}
    myTree = createTree(dataSet, labels)
    print("myTree:")
    print(myTree)
    createPlot(myTree)
    # More things to try:
    # print(getNumLeafs(myTree), getTreeDepth(myTree))
    # print(calcShannonEnt(dataSet))
    # dataSet[0][-1] = "maybe"   # adding a third class raises the entropy
    # print(calcShannonEnt(dataSet))
    # print(splitDataSet(dataSet, 0, 1))
    # print(splitDataSet(dataSet, 0, 0))

if __name__ == "__main__":
    main()
```
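Before moving on, here is a quick sketch of how to exercise these pieces on the toy dataset, with the values they should produce. Two "yes" and three "no" labels give an entropy of H = -(2/5)·log2(2/5) - (3/5)·log2(3/5) ≈ 0.971, and feature 0 ("no surfacing") has the larger information gain, so it becomes the root:

```python
dataSet, labels = createDataSet()

# Entropy of 2 "yes" / 3 "no": about 0.9710
print(calcShannonEnt(dataSet))

# Rows whose feature 0 equals the given value, with that column removed
print(splitDataSet(dataSet, 0, 1))   # [[1, "yes"], [1, "yes"], [0, "no"]]
print(splitDataSet(dataSet, 0, 0))   # [[1, "no"], [1, "no"]]

# Information gain: feature 0 gives about 0.420, feature 1 about 0.171
print(chooseBestFeatureToSplit(dataSet))  # -> 0, so "no surfacing" is the root
```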
The extended version below adds a classification function, pickle-based persistence, and a contact-lens prediction example. The dataset, entropy, split, tree-building, and plotting functions are identical to the first listing, so only the additions are shown:

```python
# Classify a sample by walking the finished decision tree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # map the feature-label string to a column index
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == "dict":
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

# Persist the decision tree with the pickle module
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, "wb")
    pickle.dump(inputTree, fw)
    fw.close()

# Load a pickled tree
def grabTree(filename):
    import pickle
    fr = open(filename, "rb")
    return pickle.load(fr)

# Predict contact-lens type with a decision tree
def predictTypes():
    fr = open("lenses.txt")
    # lenses.txt is tab-separated; split on tabs so that labels such as
    # "no lenses" stay intact as a single field.
    # [["young", "myope", "no", "reduced", "no lenses"], ...]
    lenses = [inst.strip().split("\t") for inst in fr.readlines()]
    lensesLabels = ["age", "prescript", "astigmatic", "tearRate"]
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    createPlot(lensesTree)

def main():
    predictTypes()

if __name__ == "__main__":
    main()
```
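A minimal usage sketch for the new functions, assuming both listings are loaded in the same module:

```python
dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels[:])  # pass a copy: createTree consumes labels

featLabels = ["no surfacing", "flippers"]
print(classify(myTree, featLabels, [1, 0]))  # -> "no"
print(classify(myTree, featLabels, [1, 1]))  # -> "yes"

# Round-trip the tree through pickle
storeTree(myTree, "classifierStorage.txt")
print(grabTree("classifierStorage.txt"))     # same dict as myTree
```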
Data
The contents of lenses.txt (fields separated by tabs):
```
young       myope  no   reduced  no lenses
young       myope  no   normal   soft
young       myope  yes  reduced  no lenses
young       myope  yes  normal   hard
young       hyper  no   reduced  no lenses
young       hyper  no   normal   soft
young       hyper  yes  reduced  no lenses
young       hyper  yes  normal   hard
pre         myope  no   reduced  no lenses
pre         myope  no   normal   soft
pre         myope  yes  reduced  no lenses
pre         myope  yes  normal   hard
pre         hyper  no   reduced  no lenses
pre         hyper  no   normal   soft
pre         hyper  yes  reduced  no lenses
pre         hyper  yes  normal   no lenses
presbyopic  myope  no   reduced  no lenses
presbyopic  myope  no   normal   no lenses
presbyopic  myope  yes  reduced  no lenses
presbyopic  myope  yes  normal   hard
presbyopic  hyper  no   reduced  no lenses
presbyopic  hyper  no   normal   soft
presbyopic  hyper  yes  reduced  no lenses
presbyopic  hyper  yes  normal   no lenses
```
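As a quick sanity check (a sketch, assuming lenses.txt sits in the working directory next to the script), the feature the algorithm picks for the root split is tearRate: the "reduced" branch is pure, since every sample with a reduced tear rate is labeled "no lenses".

```python
fr = open("lenses.txt")
lenses = [inst.strip().split("\t") for inst in fr.readlines()]

# Columns: age(0), prescript(1), astigmatic(2), tearRate(3)
print(chooseBestFeatureToSplit(lenses))  # -> 3, the tearRate column
```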