
Decision tree principle

A decision tree is a model that learns a tree-structured classifier from training data; it belongs to the family of discriminative models. The tree partitions the data by making a series of decisions, much like asking a sequence of questions. To classify a new item, the tree tests the relevant feature attribute starting at the root node, follows the branch that matches the attribute's value, and repeats the test at each internal node until a leaf is reached; the class stored at that leaf is the decision result.
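To make the root-to-leaf decision process concrete, here is a minimal sketch in which a tree is stored as nested dictionaries (the same representation the code later in this article uses). The toy tree and the sample values are made up for illustration only.

# Walk a nested-dict tree from the root, testing one attribute per level,
# until a leaf (the predicted class) is reached.
def walk_tree(tree, sample):
    if not isinstance(tree, dict):          # leaf node: return the class
        return tree
    feature = next(iter(tree))              # attribute tested at this node
    branch = tree[feature][sample[feature]] # follow the branch matching the sample's value
    return walk_tree(branch, sample)

toy_tree = {'outlook': {'sunny': 'go out',
                        'rainy': {'windy': {'yes': 'stay in', 'no': 'go out'}}}}
print(walk_tree(toy_tree, {'outlook': 'rainy', 'windy': 'no'}))   # -> 'go out'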

The decision tree algorithm is a kind of inductive classification algorithm: it mines useful rules from the training set and uses them to predict new data. The basic induction procedure is a greedy algorithm that builds the tree from the top down, making the locally optimal choice at each step. During tree generation, the attribute selection measure is the key design decision.

Decision tree algorithms, part 1: the ID3 algorithm

ID3 builds a classification model with a multi-way tree structure and selects features by information gain. It does not support continuous-valued features, missing-value handling, pruning, or using a feature attribute more than once.
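For reference, the feature selection in ID3 rests on two standard quantities (not spelled out in the original article): the empirical entropy of the data set D and the information gain of a feature A,

H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k

H(D \mid A) = \sum_{v \in \mathrm{values}(A)} \frac{|D_v|}{|D|} \, H(D_v)

\mathrm{Gain}(D, A) = H(D) - H(D \mid A)

where p_k is the proportion of samples in class k and D_v is the subset of D in which feature A takes the value v. ID3 splits on the feature with the largest Gain(D, A).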

The general flow of ID3 is as follows: compute the empirical entropy of the current data set; for each candidate feature, compute the information gain obtained by splitting on that feature; choose the feature with the largest gain and create one branch per value of that feature; then recurse on each branch with the remaining features, stopping when all samples at a node belong to one class or no features are left (in which case the majority class becomes the leaf).

The ID3 algorithm also has defects: it has no pruning strategy, so it overfits easily; the information gain criterion prefers features with a large number of distinct values, so the gain of an ID-like "number" feature is close to 1; it can only handle discretely distributed features; and missing values are not considered.
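The bias toward many-valued features can be seen with a toy calculation: a feature that takes a unique value for every sample (like a row number) splits the data into single-sample, perfectly pure subsets, so its conditional entropy is 0 and its information gain equals the full data set entropy, the largest gain any feature can achieve. The snippet below is a self-contained illustration, not part of the article's code.

from math import log
from collections import Counter

def entropy(labels):
    # Empirical entropy of a list of class labels.
    n = len(labels)
    return -sum(c / n * log(c / n, 2) for c in Counter(labels).values())

labels = ['yes', 'yes', 'no', 'no', 'yes', 'no']
h_d = entropy(labels)
# Splitting on a unique "id" feature gives one pure subset per sample,
# so the conditional entropy is 0 and the gain equals H(D).
h_d_given_id = sum((1 / len(labels)) * entropy([lab]) for lab in labels)
print(h_d, h_d - h_d_given_id)   # gain of the id-like feature == H(D)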

ID3 algorithm code implementation

The selected data set describes whether to go out today given the weather. After training, the decision tree is drawn.
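The exact contents of TreeTest.csv are not shown in the article; the code only assumes that every column except the last holds a discrete feature and that the last column holds the class label. A hypothetical layout that would fit the code (column names invented for illustration) might look like:

outlook,temperature,windy,go_out
sunny,hot,no,yes
rainy,mild,yes,no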

from math import log
import pandas as pd
import numpy as np
import operator
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt

# Load the data set: every row is one sample, the last column is the class label.
path = r'D:\MachineL\TreeTest.csv'
df = pd.read_csv(path)
dataSet = np.array(df.loc[:, :]).tolist()
print(dataSet)
labels = list(df.columns.values)
labels.pop()                      # drop the class-label column name, keep the feature names
print(labels)


def calcShannonEnt(dataSet):
    """Empirical (Shannon) entropy of the class label in dataSet."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt  # return empirical entropy


def splitDataSet(dataSet, axis, value):
    """Return the samples whose feature `axis` equals `value`, with that feature column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = list(featVec[:axis]) + list(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Pick the feature with the largest information gain."""
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        print("The gain of feature %d is %.3f" % (i, infoGain))
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Majority vote among the class labels (used when no features are left)."""
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels, featLabels):
    """Recursively build the ID3 tree as nested dictionaries."""
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                 # all samples share one class: make a leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)       # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]                    # ID3 uses each feature at most once
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]               # fresh copy for each branch
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, featLabels)
    return myTree


def getNumLeafs(myTree):
    """Count the leaf nodes (used to lay the plot out horizontally)."""
    numLeafs = 0
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Depth of the tree (used to lay the plot out vertically)."""
    maxDepth = 0
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one node with an arrow pointing to it from its parent."""
    arrow_args = dict(arrowstyle="<-")
    # Adjust the font path to one available on your system (needed for non-ASCII labels).
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args, fontproperties=font)


def plotMidText(cntrPt, parentPt, txtString):
    """Label the edge between a parent node and a child with the feature value."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively plot the tree."""
    decisionNode = dict(boxstyle="sawtooth", fc="0.8")
    leafNode = dict(boxstyle="round4", fc="0.8")
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = next(iter(myTree))
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    """Set up the figure and draw the whole tree."""
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


featLabels = []
myTree = createTree(dataSet, labels, featLabels)
print(myTree)
createPlot(myTree)

The visualization of the final decision tree is as follows:

The information gain of each feature is printed as follows: