【机器学习——决策树】——两种方法实现,含模型的保存和调用
生活随笔
收集整理的這篇文章主要介紹了
【机器学习——决策树】——两种方法实现,含模型的保存和调用
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
目錄
1、ID3算法
2、使用sklearn API——模型保存和調用成功
1、ID3算法
以下實現了決策樹的創建、可視化繪制、決策樹的保存和調用
但是在利用決策樹進行預測的時候出現錯誤
分類代碼
# Classify one sample with a decision tree.
def classify(inputTree, featLabels, testVec, default=None):
    """Walk a nested-dict decision tree and return the label for ``testVec``.

    The tree has the shape ``{feature_name: {feature_value: subtree_or_label}}``.

    The earlier version of this function failed with
    ``UnboundLocalError: local variable 'classLabel' referenced before
    assignment`` whenever no branch matched the sample's feature value
    (for example after a JSON round trip, which turns every key into a
    string).  It also used ``inputTree.keys()[0]``, which only works on
    Python 2.

    :param inputTree: decision tree as nested dicts.
    :param featLabels: feature names, in the same order as ``testVec``.
    :param testVec: feature values of the sample to classify.
    :param default: value returned when the tree has no branch for the
        sample's feature value.
    :return: the predicted class label, or ``default``.
    :raises ValueError: if a feature named in the tree is not in ``featLabels``.
    """
    firstStr = next(iter(inputTree))  # feature tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    value = testVec[featIndex]
    for key, branch in secondDict.items():
        # Compare the string forms as well so string keys from JSON match numbers.
        if key == value or str(key) == str(value):
            if isinstance(branch, dict):
                return classify(branch, featLabels, testVec, default)
            return branch
    return default
暫時未解決
完整代碼
"""ID3 decision tree: training, prediction, evaluation, JSON persistence, plotting."""
import json
import operator
from collections import Counter
from math import log

import numpy as np

# Sentinel meaning "the tree has no branch for this feature value".
_NO_BRANCH = object()


## Data set
def createDataSet(xlsPath):
    """Load the spreadsheet and return ``(dataSet, featureName)``.

    :param xlsPath: path handed to the project-local ``loaddatasets`` helper.
    :return: ``dataSet`` as a list of rows with the label appended as the
        last column, and the list of feature names.
    """
    # Imported lazily so the tree code can be imported without the project loader.
    from dataProcess import loaddatasets

    datas, labels = loaddatasets(xlsPath)
    labels = labels.reshape(-1, 1)
    dataSet = np.hstack((datas, labels)).tolist()
    # NOTE(review): some literals in this file look mis-encoded (e.g. "(fā)",
    # "(zhì)"); they are kept verbatim -- confirm against the original data.
    featureName = ['能效設計', '含油艙底水污染控制', '污油污染控制', '餐飲污水控制', '生活污水控制', '發(fā)動機排氣污染物控制',
                   '制冷劑', '滅火劑', '垃圾污染控制', '防止噪聲污染', '應用比例', '振動', '噪聲', '有害物質(zhì)的禁用和限用']
    return dataSet, featureName


## Split the data set
def splitDataSet(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, with that column removed.

    :param dataSet: list of rows (lists).
    :param axis: index of the feature column to split on.
    :param value: feature value to keep.
    :return: new list of new rows; the input is not modified.
    """
    return [featVec[:axis] + featVec[axis + 1:]
            for featVec in dataSet if featVec[axis] == value]


def majorityCnt(classList):
    """Return the most frequent element of ``classList`` (first seen wins ties)."""
    classCount = Counter(classList)
    # max() keeps the first maximum, matching a stable descending sort.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


## Shannon entropy -- always the uncertainty of the class label
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the last column of ``dataSet``."""
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


## Conditional entropy
def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """Return the conditional entropy of the label given feature ``i``.

    :param dataSet: list of rows with the label in the last column.
    :param i: index of the feature column.
    :param featList: values of feature ``i``; unused, kept for compatibility.
    :param uniqueVals: distinct values of feature ``i``.
    :return: sum over values of P(X_i = v) * H(Y | X_i = v).
    """
    total = float(len(dataSet))
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        prob = len(subDataSet) / total  # maximum-likelihood estimate
        ce += prob * calcShannonEnt(subDataSet)
    return ce


## Information gain
def calcInformationGain(dataSet, baseEntropy, i):
    """Return the information gain of feature ``i``.

    :param dataSet: list of rows with the label in the last column.
    :param baseEntropy: entropy of the label over ``dataSet``.
    :param i: index of the feature column.
    :return: ``baseEntropy`` minus the conditional entropy given feature ``i``.
    """
    featList = [example[i] for example in dataSet]
    uniqueVals = set(featList)
    newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)
    return baseEntropy - newEntropy


## Algorithm skeleton
def chooseBestFeatureToSplitByID3(dataSet):
    """Return the index of the feature with the largest positive information gain.

    :return: the feature index, or ``-1`` when no feature has a gain above zero.
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        infoGain = calcInformationGain(dataSet, baseEntropy, i)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def createTree(dataSet, featureName, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """Build a decision tree as nested dicts.

    :param dataSet: list of rows with the label in the last column.
    :param featureName: name of every feature column; NOT modified.
    :param chooseBestFeatureToSplitFunc: callable returning the index of the
        feature to split on, or a negative number when no split helps.
    :return: ``{feature_name: {feature_value: subtree_or_label}}`` or a bare
        label when the node is a leaf.
    :raises ValueError: if ``dataSet`` is empty.
    """
    if not dataSet:
        raise ValueError("createTree() needs a non-empty data set")
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one label: stop splitting
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)
    if bestFeat < 0:
        # No feature reduces the entropy.  Using -1 as an index would split
        # on the label column, so fall back to the majority label.
        return majorityCnt(classList)
    bestFeatLabel = featureName[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build a new list instead of `del featureName[bestFeat]`, which removed
    # the name from the caller's list and broke later predictions.
    subLabels = featureName[:bestFeat] + featureName[bestFeat + 1:]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, chooseBestFeatureToSplitFunc)
    return myTree


# Prediction
def _find_branch(branches, value):
    """Return the child of ``branches`` for ``value``, or ``_NO_BRANCH``.

    A tree loaded from JSON has string keys ("1", "1.0"), so after the exact
    lookup the string form and finally the numeric value are compared.
    """
    if value in branches:
        return branches[value]
    text = str(value)
    if text in branches:
        return branches[text]
    try:
        number = float(value)
    except (TypeError, ValueError):
        return _NO_BRANCH
    for key, child in branches.items():
        try:
            if float(key) == number:
                return child
        except (TypeError, ValueError):
            continue
    return _NO_BRANCH


def _walk_tree(tree, featureNames, testVec):
    """Follow ``testVec`` from the root to a leaf; return None if a branch is missing."""
    node = tree
    while isinstance(node, dict):
        rootName = next(iter(node))
        rootValue = testVec[featureNames.index(rootName)]
        node = _find_branch(node[rootName], rootValue)
        if node is _NO_BRANCH:
            return None
    return node


def predict_(inputTree, featLabels, testVec):
    """Return the label predicted for ``testVec`` (textbook-style entry point).

    The earlier version recursed into ``predict`` instead of itself and read
    a global that was unset when no branch matched.

    :param inputTree: decision tree as nested dicts.
    :param featLabels: feature names, in the order used by ``testVec``.
    :param testVec: feature values of the sample.
    :return: the predicted label, or None when the tree has no matching branch.
    """
    return _walk_tree(inputTree, featLabels, testVec)


def predict(tree, featureNames, testVec):
    """Return the label predicted for ``testVec``.

    The earlier version defined a nested function of the same name that
    returned nothing, so the result was always None.

    :param tree: decision tree as nested dicts (trained or loaded from JSON).
    :param featureNames: feature names, in the order used by ``testVec``.
    :param testVec: feature values of the sample.
    :return: the predicted label, or None when the tree has no matching branch.
    :raises ValueError: if a feature named in the tree is not in ``featureNames``.
    """
    return _walk_tree(tree, featureNames, testVec)


# Model accuracy
def evalute(testDataSets, featureName, tree):
    """Return the share of rows in ``testDataSets`` that ``tree`` labels correctly.

    :param testDataSets: list of rows with the true label in the last column.
    :param featureName: feature names, in column order.
    :param tree: decision tree as nested dicts.
    :return: accuracy between 0 and 1.
    :raises ValueError: if ``testDataSets`` is empty.
    """
    if not testDataSets:
        raise ValueError("evalute() needs at least one test sample")
    errorCnt = sum(
        1 for example in testDataSets
        if example[-1] != predict(tree, featureName, example[:-1])
    )
    return 1 - errorCnt / len(testDataSets)


# JSON encoder extended so numpy values can be stored
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to Python values."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # arrays are stored as plain lists
        return super(NpEncoder, self).default(obj)


# Save the model
def save_model(tree, model_save_path):
    """Write ``tree`` to ``model_save_path`` as indented JSON."""
    json_str = json.dumps(tree, indent=4, cls=NpEncoder)
    with open(model_save_path, "w", encoding="utf-8") as f:
        f.write(json_str)


# Load the model
def load_model(model_path):
    """Read a tree from the JSON file at ``model_path``.

    JSON object keys are always strings, so branch keys come back as strings.
    """
    with open(model_path, 'r', encoding="UTF-8") as f:
        tree = json.load(f)
    return tree


################################### Plotting ################################################
# Text-box and arrow styles
decisionNode = dict(boxstyle="round4", color='#3366FF')  # decision node
leafNode = dict(boxstyle="circle", color='#FF6633')  # leaf node
arrow_args = dict(arrowstyle="<-", color='g')  # arrow


# Draw an annotated node with an arrow from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Annotate ``createPlot.ax1`` with ``nodeTxt`` at ``centerPt``, arrow from ``parentPt``."""
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)


# Number of leaves
def getNumLeafs(myTree):
    """Return the number of leaves below the root of ``myTree``."""
    firstStr = next(iter(myTree))
    numLeafs = 0
    for child in myTree[firstStr].values():
        numLeafs += getNumLeafs(child) if isinstance(child, dict) else 1
    return numLeafs


# Depth of the tree
def getTreeDepth(myTree):
    """Return the number of decision levels in ``myTree``."""
    firstStr = next(iter(myTree))
    maxDepth = 0
    for child in myTree[firstStr].values():
        thisDepth = 1 + getTreeDepth(child) if isinstance(child, dict) else 1
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth


# Text on the edge between parent and child
def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` halfway between ``cntrPt`` and ``parentPt``."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below ``parentPt``.

    Layout state lives in the function attributes ``plotTree.xOff``,
    ``plotTree.yOff``, ``plotTree.totalW`` and ``plotTree.totalD``, which
    ``createPlot`` initialises.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = next(iter(myTree))
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # go one level down
    for key, child in secondDict.items():
        if isinstance(child, dict):
            plotTree(child, cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # back up to this level


def createPlot(inTree):
    """Draw ``inTree`` in matplotlib figure 1 and show it."""
    # Imported lazily so training and prediction do not require matplotlib.
    import matplotlib.pyplot as plt

    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
################################### Plotting ################################################


if __name__ == '__main__':
    from paths import abs_path

    # # Train the model
    # print("加載數(shù)據(jù)集")
    dataSets, featureName = createDataSet(abs_path + "\\data\\min_datas.xlsx")
    # print("創(chuàng)建決策樹")
    # mytree = createTree(dataSets, featureName)
    # print("決策樹:", mytree)
    # print("保存決策樹")
    # save_model(mytree, abs_path + "\\data\\decitionTree.json")

    # Evaluate the model
    print("評估決策樹:")
    print("調(diào)用決策樹模型")
    tree = load_model(abs_path + "\\data\\decitionTree.json")
    print(tree)
    acc = evalute(dataSets, featureName, tree)
    print("正確率為:%.2f" % acc)
    print("保存決策樹")
    save_model(tree, abs_path + "\\data\\decitionTree_%.2f.json" % (acc))

    # Predict with the model
    testdata = [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
    print("測試數(shù)據(jù):", testdata)
    print("真實標簽:", 0)
    # The call to predict() was missing here, so a tuple was printed instead.
    predict_label = predict(tree, featureName, testdata)
    print("預測標簽:", predict_label)

    # ############################ Draw the decision tree ###############################
    # from pylab import *
    # mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font
    # mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box in saved images
    # # Draw the decision tree
    # createPlot(tree)
    # ###################################################################

# 2. Using the sklearn API -- saving and loading the model works
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/1/4 11:57
# @Author : @linlianqin
# @Site :
# @File : test33.py
# @Software: PyCharm
# @description:
"""Train a small decision tree on iris, pickle it, reload it and compare."""
import pickle

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text


def storeTree(inputTree, filename):
    """Pickle ``inputTree`` to ``filename`` (binary mode is required)."""
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load and return the pickled object stored in ``filename``.

    Only unpickle files you created yourself: ``pickle.load`` can run
    arbitrary code from a crafted file.
    """
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


iris = load_iris()

decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
decision_tree = decision_tree.fit(iris.data, iris.target)
print(decision_tree)

# predict() expects a 2-D array, so the first sample is reshaped to (1, n_features).
first_sample = iris.data[0].reshape(1, -1)

print("訓練得到的模型")
print(decision_tree.predict(first_sample))

storeTree(decision_tree, '12.pkl')
tree = grabTree('12.pkl')
print("加載出來的模型")
# Predict with the reloaded model; the earlier code called decision_tree here,
# so the pickle round trip was never actually checked.
print(tree.predict(first_sample))
print(tree)
r = export_text(decision_tree, feature_names=iris['feature_names'])
print(r)

# Output of the run:
DecisionTreeClassifier(max_depth=2, random_state=0) 訓練得到的模型 [0] 加載出來的模型 [0] DecisionTreeClassifier(max_depth=2, random_state=0) |--- petal width (cm) <= 0.80 | |--- class: 0 |--- petal width (cm) > 0.80 | |--- petal width (cm) <= 1.75 | | |--- class: 1 | |--- petal width (cm) > 1.75 | | |--- class: 2注意:讀寫形式應該為wb,rb,不然會報錯
# Opening the file in text mode ('w' / 'r') fails with:
# TypeError: write() argument must be str, not bytes
def storeTree(inputTree, filename):
    """Pickle ``inputTree`` to ``filename``.

    The file is opened in binary mode because pickle writes bytes, and in a
    ``with`` block so it is closed even when pickling fails.
    """
    import pickle

    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load and return the pickled object stored in ``filename``.

    The earlier version never closed the file.  Only unpickle files you
    trust: ``pickle.load`` can run arbitrary code from a crafted file.
    """
    import pickle

    with open(filename, 'rb') as fr:
        return pickle.load(fr)

# The optimised code follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/12/31 20:35
# @Author : @linlianqin
# @Site :
# @File : decisionTree_.py
# @Software: PyCharm
# @description:
# Decision-tree classification; pay attention to how the data is prepared.
import pickle

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

from dataProcess import loaddatasets
from paths import abs_path


# Build the data set
def createDataSet(xlsPath):
    """Load features and labels via the project-local ``loaddatasets``.

    :return: ``(datas, levels)`` with ``levels`` reshaped to a column vector.
    """
    datas, levels = loaddatasets(xlsPath)
    levels = levels.reshape(-1, 1)
    return datas, levels


# Build the decision tree
def createTree(datas, labels):
    """Fit and return a ``DecisionTreeClassifier`` with default settings."""
    decision_tree = DecisionTreeClassifier()
    return decision_tree.fit(datas, labels)


# Predict
def predict(tree, testVec):
    """Return the label ``tree`` predicts for the first row of ``testVec``.

    :param testVec: 2-D array of samples, shape (1, n_features) in this script.
    """
    predict_label = tree.predict(testVec)
    return predict_label[0]


# Save the model
def storeTree(inputTree, filename):
    """Pickle ``inputTree`` to ``filename`` (binary mode is required)."""
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


# Load the model
def grabTree(filename):
    """Load and return the pickled model stored in ``filename``.

    Only unpickle files you trust: ``pickle.load`` can run arbitrary code.
    """
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


# Evaluate the model
def model_s(y_predict, y_test):
    """Print accuracy, macro precision and macro recall.

    Both inputs are flattened first.  ``createDataSet`` returns labels of
    shape (n, 1) while ``predict`` output has shape (n,); comparing them
    directly broadcasts to an (n, n) matrix and gives a wrong accuracy.
    """
    y_predict = np.ravel(y_predict)
    y_test = np.ravel(y_test)
    print("正確率為:", np.sum(y_predict == y_test) / len(y_test))
    print("準確率:", accuracy_score(y_test, y_predict))
    print("精確率:", precision_score(y_test, y_predict, average='macro'))
    print("查全率:", recall_score(y_test, y_predict, average='macro'))


# Write the tree to a text file
def writeIntoTxt(tree, featureName, filename):
    """Write the ``export_text`` rendering of ``tree`` to ``filename``."""
    r = export_text(tree, feature_names=featureName)
    with open(filename, 'w') as f:
        f.write(r)  # r is one string; writelines() would iterate it per character


if __name__ == '__main__':
    xlsPath = abs_path + "\\data\\min_datas.xlsx"
    model_path = abs_path + "\\data\\decisionTree.pkl"
    # The earlier chained assignment `txt_path = abs_path = ...` also
    # overwrote abs_path with the text-file path.
    txt_path = abs_path + "\\data\\decisionTree.txt"

    testVec = [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0]
    testVec = np.array(testVec).reshape(1, -1)
    featureNames = ['energy', 'oil-1', 'oil-2', "water-1", "water-2", "air", "cold", "fire",
                    "pm", "rubbish", "noise", "posi", "CEAR", "vibration", "noise1", "bad"]
    print("testVec:", testVec)
    print("featureNames:\n", featureNames)

    print("load datasets")
    datas, labels = createDataSet(xlsPath)

    print("building Tree")
    mytree = createTree(datas, labels)
    print("mytree:\n", mytree)

    print('save model......')
    storeTree(mytree, model_path)

    print('test.......')
    predict_label = predict(mytree, testVec)
    print("predict label:", predict_label)

    print('load model......')
    loadTree = grabTree(model_path)

    print('test load tree......')
    predict_label = predict(loadTree, testVec)
    print("predict label:", predict_label)

# Summary
以上是生活随笔為你收集整理的【机器学习——决策树】——两种方法实现,含模型的保存和调用的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Python_自定义关键字的使用
- 下一篇: selenium 验证码——万能码的使用