Decision Tree Code: ID3, C4.5, CART
This article presents plain-Python implementations of three classic decision-tree algorithms, ID3, C4.5, and CART, shared here as a reference.
ID3
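ID3 selects, at every node, the attribute with the largest information gain. Writing p_k for the proportion of class k in sample set D, and D^v for the subset of D in which attribute a takes its v-th value, the quantities computed by calEntropy and the selection loop below are

    Ent(D) = -\sum_k p_k \log_2 p_k
    Gain(D, a) = Ent(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|} Ent(D^v)

The tree is built recursively: split on the max-gain attribute, remove it from the candidate set, and stop when a subset contains a single class.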
# encoding: gbk
import pandas as pd
import numpy as np

class DecisionTree:
    def __init__(self):
        self.model = None

    def calEntropy(self, y):
        # Entropy of a label series: class frequencies -> probabilities
        valRate = y.value_counts().apply(lambda x: x / y.size)
        valEntropy = np.inner(valRate, np.log2(valRate)) * -1
        return valEntropy

    def fit(self, xTrain, yTrain=None):
        # If yTrain is not supplied, use the last column of xTrain as the class label
        if yTrain is None:
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        propNamesAll = xTrain.columns  # all remaining attribute names
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            return yTrainCounts.index[0]  # pure subset: leaf node
        entropyD = self.calEntropy(yTrain)
        maxGain = None             # largest information gain found so far
        maxEntropyPropName = None  # attribute achieving it
        for propName in propNamesAll:
            propDatas = xTrain[propName]
            # probability of each value of this attribute
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
            sumEntropyByProp = 0
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                entropyDv = self.calEntropy(yDataByPropClass)
                sumEntropyByProp += entropyDv * dvRate  # weighted child entropy
            gainEach = entropyD - sumEntropyByProp
            if maxGain is None or gainEach > maxGain:
                maxGain = gainEach
                maxEntropyPropName = propName
        propDatas = xTrain[maxEntropyPropName]
        propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
        retClassByProp = {}
        for propClass, dvRate in propClassSummary.items():
            whichIndex = xTrain[maxEntropyPropName] == propClass  # boolean pd.Series
            xDataByPropClass = xTrain[whichIndex].drop(columns=maxEntropyPropName)  # drop the used attribute
            yDataByPropClass = yTrain[whichIndex]
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yDataByPropClass)
        return {'Node': maxEntropyPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data):
        if not isinstance(modelNode, dict):
            return modelNode  # leaf: the predicted class
        nodePropName = modelNode['Node']  # attribute tested at this node
        prpVal = data.get(nodePropName)   # value of that attribute in the sample
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # unseen attribute value

    def predict(self, data):
        if isinstance(data, pd.Series):       # a single sample (one DataFrame row is a Series)
            return self.predictBySeries(self.model, data)
        elif isinstance(data, pd.DataFrame):  # a batch of samples
            return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)  # axis=1: per row

data = pd.read_csv("xigua.csv", encoding="gbk")
data_train = data.iloc[:, :-1]  # features without the label column
decisionTree = DecisionTree()
treeData = decisionTree.fit(data)
print("Tree structure:\n", treeData)
print(pd.DataFrame({'prediction': decisionTree.predict(data_train), 'true label': data.iloc[:, -1]}))

C4.5
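C4.5 replaces information gain with the gain ratio, which normalizes the gain by the attribute's intrinsic value IV(a) to offset the bias toward attributes with many values:

    IV(a) = -\sum_{v=1}^{V} \frac{|D^v|}{|D|} \log_2 \frac{|D^v|}{|D|}
    Gain\_ratio(D, a) = Gain(D, a) / IV(a)

(Canonical C4.5 additionally restricts the choice to attributes whose gain is above average; the code below simply maximizes the gain ratio.) Apart from that criterion, the implementation is identical to the ID3 version.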
# encoding: gbk
import pandas as pd
import numpy as np

class DecisionTree:
    def __init__(self):
        self.model = None

    def calEntropy(self, y):
        # Entropy of a label series: class frequencies -> probabilities
        valRate = y.value_counts().apply(lambda x: x / y.size)
        valEntropy = np.inner(valRate, np.log2(valRate)) * -1
        return valEntropy

    def fit(self, xTrain, yTrain=None):
        # If yTrain is not supplied, use the last column of xTrain as the class label
        if yTrain is None:
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        """
        propNamesAll: all attribute names
        maxGainRatio: largest gain ratio found so far
        maxEntropyPropName: attribute achieving the largest gain ratio
        propName: one attribute name
        propClassSummary: frequency of each value of one attribute
        """
        propNamesAll = xTrain.columns
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            return yTrainCounts.index[0]  # pure subset: leaf node
        entropyD = self.calEntropy(yTrain)
        maxGainRatio = None
        maxEntropyPropName = None
        for propName in propNamesAll:
            propDatas = xTrain[propName]
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
            sumEntropyByProp = 0
            IV = 0
            for propClass, dvRate in propClassSummary.items():  # propClass: one attribute value
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                entropyDv = self.calEntropy(yDataByPropClass)  # entropy of that subset
                sumEntropyByProp += entropyDv * dvRate
                IV -= dvRate * np.log2(dvRate)  # intrinsic value of the attribute
            if IV == 0:
                continue  # single-valued attribute on this subset: gain ratio undefined
            gainEach = entropyD - sumEntropyByProp
            gainRatio = gainEach / IV
            if maxGainRatio is None or gainRatio > maxGainRatio:
                maxGainRatio = gainRatio
                maxEntropyPropName = propName
        propDatas = xTrain[maxEntropyPropName]
        propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
        retClassByProp = {}
        for propClass, dvRate in propClassSummary.items():
            whichIndex = xTrain[maxEntropyPropName] == propClass  # boolean pd.Series
            xDataByPropClass = xTrain[whichIndex].drop(columns=maxEntropyPropName)  # drop the used attribute
            yDataByPropClass = yTrain[whichIndex]
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yDataByPropClass)
        return {'Node': maxEntropyPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data):
        if not isinstance(modelNode, dict):
            return modelNode  # leaf: the predicted class
        nodePropName = modelNode['Node']  # attribute tested at this node
        prpVal = data.get(nodePropName)   # value of that attribute in the sample
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # unseen attribute value

    def predict(self, data):
        if isinstance(data, pd.Series):       # a single sample (one DataFrame row is a Series)
            return self.predictBySeries(self.model, data)
        elif isinstance(data, pd.DataFrame):  # a batch of samples
            return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)  # axis=1: per row

data = pd.read_csv("xigua.csv", encoding="gbk")
data_train = data.iloc[:, :-1]  # features without the label column
decisionTree = DecisionTree()
treeData = decisionTree.fit(data)
print("Tree structure:\n", treeData)
print(pd.DataFrame({'prediction': decisionTree.predict(data_train), 'true label': data.iloc[:, -1]}))

CART
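CART uses the Gini index instead of entropy, and the attribute whose split yields the smallest weighted Gini impurity is chosen:

    Gini(D) = 1 - \sum_k p_k^2
    Gini\_index(D, a) = \sum_{v=1}^{V} \frac{|D^v|}{|D|} Gini(D^v)

Note that canonical CART grows binary trees by splitting each attribute's values into two groups; the implementation below keeps the multiway-split skeleton of the ID3/C4.5 code and only swaps the criterion.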
# encoding: gbk
import pandas as pd
import numpy as np

class DecisionTree:
    def __init__(self):
        self.model = None

    def Gini(self, y):
        # Gini impurity: 1 minus the sum of squared class probabilities
        valRate = y.value_counts().apply(lambda x: x / y.size)
        valGini = 1 - np.inner(valRate, valRate)
        return valGini

    def fit(self, xTrain, yTrain=None):
        # If yTrain is not supplied, use the last column of xTrain as the class label
        if yTrain is None:
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        propNamesAll = xTrain.columns  # all remaining attribute names
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            return yTrainCounts.index[0]  # pure subset: leaf node
        minGini_index = None  # smallest weighted Gini index found so far
        minGiniName = None    # attribute achieving it
        for propName in propNamesAll:
            propDatas = xTrain[propName]
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
            sumGiniByProp = 0
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                sumGiniByProp += self.Gini(yDataByPropClass) * dvRate  # weighted child impurity
            if minGini_index is None or sumGiniByProp < minGini_index:
                minGini_index = sumGiniByProp
                minGiniName = propName
        propDatas = xTrain[minGiniName]
        propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
        retClassByProp = {}
        for propClass, dvRate in propClassSummary.items():
            whichIndex = xTrain[minGiniName] == propClass  # boolean pd.Series
            xDataByPropClass = xTrain[whichIndex].drop(columns=minGiniName)  # drop the used attribute
            yDataByPropClass = yTrain[whichIndex]
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yDataByPropClass)
        return {'Node': minGiniName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data):
        if not isinstance(modelNode, dict):
            return modelNode  # leaf: the predicted class
        nodePropName = modelNode['Node']  # attribute tested at this node
        prpVal = data.get(nodePropName)   # value of that attribute in the sample
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # unseen attribute value

    def predict(self, data):
        if isinstance(data, pd.Series):       # a single sample (one DataFrame row is a Series)
            return self.predictBySeries(self.model, data)
        elif isinstance(data, pd.DataFrame):  # a batch of samples
            return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)  # axis=1: per row

data = pd.read_csv("xigua.csv", encoding="gbk")
data_train = data.iloc[:, :-1]  # features without the label column
decisionTree = DecisionTree()
treeData = decisionTree.fit(data)
print("Tree structure:\n", treeData)
print(pd.DataFrame({'prediction': decisionTree.predict(data_train), 'true label': data.iloc[:, -1]}))

xigua.csv dataset
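The post does not reproduce the dataset file itself. The code only assumes the CSV layout: every column except the last is a categorical attribute, the last column is the class label, and the file is GBK-encoded. As a minimal self-contained sketch, here is the same interface exercised on made-up toy data (column names and values are illustrative stand-ins, not the contents of xigua.csv; DecisionTree is any of the three classes above):

# Toy stand-in for xigua.csv; names and values are hypothetical.
import pandas as pd

toy = pd.DataFrame({
    'color': ['green', 'black', 'green', 'white'],
    'sound': ['dull',  'crisp', 'crisp', 'dull'],
    'good':  ['yes',   'yes',   'no',    'no'],
})
tree = DecisionTree()
print(tree.fit(toy))                   # last column is taken as the label
print(tree.predict(toy.iloc[:, :-1]))  # per-row predictions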
總結
That covers the ID3, C4.5, and CART decision-tree implementations. All three share the same recursive tree-building skeleton and differ only in the splitting criterion: information gain, gain ratio, and Gini index, respectively. Hopefully the article helps you solve the problems you encounter.