當前位置：首頁 > 编程语言 > python >内容正文

python

python建模全步骤

發布時間：2023/12/31 python 20 豆豆

生活随笔收集整理的這篇文章主要介紹了 python建模全步骤，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

文章目錄

引入包名
變量類型轉化
- 提取object變量
- 轉換百分號變量
- 標準化
缺失值處理
- 查看
- numeric
特征工程
- 下采樣
- 正則
- map 函數
- object
object編碼化
- 熱編碼
- 熱編碼Not sparse
- label_encoder
- 辨析
數據分類
合并數據
建模
決策樹可視化展示
特征重要性
均衡樣本
模型評價
- train test split
- 模型評價
- ROC
- oob
混淆矩陣confusion matrix
- 混淆矩陣標準化
成本矩陣cost matrix

引入包名

import csv
import os.path
import re
import warnings

# Silence library warnings for the rest of the session.
warnings.simplefilter("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly_express as px
import pydotplus
import seaborn as sns
from IPython.display import Image
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # Imputer was removed from sklearn.preprocessing in scikit-learn 0.22;
    # SimpleImputer is its replacement (note: it has no `axis` argument).
    from sklearn.impute import SimpleImputer as Imputer

變量類型轉化

提取object變量

# Names of the columns of X whose dtype is object ('O').
cat_cols = [
    name
    for name in X.columns.values
    if X[name].dtype == 'O'
]

轉換百分號變量

# Convert columns holding percentage strings such as "12.5%" into floats
# (12.5), in place. A column is treated as a percentage column when its first
# row is a string ending in '%'.
for col in data.columns:
    # Only object (string) columns can hold "NN%" text.
    if data[col].dtype != 'O':
        continue
    try:
        # .iloc[0] reads the first row by position; data[col][0] would look up
        # the *label* 0 and fail on any frame whose index does not contain 0.
        first = data[col].iloc[0]
        if isinstance(first, str) and first.endswith('%'):
            print(col)
            # The column is only reassigned if every cell converts cleanly.
            data[col] = data[col].apply(lambda cell: float(cell[:-1]))
    except (IndexError, TypeError, ValueError):
        # Empty column, a non-string cell (e.g. NaN) or non-numeric text:
        # leave this column untouched and move on.
        continue

標準化

from sklearn import preprocessing

# Min-max scale every non-object column of `data`, in place.
# preprocessing.minmax_scale is a plain function, so no MinMaxScaler instance
# is needed (that class is not imported anywhere in this article).
for col in data.columns:
    if data[col].dtype != 'O':
        data[col] = preprocessing.minmax_scale(data[col])

或者

# Alternative to min-max scaling: standardize x with sklearn's preprocessing.scale.
x = preprocessing.scale(x)

缺失值處理

查看

# NOTE(review): missing_values_table is not defined or imported anywhere in
# this article; presumably a user helper that summarizes the missing values
# of each column of X — confirm before running.
missing_values_table(X)

numeric

# Numeric part of X: the categorical columns are dropped first so the fill
# statistics are computed on numeric columns only.
numeric = X.drop(cat_cols, axis=1)

# Three alternative fill strategies. Keep only the one you want: run in
# sequence, each assignment overwrites the previous one, so `num` ends up
# holding the mode-filled frame.
num = numeric.fillna(numeric.mean())
num = numeric.fillna(numeric.median())
# mode() returns a DataFrame (one row per tied mode); take its first row so
# fillna receives one value per column instead of aligning on the row index.
num = numeric.fillna(numeric.mode().iloc[0])

特征工程

下采樣

sub_sample

def lower_sample_data(df, class_, label_col='retention', random_state=None):
    """Randomly under-sample the majority rows to the size of the minority class.

    Args:
        df: DataFrame that contains the label column.
        class_: Label value of the minority class; every row carrying it is kept.
        label_col: Name of the label column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for
            reproducible sampling.

    Returns:
        A DataFrame with the sampled majority rows followed by all minority rows.
    """
    minority = df[df[label_col] == class_]
    majority = df[df[label_col] != class_]
    # Sample without replacement so no majority row is duplicated. Capping at
    # len(majority) keeps the call valid when the "majority" group is in fact
    # the smaller (or an empty) one.
    n_keep = min(len(minority), len(majority))
    sampled = majority.sample(n=n_keep, replace=False, random_state=random_state)
    return pd.concat([sampled, minority])


data = lower_sample_data(data, 'lost')
data['retention'].value_counts()

正則

def _brand_version(brand):
    """Return the first run of digits in *brand* as an int, or 'null' if none."""
    # \d+ keeps multi-digit versions intact ("10" stays 10, not 1).
    match = re.search(r'\d+', brand)
    return int(match.group()) if match else 'null'


def _brand_class(brand):
    """Classify *brand* as '小米', '紅米' or 'others' by substring."""
    # Use `in`, not str.find(): find() returns -1 (truthy) when the text is
    # absent and 0 (falsy) when it is at the start, which inverts the test.
    if '小米' in brand:
        return '小米'
    if '紅米' in brand:
        return '紅米'
    return 'others'


# Digits only
data['brand_version'] = data['brand'].apply(_brand_version)

# Brand family
data['brand_class'] = data['brand'].apply(_brand_class)

# English letters (lower-cased) and spaces only. A comma inside a character
# class is a literal comma, not a separator, so none is used here.
uncn = re.compile(r'[a-z ]')
data['brand_series'] = data['brand'].apply(lambda x: "".join(uncn.findall(x.lower())))

# English letters and digits only. Only the leading ^ negates a character
# class; any further ^ would be a literal caret that survives the substitution.
data['brand_detail'] = data['brand'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]+', '', x))

map 函數

def price_map(x):
    """Map a price-range label to its ordinal band (1-6); any other value maps to 7."""
    known_ranges = (
        '0-600',
        '600-1000',
        '1000-1500',
        '1500-2000',
        '2000-3000',
        '3000-4000',
    )
    # Compare in the same order as the range labels are listed above.
    for band, label in enumerate(known_ranges, start=1):
        if x == label:
            return band
    return 7


data['price_band'] = data['price'].apply(price_map)

object

# Replace every missing value in X with the literal string 'missing'.
X = X.fillna('missing')

object編碼化

熱編碼

熱編碼Not sparse

label_encoder

# NOTE(review): cat_labelcoder is not defined anywhere in this article;
# presumably a copy of the categorical columns (e.g. X[cat_cols]) — confirm.
le = preprocessing.LabelEncoder()
for col in cat_cols:
    # Cast to str first, then replace the column with its integer codes.
    # The single encoder is refit on every column.
    as_text = cat_labelcoder[col].astype('str')
    cat_labelcoder[col] = le.fit_transform(as_text)

辨析

理論上，對object變量做label_encoder或one_hot編碼都可以把類別轉成數值；但label encoder會給各類別賦予大小（順序）含義，決策樹切割特征時會把它當作numeric型變量來切分。因此，如果每次編碼得到的賦值不同，那麼每次決策樹左右子樹的劃分就會不同，導致結果不一致。
因此，一般而言，除了表示“不好，一般，好，很好”這種帶有賦值含義的object型變量可以根據label_encoder進行數據處理，其他情況請都用one_hot。

數據分類

# Features: every column except the identifier and the label.
x = data.drop(columns=['id', 'retention'])

# Target: one-column DataFrame holding 1 where retention == 'lost', else 0.
y = pd.DataFrame(
    data['retention'].apply(lambda label: 1 if label == 'lost' else 0)
)

合并數據

# Join the numeric columns and the label-encoded categorical columns side by side (column-wise).
x_labelcoder = pd.concat([num,cat_labelcoder],axis=1)

建模

# Random forest: 10 gini trees, depth capped at 10, bootstrap sampling, fixed seed.
# NOTE(review): x_onehot is not built anywhere in this article (the one-hot
# sections are empty); presumably the one-hot encoded feature frame — confirm.
clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    bootstrap=True,
    random_state=0,
)
# Fit the model
clf.fit(x_onehot, y)

決策樹可視化展示

# Single decision tree; max_depth is floor(log2(number of feature columns)).
n_features = x_onehot.shape[1]
clf = tree.DecisionTreeClassifier(
    min_samples_split=0.1,
    max_depth=int(np.log2(n_features)),
    random_state=0,
    class_weight='balanced',
)
# Fit the model
clf.fit(x_onehot, y)

# Export the fitted tree as Graphviz DOT source (out_file=None returns a string).
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=x_onehot.columns,
    # Important!
    # NOTE(review): export_graphviz expects class_names in ascending order of
    # the target classes, while unique() returns order of appearance — confirm
    # that the names line up with clf.classes_.
    class_names=data['tag'].unique(),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
# Render inline in an IPython / Jupyter notebook.
Image(graph.create_png())

特征重要性

clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    bootstrap=True,
    random_state=0,
)
# Fit the model
clf.fit(x_onehot, y)

# Pair each feature name with its importance, most important first.
y_importances = clf.feature_importances_
x_importances = x_onehot.columns
df = pd.DataFrame({'x': x_importances, 'y': y_importances})
df = df.sort_values(by='y', ascending=False)

# Polar bar chart of the ten most important features.
px.bar_polar(
    df.head(10),
    r="y",
    theta="x",
    color="x",
    template='plotly_white',
    color_discrete_sequence=px.colors.sequential.Plotly3[-2::-1],
)

均衡樣本

class_weight='balanced'

# Same forest as before, with class_weight='balanced' to reweight the classes.
clf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    bootstrap=True,
    random_state=0,
    class_weight='balanced',
)
# Fit the model
clf.fit(x_onehot, y)

模型評價

train test split

from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    x_onehot,
    y,
    test_size=0.33,
    random_state=42,
)

模型評價

from sklearn.metrics import classification_report

# Refit on the training split only. Otherwise clf is still the model that was
# fitted on all of x_onehot, so X_test would consist of rows it has already
# seen and the report would be over-optimistic.
# np.ravel turns the one-column y_train frame into the 1-D array fit() expects.
clf.fit(X_train, np.ravel(y_train))

y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))

ROC

# A ROC curve needs a continuous score, not hard 0/1 predictions: use the
# predicted probability of the positive class (column 1 of predict_proba,
# which corresponds to clf.classes_[1]).
y_score = clf.predict_proba(X_test)[:, 1]

# y_test: true labels; y_score: predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Plotting only needs plt.plot(fpr, tpr); roc_auc just records the AUC value
# computed by auc().
plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)" % (roc_auc))
# Without a legend the label= text passed to plt.plot is never displayed.
plt.legend(loc="lower right")
plt.show()

roc_auc_score(y_test, y_score)

oob

# 100-tree balanced forest with out-of-bag scoring enabled; max_depth is set
# to the number of feature columns.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=x_onehot.shape[1],
    bootstrap=True,
    random_state=0,
    class_weight='balanced',
    oob_score=True,
)
# Fit the model
clf.fit(x_onehot, y)

# Out-of-bag score of the fitted forest.
clf.oob_score_

混淆矩陣confusion matrix

# Heat map of the raw confusion-matrix counts (rows: true class, columns:
# predicted class). Axis labels are spelled "Label".
ax = sns.heatmap(
    confusion_matrix(y_test, y_predict),
    cmap='Blues',
    annot=True,
    fmt='g',
)
plt.title('confusion matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

混淆矩陣標準化

# Confusion matrix expressed as a share of all samples (every cell divided by
# the grand total), rounded to two decimals. The matrix is computed once and
# stored under a real name; `_` is IPython's "last result" variable.
cm_counts = confusion_matrix(y_test, y_predict)
cm_share = np.around(cm_counts / np.sum(cm_counts), decimals=2)

ax = sns.heatmap(cm_share, cmap='Blues', annot=True, fmt='g')
plt.title('confusion matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

成本矩陣cost matrix

cm = confusion_matrix(y_test, y_predict)

# NOTE(review): the original comment here says 0 = churned and 1 = active,
# whereas the target built in the "數據分類" step encodes 'lost' as 1 — confirm
# which class is the positive one.

# Business costs: a false positive is weighted 5x, a false negative 2x.
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1] * 5
FN = cm[1][0] * 2

# Cost-weighted metrics.
accuracy = round((TP + TN) / (TP + TN + FP + FN), 2)
recall = round(TP / (TP + FN), 2)
# F1 = 2PR / (P + R) with P = TP / (TP + FP) and R = TP / (TP + FN), which
# simplifies to 2TP / (2TP + FP + FN). Computed from the counts rather than
# from the already-rounded metrics.
fscore = round(2 * TP / (2 * TP + FP + FN), 2)

# Same layout as the confusion matrix: rows are true classes, columns predicted.
cm_biz = pd.DataFrame([[TN, FP], [FN, TP]])
ax = sns.heatmap(cm_biz, cmap='Blues', annot=True, fmt='g')
plt.title(
    'cost matrix' + '\n'
    + 'accuracy= ' + str(accuracy) + '\n'
    + 'recall= ' + str(recall) + '\n'
    + 'f_score= ' + str(fscore) + '\n'
)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

總結

以上是生活随笔為你收集整理的python建模全步骤的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。