import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_scoretrain = pd.read_csv("./train_set.csv")
test = pd.read_csv("./test_set.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         25317 non-null  int64
 1   age        25317 non-null  int64
 2   job        25317 non-null  object
 3   marital    25317 non-null  object
 4   education  25317 non-null  object
 5   default    25317 non-null  object
 6   balance    25317 non-null  int64
 7   housing    25317 non-null  object
 8   loan       25317 non-null  object
 9   contact    25317 non-null  object
 10  day        25317 non-null  int64
 11  month      25317 non-null  object
 12  duration   25317 non-null  int64
 13  campaign   25317 non-null  int64
 14  pdays      25317 non-null  int64
 15  previous   25317 non-null  int64
 16  poutcome   25317 non-null  object
 17  y          25317 non-null  int64
dtypes: int64(9), object(9)
memory usage: 3.5+ MB
| NO | 字段名稱 | 數據類型 | 字段描述 |
|----|----------|----------|----------|
| 1  | ID       | Int    | 客戶唯一標識 |
| 2  | age      | Int    | 客戶年齡 |
| 3  | job      | String | 客戶的職業 |
| 4  | marital  | String | 婚姻狀況 |
| 5  | education | String | 受教育水平 |
| 6  | default  | String | 是否有違約記錄 |
| 7  | balance  | Int    | 每年賬戶的平均余額 |
| 8  | housing  | String | 是否有住房貸款 |
| 9  | loan     | String | 是否有個人貸款 |
| 10 | contact  | String | 與客戶聯系的溝通方式 |
| 11 | day      | Int    | 最后一次聯系的時間(幾號) |
| 12 | month    | String | 最后一次聯系的時間(月份) |
| 13 | duration | Int    | 最后一次聯系的交流時長 |
| 14 | campaign | Int    | 在本次活動中,與該客戶交流過的次數 |
| 15 | pdays    | Int    | 距離上次活動最后一次聯系該客戶,過去了多久(999表示沒有聯系過) |
| 16 | previous | Int    | 在本次活動之前,與該客戶交流過的次數 |
| 17 | poutcome | String | 上一次活動的結果 |
| 18 | y        | Int    | 預測客戶是否會訂購定期存款業務 |
相關系數
# Absolute Pearson correlation of each numeric column against the target `y`,
# sorted descending — a quick screen for the most predictive numeric features.
abs(train.corr()['y']).sort_values(ascending=False)
y 1.000000
ID 0.556627
duration 0.394746
pdays 0.107565
previous 0.088337
campaign 0.075173
balance 0.057564
day 0.031886
age 0.029916
Name: y, dtype: float64
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.facecolor']=(1,1,1,1)# pycharm 繪圖白底,看得清坐標from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_scoretrain = pd.read_csv("./train_set.csv")
test = pd.read_csv("./test_set.csv")
# Local model selection: grid-search five classifier families on the held-out
# training split and compare their validation ROC-AUC scores.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier()
knn = KNeighborsClassifier()
lr = LogisticRegression()
svc = SVC(probability=True)  # probability=True so predict_proba exists for AUC
gbdt = GradientBoostingClassifier()

models = [knn, lr, svc, rf, gbdt]
# One parameter grid per model, in the same order as `models`.
param_grid_list = [
    # knn
    [{'model__n_neighbors': [5, 15, 35, 50, 100],
      'model__leaf_size': [10, 20, 30, 40, 50]}],
    # lr
    [{'model__penalty': ['l1', 'l2'],
      'model__C': [0.2, 0.5, 1, 1.2, 1.5],
      'model__max_iter': [10000]}],
    # svc
    [{'model__C': [0.2, 0.5, 1, 1.2],
      'model__kernel': ['rbf']}],
    # rf
    [{
        # 'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators': [200, 250, 300, 330, 350],
        'model__max_features': [20, 30, 40, 50],
        'model__max_depth': [5, 7]}],
    # gbdt
    [{'model__learning_rate': [0.1, 0.5],
      'model__n_estimators': [130, 200, 300],
      'model__max_features': ['sqrt'],
      'model__max_depth': [5, 7],
      'model__min_samples_split': [500, 1000, 1200],
      'model__min_samples_leaf': [60, 100],
      'model__subsample': [0.8, 1]}],
]

for i, model in enumerate(models):
    # NOTE(review): `full_pipeline`, `train_part`, `train_part_y`, `valid_part`
    # and `valid_part_y` are defined in an earlier (not shown) preprocessing
    # cell — confirm they are in scope before running this cell.
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model),
    ])
    grid_search = GridSearchCV(pipe, param_grid_list[i], cv=3,
                               scoring='roc_auc', verbose=2, n_jobs=-1)
    grid_search.fit(train_part, train_part_y)
    print(grid_search.best_params_)
    final_model = grid_search.best_estimator_
    # ROC-AUC needs probability scores, not hard class labels.
    pred = final_model.predict_proba(valid_part)[:, 1]
    print("auc score: ", roc_auc_score(valid_part_y, pred))
注意 AUC 評分標準 要使用predict_proba方法 !!!
Fitting 3 folds for each of 25 candidates, totalling 75 fits
{'model__leaf_size':20,'model__n_neighbors':50}
auc score:0.8212256518034133
Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'model__C':1.2,'model__max_iter':10000,'model__penalty':'l2'}
auc score:0.9011510812019533
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'model__C':0.2,'model__kernel':'rbf'}
auc score:0.7192431208601267
Fitting 3 folds for each of 40 candidates, totalling 120 fits
{'model__max_depth':7,'model__max_features':20,'model__n_estimators':350}
auc score:0.913398647137746
Fitting 3 folds for each of 144 candidates, totalling 432 fits
{'model__learning_rate':0.1,'model__max_depth':7,'model__max_features':'sqrt','model__min_samples_leaf':60,'model__min_samples_split':500,'model__n_estimators':300,'model__subsample':1}
auc score:0.9299485084368806
可以看見 GBDT 梯度提升下降樹模型表現最好
2.4 網格/隨機搜索 參數+提交
微調參數列表,使用全部的訓練數據訓練,使用 RF 和 GBDT 模型 對測試集進行預測
網格搜索
# Full-data training: grid-search the two best local performers (RF, GBDT),
# predict on the test set with probabilities, and write one submission CSV
# per model ("0_pred.csv" for RF, "1_pred.csv" for GBDT).
y_train = X_train['y']
X_train_ = X_train.drop(['y'], axis=1)

select_model = [rf, gbdt]
param_grid_list = [
    # rf
    [{
        # 'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators': [250, 300, 350, 400],
        'model__max_features': [7, 8, 10, 15, 20],
        'model__max_depth': [7, 9, 10, 11]}],
    # gbdt
    [{'model__learning_rate': [0.03, 0.05, 0.1],
      'model__n_estimators': [200, 300, 350],
      'model__max_features': ['sqrt'],
      'model__max_depth': [7, 9, 11],
      'model__min_samples_split': [300, 400, 500],
      'model__min_samples_leaf': [50, 60, 70],
      # Bug fix: the original grid included subsample=1.2, but
      # GradientBoostingClassifier requires 0 < subsample <= 1, so every
      # candidate with 1.2 fails its fit and wastes a third of the search.
      'model__subsample': [0.8, 1]}],
]

for i, model in enumerate(select_model):
    # NOTE(review): `full_pipeline`, `X_train`, `X_test`, `rf`, `gbdt` and
    # `test` come from earlier cells — confirm they are in scope.
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model),
    ])
    grid_search = GridSearchCV(pipe, param_grid_list[i], cv=3,
                               scoring='roc_auc', verbose=2, n_jobs=-1)
    grid_search.fit(X_train_, y_train)
    print(grid_search.best_params_)
    final_model = grid_search.best_estimator_
    # ROC-AUC scoring on the leaderboard needs probabilities, not labels.
    pred = final_model.predict_proba(X_test)[:, 1]
    print(model, '\n finished!')
    result = pd.DataFrame()
    result['ID'] = test['ID']
    result['pred'] = pred
    result.to_csv('{}_pred.csv'.format(i), index=False)
Fitting 3 folds for each of 80 candidates, totalling 240 fits
{'model__max_depth':11,'model__max_features':15,'model__n_estimators':400}RandomForestClassifier() finished!
Fitting 3 folds for each of 729 candidates, totalling 2187 fits
{'model__learning_rate':0.05,'model__max_depth':11,'model__max_features':'sqrt','model__min_samples_leaf':50,'model__min_samples_split':500,'model__n_estimators':300,'model__subsample':1}GradientBoostingClassifier() finished!