生活随笔
收集整理的這篇文章主要介紹了
主流机器学习模型模板代码+经验分享[xgb, lgb, Keras, LR]
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
刷比賽利器,感謝分享的人。
摘要
最近打各種比賽,在這里分享一些General Model,稍微改改就能用的
環境: python 3.5.2
XGBoost調參大全: http://blog.csdn.net/han_xiaoyang/article/details/52665396
XGBoost 官方API:
http://xgboost.readthedocs.io/en/latest/python/python_api.html
Preprocess
[python] view plain
import pandas as pd
import numpy as np
import scipy as sp


def read_csv_file(f, logging=False):
    """Load a CSV file into a DataFrame, optionally printing a quick summary.

    Args:
        f: Path or file-like object accepted by ``pandas.read_csv``.
        logging: When true, print the first five rows, the column names,
            the descriptive statistics and the ``DataFrame.info()`` report.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    print("==========讀取數據=========")
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(f, "包含以下列")
        print(data.columns.values)
        print(data.describe())
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would emit a stray "None" line.
        data.info()
    return data
Logistic Regression
[python] view plain
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Template placeholders: replace these empty frames with the real training
# and test data (the training frame must contain a 'label' column).
df_train = pd.DataFrame()
df_test = pd.DataFrame()
y_train = df_train['label'].values

train_blocks = []
test_blocks = []

# Categorical features: one-hot encode each column separately.
# The encoder is fitted on the training data only and then *applied* to the
# test data, so both matrices share the same column layout. Categories that
# appear only in the test set are encoded as all-zero rows instead of raising.
feats = ["creativeID", "adID", "campaignID"]
for feat in feats:
    enc = OneHotEncoder(handle_unknown='ignore')
    train_blocks.append(enc.fit_transform(df_train[feat].values.reshape(-1, 1)))
    test_blocks.append(enc.transform(df_test[feat].values.reshape(-1, 1)))

# Numeric features: standardise with statistics learned from the training
# set only, then reuse the same mean / variance for the test set.
feats = ["price", "age"]
ss = StandardScaler()
train_blocks.append(ss.fit_transform(df_train[feats].values))
test_blocks.append(ss.transform(df_test[feats].values))

# Stack every feature block side by side into one sparse design matrix.
X_train = sparse.hstack(train_blocks).tocsr()
X_test = sparse.hstack(test_blocks).tocsr()

# Fit the model and predict the positive-class probability for the test set.
lr = LogisticRegression()
lr.fit(X_train, y_train)
proba_test = lr.predict_proba(X_test)[:, 1]
LightGBM
1. 二分類
[python] view plain
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")

# load_data() is not defined in this snippet; it must be provided by the
# surrounding project and return (train features, train labels, test features).
train_x, train_y, test_x = load_data()

# Hold out a stratified 5% of the training data for validation / early stopping.
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y
)

# Note: X_test / y_test below are the *validation* split, not the final test set.
X_train = X
y_train = y
X_test = val_X
y_test = val_y

# Wrap the splits in LightGBM datasets.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (rather than a set) keeps the metric order deterministic.
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True
}

print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=lgb_eval,
                early_stopping_rounds=500)

print('Start predicting...')

preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Turn probabilities into 0/1 labels for every test row at once. The original
# loop overwrote a single variable on each iteration and kept nothing.
threshold = 0.5
results = (np.asarray(preds) > threshold).astype(int)

# Dump "feature name, importance" pairs, one per line.
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w') as fout:
    for name, im in zip(names, importance):
        fout.write(name + ', ' + str(im) + '\n')
2. 多分類
[python] view plain
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")

# load_data() is not defined in this snippet; it must be provided by the
# surrounding project and return (train features, train labels, test features).
train_x, train_y, test_x = load_data()

# Hold out a stratified 5% of the training data for validation / early stopping.
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y
)

# Note: X_test / y_test below are the *validation* split, not the final test set.
X_train = X
y_train = y
X_test = val_X
y_test = val_y

# Wrap the splits in LightGBM datasets.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# 'is_unbalance' is intentionally not set here: LightGBM documents it as
# applying to the binary and multiclassova objectives only.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 9,  # must match the number of distinct labels in train_y
    'metric': 'multi_error',
    'num_leaves': 300,
    'min_data_in_leaf': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.5,
    'min_gain_to_split': 0.2,
    'verbose': 5
}

print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=lgb_eval,
                early_stopping_rounds=500)

print('Start predicting...')

# One row of class probabilities per test sample.
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Pick the most probable class for every row at once. The original loop
# overwrote a single variable on each iteration and kept nothing.
predictions = np.argmax(np.asarray(preds), axis=1)

# Dump "feature name, importance" pairs, one per line.
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w') as fout:
    for name, im in zip(names, importance):
        fout.write(name + ', ' + str(im) + '\n')
XGBoost
1. 二分類
[python] view plain
import numpy as np
import pandas as pd
import xgboost as xgb
import time
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split

# load_data() is not defined in this snippet; it must be provided by the
# surrounding project and return (train features, train labels, test features).
train_x, train_y, test_x = load_data()

# Hold out a stratified 1% of the training data as the validation set.
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.01,
    random_state=1,
    stratify=train_y
)

# Wrap the splits in XGBoost's DMatrix containers.
xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(X, label=y)
xgb_test = xgb.DMatrix(test_x)

# Booster parameters. 'missing' is deliberately absent: it is a DMatrix
# constructor argument rather than a booster parameter, and the original
# value of 1 would have declared every literal 1 in the data as missing.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'gamma': 0.1,
    'max_depth': 8,
    'alpha': 0,
    'lambda': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    'min_child_weight': 3,
    'silent': 0,
    'eta': 0.03,
    'seed': 1000,
    'nthread': -1,
    # Re-weight positives by the negative/positive ratio of the training split.
    'scale_pos_weight': (np.sum(y == 0) / np.sum(y == 1))
}
num_rounds = 2000

# Early stopping in xgb.train monitors the last entry of the watchlist ('val').
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Cross-validation. The round budget is num_rounds so that the 200-round
# early-stopping patience can actually trigger (with only 200 rounds it never
# could). The folds are materialised so they can be iterated more than once.
cv_folds = list(StratifiedKFold(n_splits=4).split(X, y))
cv_result = xgb.cv(params, xgb_train, num_boost_round=num_rounds, nfold=4,
                   early_stopping_rounds=200, verbose_eval=True, folds=cv_folds)

# Train the final model and persist it.
model = xgb.train(params, xgb_train, num_rounds, watchlist, early_stopping_rounds=200)
model.save_model('../data/model/xgb.model')

# Predict positive-class probabilities for the test set.
preds = model.predict(xgb_test)

# Threshold all probabilities at once. The original loop overwrote one
# variable per iteration (also clobbering the CV result) and kept nothing.
threshold = 0.5
results = (np.asarray(preds) > threshold).astype(int)
CatBoost
沒用過,聽老鐵說還行
Keras
1. 二分類
[python] view plain
copy import?numpy?as?np??import?pandas?as?pd??import?time??from?sklearn.model_selection?import?train_test_split??from?matplotlib?import?pyplot?as?plt????from?keras.models?import?Sequential??from?keras.layers?import?Dropout??from?keras.layers?import?Dense,?Activation??from?keras.utils.np_utils?import?to_categorical??????from?model.util?import?load_data?as?load_data_1??from?model.util_combine_train_test?import?load_data?as?load_data_2??from?sklearn.preprocessing?import?StandardScaler???from?sklearn.preprocessing?import?Imputer????print("Loading?Data?...?")????train_x,?train_y,?test_x?=?load_data()??????X_train?=?train_x.values??X_test??=?test_x.values??y?=?train_y????imp?=?Imputer(missing_values='NaN',?strategy='mean',?axis=0)??X_train?=?imp.fit_transform(X_train)????sc?=?StandardScaler()??sc.fit(X_train)??X_train?=?sc.transform(X_train)??X_test??=?sc.transform(X_test)??????model?=?Sequential()??model.add(Dense(256,?input_shape=(X_train.shape[1],)))??model.add(Activation('tanh'))??model.add(Dropout(0.3))??model.add(Dense(512))??model.add(Activation('relu'))??model.add(Dropout(0.3))??model.add(Dense(512))??model.add(Activation('tanh'))??model.add(Dropout(0.3))??model.add(Dense(256))??model.add(Activation('linear'))??model.add(Dense(1))???model.add(Activation('sigmoid'))??????model.compile(loss='binary_crossentropy',????????????????optimizer='rmsprop',????????????????metrics=['accuracy'])????epochs?=?100??model.fit(X_train,?y,?epochs=epochs,?batch_size=2000,?validation_split=0.1,?shuffle=True)??????threshold?=?0.5??for?index,?case?in?enumerate(X_test):??????case?=np.array([case])??????prediction_prob?=?model.predict(case)??????prediction?=?1?if?prediction_prob[0][0]?>?threshold?else?0?? 2. 多分類
[python] view plain
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical

from model.util import load_data as load_data_1
from model.util_combine_train_test import load_data as load_data_2
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

print("Loading Data ... ")

# NOTE(review): the imports above bind the loaders as load_data_1 and
# load_data_2; a plain load_data must be defined elsewhere, or this call
# should use whichever of the two loaders is intended.
train_x, train_y, test_x = load_data()

# Work on raw numpy arrays from here on.
X_train = train_x.values
X_test = test_x.values
y = train_y

# Standardise with statistics learned from the training set only.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# One-hot encode the labels; the encoded width is the number of classes.
y = to_categorical(y)
num_classes = y.shape[1]

# Fully connected network ending in a softmax over the classes.
model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Activation('linear'))
# Output width follows the labels instead of a hard-coded 9, so the layer
# always matches the shape of y.
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

epochs = 200
model.fit(X_train, y, epochs=epochs, batch_size=200, validation_split=0.1, shuffle=True)

# Predict the whole test set in one batched call and take the most probable
# class per row. Calling model.predict row by row is slow, and the original
# loop kept no results.
prediction_prob = model.predict(X_test)
predictions = np.argmax(prediction_prob, axis=1)
處理正負樣本不均勻的案例
有些案例中,正負樣本數量相差非常大,數據嚴重unbalanced,這里提供幾個解決的思路
[python] view plain
# Count the rows of each class in the training frame (df_train is expected to
# be defined by the surrounding code) and print the positive/negative ratio.
positive_num = len(df_train.loc[df_train['label'] == 1].index)
negative_num = len(df_train.loc[df_train['label'] == 0].index)
print(float(positive_num) / float(negative_num))
主要思路
1. 手動調整正負樣本比例
2. 過采樣 Over-Sampling
對訓練集里面樣本數量較少的類別(少數類)進行過采樣,合成新的樣本來緩解類不平衡,比如SMOTE算法
3. 欠采樣 Under-Sampling
4. 將樣本按比例一一組合進行訓練,訓練出多個弱分類器,最后進行集成
框架推薦
Github上大神寫的相關框架,專門用來處理此類問題:
https://github.com/scikit-learn-contrib/imbalanced-learn
寫在最后
實踐永遠是檢驗真理的不二選擇
多打打比賽,對各種業務環境下的任務都能有所了解,也能學習新技術。
總結
以上是生活随笔為你收集整理的主流机器学习模型模板代码+经验分享[xgb, lgb, Keras, LR]的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。