[Python Learning Series 18] Training a logistic regression model with scikit-learn (delta competition code 3)
Making the hypothesis overly strict just to fit the training data consistently is called overfitting, and avoiding it is a core task in classifier design. The usual countermeasures are to increase the amount of data and to evaluate the classifier on a larger test sample set. In this competition, however, the sample size is fixed and the target test set is given, so my approach is: first pre-train on the overfitting-prone features, use that model to pseudo-label the unlabelled data, and then retrain on the enlarged set. The reference code is as follows:
```python
# -*- coding: utf-8 -*-
import pandas as pd
import time
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier


def main():
    # Province-to-code mapping
    data = {"province": ['河北省', '山西省', '内蒙古自治区', '辽宁省', '吉林省', '黑龙江省', '江苏省', '浙江省',
                         '安徽省', '福建省', '江西省', '山东省', '河南省', '湖北省', '湖南省', '广东省',
                         '广西壮族自治区', '海南省', '四川省', '贵州省', '云南省', '西藏自治区', '陕西省',
                         '甘肃省', '青海省', '宁夏回族自治区', '新疆维吾尔自治区', '北京市', '天津市', '上海市', '重庆市'],
            "pro_code": [13, 14, 15, 21, 22, 23, 32, 33, 34, 35, 36, 37, 41, 42, 43, 44, 45, 46,
                         51, 52, 53, 54, 61, 62, 63, 64, 65, 11, 12, 31, 50]}
    province = pd.DataFrame(data, columns=["province", "pro_code"])
    citydata = pd.read_csv(r"D:\city.csv")  # load the city mapping table

    # Load the labelled data
    label_ds = pd.read_csv(r"D:\label.csv")
    label_ds = pd.merge(label_ds, province, how="left", on="province")
    label_ds = pd.merge(label_ds, citydata, how="left", on="city")
    label_df = pd.DataFrame(label_ds[['denomination', 'min_amount', 'pro_code', 'age', 'sex', 'account_age',
                                      'txn_count', 'use_nums', 'txn_min_amount', 'txn_amount_mean',
                                      'avg_discount', 'voucher_num', 'avg_txn_amt', 'use_ratio',
                                      'voucher_ratio', 'batch_no', 'voucher_no', 'city_id', 'label']])
    label_df["denomination"] = label_df["denomination"].astype("int")
    label_df["min_amount"] = label_df["min_amount"].astype("int")
    label_df["pro_code"] = label_df["pro_code"].astype("int")
    label_df["age"] = label_df["age"].astype("int")
    label_df["sex"] = label_df["sex"].astype("int")
    label_df["account_age"] = label_df["account_age"].astype("int")
    label_df["txn_count"] = label_df["txn_count"].astype("int")
    label_df["use_nums"] = label_df["use_nums"].astype("int")
    label_df["txn_min_amount"] = label_df["txn_min_amount"].astype("int")
    label_df["txn_amount_mean"] = label_df["txn_amount_mean"].astype("int")
    label_df["avg_discount"] = label_df["avg_discount"].astype("int")
    label_df["voucher_num"] = label_df["voucher_num"].astype("int")
    label_df["avg_txn_amt"] = label_df["avg_txn_amt"].astype("int")
    label_df["use_ratio"] = label_df["use_ratio"].astype("float")
    label_df["voucher_ratio"] = label_df["voucher_ratio"].astype("float")
    label_df["batch_no"] = label_df["batch_no"].astype("int")
    label_df["voucher_no"] = label_df["voucher_no"].astype("str")
    label_df["city_id"] = label_df["city_id"].astype("int")
    label_df["label"] = label_df["label"].astype("int")

    # Load the unlabelled data
    unlabel_ds = pd.read_csv(r"D:\unlabel.csv")
    unlabel_ds = pd.merge(unlabel_ds, province, how="left", on="province")
    unlabel_ds = pd.merge(unlabel_ds, citydata, how="left", on="city")
    unlabel_df = pd.DataFrame(unlabel_ds[['denomination', 'min_amount', 'pro_code', 'age', 'sex', 'account_age',
                                          'txn_count', 'use_nums', 'txn_min_amount', 'txn_amount_mean',
                                          'avg_discount', 'voucher_num', 'avg_txn_amt', 'use_ratio',
                                          'voucher_ratio', 'batch_no', 'city_id', 'phone', 'voucher_no']])
    unlabel_df["denomination"] = unlabel_df["denomination"].astype("int")
    unlabel_df["min_amount"] = unlabel_df["min_amount"].astype("int")
    unlabel_df["pro_code"] = unlabel_df["pro_code"].astype("int")
    unlabel_df["age"] = unlabel_df["age"].astype("int")
    unlabel_df["sex"] = unlabel_df["sex"].astype("int")
    unlabel_df["account_age"] = unlabel_df["account_age"].astype("int")
    unlabel_df["txn_count"] = unlabel_df["txn_count"].astype("int")
    unlabel_df["use_nums"] = unlabel_df["use_nums"].astype("int")
    unlabel_df["txn_min_amount"] = unlabel_df["txn_min_amount"].astype("int")
    unlabel_df["txn_amount_mean"] = unlabel_df["txn_amount_mean"].astype("int")
    unlabel_df["avg_discount"] = unlabel_df["avg_discount"].astype("int")
    unlabel_df["voucher_num"] = unlabel_df["voucher_num"].astype("int")
    unlabel_df["avg_txn_amt"] = unlabel_df["avg_txn_amt"].astype("int")
    unlabel_df["use_ratio"] = unlabel_df["use_ratio"].astype("float")
    unlabel_df["voucher_ratio"] = unlabel_df["voucher_ratio"].astype("float")
    unlabel_df["batch_no"] = unlabel_df["batch_no"].astype("int")
    unlabel_df["city_id"] = unlabel_df["city_id"].astype("int")
    unlabel_df["phone"] = unlabel_df["phone"].astype("str")
    unlabel_df["voucher_no"] = unlabel_df["voucher_no"].astype("str")

    # --- Pre-training starts ---
    # Sample the training data: 80% for training, 20% for validation
    print("Full sample: %d rows, %d columns" % (label_df.shape[0], label_df.shape[1]))
    train_label_df = label_df.sample(frac=0.8)
    print("Training set: %d rows, %d columns" % (train_label_df.shape[0], train_label_df.shape[1]))
    test_label_df = label_df.sample(frac=0.2)
    print("Validation set: %d rows, %d columns" % (test_label_df.shape[0], test_label_df.shape[1]))

    # Model training on the overfitting-prone (voucher usage) features
    label_X = train_label_df[['voucher_num', 'use_nums', 'use_ratio', 'voucher_ratio', 'avg_discount', 'avg_txn_amt']]
    label_X = preprocessing.scale(label_X)  # standardise the features
    label_y = train_label_df['label']
    model = LogisticRegression()  # DecisionTreeClassifier()
    model.fit(label_X, label_y)

    # Model validation (kept for reference)
    # expected = test_label_df['label']
    # predicted_X = test_label_df[['voucher_num', 'use_nums', 'use_ratio', 'voucher_ratio', 'avg_discount', 'avg_txn_amt']]
    # predicted_X = preprocessing.scale(predicted_X)  # standardise
    # predicted = model.predict(predicted_X)
    # f1_score = metrics.f1_score(expected, predicted)  # model evaluation
    # print(f1_score)

    # Use the model to pseudo-label the unlabelled samples
    unlabel_X = unlabel_df[['pro_code', 'city_id', 'age', 'sex', 'account_age',
                            'txn_count', 'txn_amount_mean', 'txn_min_amount']]
    unlabel_X_Scale = unlabel_df[['voucher_num', 'use_nums', 'use_ratio', 'voucher_ratio', 'avg_discount', 'avg_txn_amt']]
    unlabel_X_Scale = preprocessing.scale(unlabel_X_Scale)  # standardise
    unlabel_y = model.predict(unlabel_X_Scale)
    out_y = pd.DataFrame(unlabel_y.reshape(-1, 1), columns=['label'])
    unlabel_X_new = unlabel_X.join(out_y, how='left')
    label_df = label_df.append(unlabel_X_new)  # build the new training set
    # --- Pre-training ends ---

    # Model training and prediction
    f1_score_old = float(0)   # previous f1-score
    f1_score = float(0.3)     # higher than the score of labelling everything as 1
    outset = []
    flag = int(1)
    label_df_cons = label_df  # keep the number of training samples fixed
    while (f1_score - f1_score_old) > 0.0001:  # iterate until the f1-score stops improving
        if flag == 0:  # skip the update on the first pass to avoid the effect of the changed sample size
            f1_score_old = f1_score
        # Sample the training data: 80% for training, 20% for validation
        print("Full sample: %d rows, %d columns" % (label_df.shape[0], label_df.shape[1]))
        train_label_df = label_df  # train on the full set instead of sample(frac=0.8)
        print("Training set: %d rows, %d columns" % (train_label_df.shape[0], train_label_df.shape[1]))
        test_label_df = label_df_cons.sample(frac=0.3)  # measure the f1-score on a sample of the original training set
        print("Validation set: %d rows, %d columns" % (test_label_df.shape[0], test_label_df.shape[1]))

        # Model training
        label_X = train_label_df[['pro_code', 'city_id', 'age', 'sex', 'account_age',
                                  'txn_count', 'txn_amount_mean', 'txn_min_amount']]
        label_X = preprocessing.scale(label_X)  # standardise
        label_y = train_label_df['label']
        model = LogisticRegression()  # DecisionTreeClassifier()
        # if flag == 0:
        #     model = DecisionTreeClassifier()  # decision tree on the first (pre-training) pass
        # else:
        #     model = LogisticRegression()      # logistic regression afterwards
        model.fit(label_X, label_y)

        if flag == 0:  # model validation; no scoring on the first pass
            expected = test_label_df['label']
            predicted_X = test_label_df[['pro_code', 'city_id', 'age', 'sex', 'account_age',
                                         'txn_count', 'txn_amount_mean', 'txn_min_amount']]
            predicted_X = preprocessing.scale(predicted_X)  # standardise
            predicted = model.predict(predicted_X)
            f1_score = metrics.f1_score(expected, predicted)  # model evaluation
            print(f1_score)
        flag = int(0)

        if f1_score_old < f1_score:
            # Pseudo-label the unlabelled samples and add them to the training set
            unlabel_X = unlabel_df[['pro_code', 'city_id', 'age', 'sex', 'account_age',
                                    'txn_count', 'txn_amount_mean', 'txn_min_amount']]
            unlabel_X_noScale = unlabel_X
            unlabel_X = preprocessing.scale(unlabel_X)  # standardise
            unlabel_y = model.predict(unlabel_X)
            out_y = pd.DataFrame(unlabel_y.reshape(-1, 1), columns=['label'])
            unlabel_X_new = unlabel_X_noScale.join(out_y, how='left')
            label_df = pd.DataFrame()  # clear the old samples
            label_df = label_df_cons.append(unlabel_X_new)  # build the new training set
        else:
            # Iterative training has converged: output the result
            unlabel_X = unlabel_df[['pro_code', 'city_id', 'age', 'sex', 'account_age',
                                    'txn_count', 'txn_amount_mean', 'txn_min_amount']]
            unlabel_info = unlabel_df[['phone', 'voucher_no']]
            unlabel_X = preprocessing.scale(unlabel_X)  # standardise
            unlabel_y = model.predict_proba(unlabel_X)[:, 1]  # predicted probabilities; select positives by threshold
            out_y = pd.DataFrame(unlabel_y, columns=['prob'])  # probability of the positive class
            outset = unlabel_info.join(out_y, how='left')  # assemble the output
            outset["label"] = outset.apply(lambda x: 0 if x["prob"] < 0.55 else 1, axis=1)
            outset = outset[outset['label'] == 1]
            outset = outset[['phone', 'voucher_no', 'label']]
            outsetds = pd.DataFrame(outset)
            outsetds.to_csv(r'D:\gd_delta.csv', index=False, header=None)  # write the predictions

            # f1 evaluation (kept for reference)
            # unlabel_X = pd.DataFrame(unlabel_X, columns=['pro_code', 'city_id', 'age', 'sex', 'account_age',
            #                                              'txn_count', 'txn_amount_mean', 'txn_min_amount'])
            # print(unlabel_X.head(5))
            # outset = unlabel_X.join(out_y, how='left')
            # outset["label"] = outset.apply(lambda x: 0 if x["prob"] < 0.57 else 1, axis=1)
            # expected = outset['label']
            # predicted_X = outset[['pro_code', 'city_id', 'age', 'sex', 'account_age',
            #                       'txn_count', 'txn_amount_mean', 'txn_min_amount']]
            # predicted_X = preprocessing.scale(predicted_X)  # standardise
            # predicted = model.predict(predicted_X)
            # f1_score = metrics.f1_score(expected, predicted)  # model evaluation
            # print(f1_score)  # 0.855946148093

            # leave the loop
            break


if __name__ == '__main__':
    start = time.clock()
    main()
    end = time.clock()
    print('finish all in %s' % str(end - start))
```

Another way to curb overfitting is regularization: when optimizing the objective (or cost) function, a regularization term is added to it, typically an L1 or L2 penalty. Whether and how this applies depends on the scikit-learn model's parameters and its internal algorithm.
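For `LogisticRegression` specifically, the penalty type and strength are exposed as constructor parameters. The sketch below is only an illustration of those parameters, not part of the competition code above; the `make_classification` data is synthetic and exists just to keep the snippet self-contained.

```python
# Minimal sketch of L1/L2 regularization with scikit-learn's LogisticRegression.
# Not part of the competition code; the data here is synthetic.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=8, random_state=0)

# L2 is the default penalty; C is the inverse of regularization strength,
# so a smaller C means a stronger penalty.
l2_model = LogisticRegression(penalty='l2', C=0.1)
l2_model.fit(X, y)

# An L1 penalty needs a solver that supports it, e.g. 'liblinear' or 'saga';
# it tends to drive some coefficients exactly to zero (implicit feature selection).
l1_model = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
l1_model.fit(X, y)
print(l1_model.coef_)
```

The usual way to pick the regularization strength is to tune `C` on a held-out validation split (for example with `GridSearchCV`) rather than fixing it by hand.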