日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

当前位置: 首页 > 编程资源 > 编程问答 > 内容正文

编程问答

预测分析·民宿价格预测baseline

发布时间:2023/12/14 编程问答 33 豆豆
生活随笔 收集整理的这篇文章主要介绍了 预测分析·民宿价格预测baseline,小编觉得挺不错的,现在分享给大家,帮大家做个参考。

大家好,我是小澤
预测分析·民宿价格预测比赛是和鲸社区与ChallengeHub联合举办的一场新手赛,本文旨在从多个角度构建特征工程,帮助选手快速上手比赛。
比賽鏈接
話不多說,直接開!

导入相关库

import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool

# Load the training and test sets and stack them into one frame so that every
# feature-engineering step is applied to both in a single pass.
# NOTE(review): the file names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the actual files.
train = pd.read_csv('./训练集.csv')
test = pd.read_csv('./测试集.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index is deliberately left off so the original row labels are kept,
# exactly as append() did.
df_features = pd.concat([train, test])

填充缺失值并进行相关 Encoding 操作

# Missing-value handling, label encoding and frequency encoding.
# NOTE(review): column names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the CSV header.
#
# Every fill is assigned back to the frame: the chained
# `df[col].fillna(..., inplace=True)` form is deprecated and does nothing
# under pandas Copy-on-Write.

# Room/bed counts: -1 marks "unknown".
for count_col in ['洗手间数量', '床的数量', '卧室数量']:
    df_features[count_col] = df_features[count_col].fillna(-1)

# Categorical columns: 'na' becomes its own category.
for cat_col in ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']:
    df_features[cat_col] = df_features[cat_col].fillna('na')

# Host response rate: fill missing with '-1', strip the '%' sign, cast to int.
df_features['房主回复率'] = (
    df_features['房主回复率']
    .fillna('-1')
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(int)
)

# Listing score: impute with the mean over the combined train+test frame.
mean_score = df_features['民宿评分'].mean()
df_features['民宿评分'] = df_features['民宿评分'].fillna(mean_score)

# Integer-encode the categorical columns filled above.
for feat in ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']:
    lbl = LabelEncoder()
    df_features[feat] = lbl.fit_transform(df_features[feat])


def freq_enc(df, col):
    """Add a ``{col}_freq`` column holding each value's relative frequency.

    Args:
        df: DataFrame to extend; it is modified in place.
        col: Name of the column to frequency-encode.

    Returns:
        The same DataFrame, with the new ``{col}_freq`` column. Values that
        are NaN in ``col`` map to NaN, because ``dropna=True`` keeps them out
        of the frequency table.
    """
    vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
    df[f'{col}_freq'] = df[col].map(vc)
    return df


for feat in ['容纳人数', '洗手间数量', '床的数量', '床的类型',
             '卧室数量', '取消条款', '所在城市', '清洁费',
             '房主是否有个人资料图片', '房主回复率', '是否支持随即预订',
             '民宿周边', '房产类型', '房型', '邮编']:
    df_features = freq_enc(df_features, feat)

對時間特征進行處理

# Time-feature processing.
# NOTE(review): column names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the CSV header.
from tqdm import tqdm


def _to_epoch_seconds(dates):
    """Convert a date-like column to integer Unix timestamps in seconds.

    The values are first pinned to nanosecond resolution so that the
    ``// 10 ** 9`` division is correct whatever unit ``pd.to_datetime``
    chose. Missing dates (NaT) come out as a large negative sentinel rather
    than NaN, which is also what the original inline expression produced.
    """
    stamps = pd.to_datetime(dates).values.astype('datetime64[ns]')
    return stamps.astype(np.int64) // 10 ** 9


for date_col in ['首次评论日期', '何时成为房主', '最近评论日期']:
    df_features[date_col] = _to_epoch_seconds(df_features[date_col])

# Pairwise gaps (in seconds) between the three dates.
df_features['timestamp_diff1'] = df_features['首次评论日期'] - df_features['何时成为房主']
df_features['timestamp_diff2'] = df_features['最近评论日期'] - df_features['首次评论日期']
df_features['timestamp_diff3'] = df_features['最近评论日期'] - df_features['何时成为房主']


def brute_force(df, features, groups):
    """Add group-wise aggregate columns for every feature/group pair.

    For each aggregation in max/min/mean/median/std, each feature and each
    grouping column, adds a column ``{group}_{feature}_{method}`` holding the
    aggregate of ``feature`` within the row's ``group`` value.

    Args:
        df: DataFrame to extend; it is modified in place.
        features: Names of the numeric columns to aggregate.
        groups: Names of the columns to group by.

    Returns:
        The same DataFrame with the aggregate columns added.
    """
    for method in tqdm(['max', 'min', 'mean', 'median', 'std']):
        for feature in features:
            for group in groups:
                df[f'{group}_{feature}_{method}'] = df.groupby(group)[feature].transform(method)
    return df


dense_feats = ['timestamp_diff1', 'timestamp_diff2', 'timestamp_diff3']
cate_feats = ['房型']

df_features = brute_force(df_features, dense_feats, cate_feats)

其他简单业务特征

# Simple business features.
# NOTE(review): column names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the CSV header.


def f(x):
    """Return 1 if ``x`` is strictly positive, otherwise 0."""
    return 1 if x > 0 else 0


# Presence flags. These are computed from df_features, not from `train`:
# assigning a train-indexed Series aligned on row labels, so test rows
# received flags belonging to unrelated training rows.
df_features['if_bed'] = df_features['床的数量'].apply(f)
df_features['if_bedroom'] = df_features['卧室数量'].apply(f)
df_features['if_wc'] = df_features['洗手间数量'].apply(f)

# Cross/ratio features. The 1e-3 term avoids division by zero.
df_features['人均床数量'] = df_features['容纳人数'] / (df_features['床的数量'] + 1e-3)
df_features['人均卧室量'] = df_features['容纳人数'] / (df_features['卧室数量'] + 1e-3)
df_features['卧室床均量'] = df_features['床的数量'] / (df_features['卧室数量'] + 1e-3)
# Euclidean norm of the (latitude, longitude) pair.
df_features['经纬度平方根'] = (
    df_features['维度'] * df_features['维度']
    + df_features['经度'] * df_features['经度']
) ** .5


def get_features(df):
    """Add row-wise std/max/min/difference features for two ratio pairs.

    For each pair ``(a, b)`` adds ``{a}_{b}_std``, ``{a}_{b}_max`` and
    ``{a}_{b}_min`` (computed across the two columns for each row) and
    ``{a}_{b}_sub`` (``a - b``).

    Args:
        df: DataFrame that already contains the ratio columns; it is
            modified in place.

    Returns:
        The same DataFrame with the new columns added.
    """
    features = [['人均床数量', '人均卧室量'], ['卧室床均量', '人均卧室量']]
    for fea in features:
        df[f'{fea[0]}_{fea[1]}_std'] = df[fea].std(axis=1)
        df[f'{fea[0]}_{fea[1]}_max'] = df[fea].max(axis=1)
        df[f'{fea[0]}_{fea[1]}_min'] = df[fea].min(axis=1)
        df[f'{fea[0]}_{fea[1]}_sub'] = df[fea[0]] - df[fea[1]]
    return df


df_features = get_features(df_features)

对“便利设施”特征进行挖掘

# Mine the amenities column: item count plus TF-IDF -> SVD embeddings.
# NOTE(review): column names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the CSV header.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def _count_amenities(raw):
    """Count comma-separated items in a ``{a,b,c}`` amenities string.

    An empty list (``{}``) counts as 0; a plain ``split(',')`` on the empty
    string would report 1.
    """
    inner = raw.lstrip('{').rstrip('}')
    return len(inner.split(',')) if inner else 0


# Drop braces, quotes and colons; turn commas into spaces so that the
# vectorizer tokenizes on whitespace.
_AMENITY_CLEANUP = str.maketrans({'{': None, '}': None, '"': None, ':': None, ',': ' '})

df_features['便利设施数量'] = df_features['便利设施'].apply(_count_amenities)
df_features['便利设施'] = df_features['便利设施'].apply(lambda x: x.translate(_AMENITY_CLEANUP))

n_components = 12

X = list(df_features['便利设施'].values)
tfv = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
X_tfidf = tfv.fit_transform(X)

# TruncatedSVD's default solver is randomized; seed it so the embedding
# columns are reproducible from run to run.
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)

for i in range(n_components):
    df_features[f'便利设施_tfidf_{i}'] = X_svd[:, i]

获取特征和标签数据

# Split the combined frame back into train and test on whether the price
# label is present, then build the model inputs.
# NOTE(review): column names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the CSV header.
has_price = df_features['价格'].notnull()

df_train = df_features[has_price]
# Positional row labels are needed for the K-fold indexing done later.
df_train = df_train.reset_index(drop=True)
df_test = df_features[~has_price]

# Columns that must not be fed to the model: the row id, the label, and the
# raw amenities text.
no_features = ['数据ID', '价格', '便利设施']
# Input feature columns.
features = [col for col in df_train.columns if col not in no_features]

X = df_train[features]        # training inputs
y = df_train['价格']           # training labels
X_test = df_test[features]    # test inputs

五折Catboost模型

# 5-fold CatBoost training, out-of-fold scoring and submission file.
# NOTE(review): column names were restored from the scraped article, where
# they carried injected pinyin annotations -- confirm against the CSV header.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)

# Columns CatBoost should treat as categorical (the same for every fold).
cate_features = ['房主是否有个人资料图片', '房主身份是否验证', '是否支持随即预订',
                 '房产类型', '房型', 'if_bed', 'if_bedroom', 'if_wc']

oof = np.zeros(len(X))                # out-of-fold predictions on train
prediction = np.zeros(len(X_test))    # fold predictions on test, summed
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    # KFold yields positions, so select the labels positionally as well.
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid, cat_features=cate_features)

    cbt_model = CatBoostRegressor(
        # Note: the score quoted for the baseline was obtained with
        # iterations=60000, but that takes rather long to run.
        iterations=10000,
        # Note: several properties in fact converge very slowly at lr=0.1;
        # consider raising it later.
        learning_rate=0.1,
        eval_metric='SMAPE',
        use_best_model=True,
        random_seed=42,
        logging_level='Verbose',
        # For GPU training add: task_type='GPU', devices='0', gpu_ram_part=0.5
        early_stopping_rounds=400,
    )
    cbt_model.fit(train_pool, eval_set=eval_pool, verbose=1000)

    y_pred_valid = cbt_model.predict(X_valid)
    y_pred = cbt_model.predict(X_test)
    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred

# Average the test predictions over the folds.
prediction /= n_fold

from sklearn.metrics import mean_squared_error

# RMSE over the out-of-fold predictions. The square root is taken explicitly
# because the `squared=False` argument was removed in scikit-learn 1.6.
score = np.sqrt(mean_squared_error(df_train['价格'].values, oof))
print(score)

# Relies on df_test keeping the row order of `test`.
test['价格'] = prediction
test[['数据ID', '价格']].to_csv('./sub_cat.csv', index=False)

最后线上RMSE可以达到5.3以内,目前可以排到top10左右。
本文主要参考了官方的baseline以及恒哥的代码思路。

如果本文可以帮助到大家,欢迎点个关注!

总结

以上是生活随笔为你收集整理的预测分析·民宿价格预测baseline的全部内容,希望文章能够帮你解决所遇到的问题。

如果觉得生活随笔网站内容还不错,欢迎将生活随笔推荐给好友。