當前位置：首頁 > 编程语言 > python >内容正文

python

用python-sklearn做广州房价预测——以此为例说明如何使用python做简单的数据分析

發布時間：2023/12/8 python 25 豆豆

生活随笔收集整理的這篇文章主要介紹了《用python-sklearn做广州房价预测——以此为例说明如何使用python做简单的数据分析》，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

0 數據

廣州市二手房價數據

大概有500條廣州市二手房價數據

python數據導入

# Numerical and tabular-data libraries.
import numpy as np
import pandas as pd

# Plotting libraries.
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

plt.style.use(style="ggplot")
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures
sns.set(font='SimHei', style="whitegrid", palette="binary")  # make seaborn use the same Chinese-capable font

# Read the listings. Column names are supplied explicitly via `names=`.
train_names = [
    "總價（萬元）", "均價（元/平方米）", "房間數", "大廳數", "所在樓層", "總樓層", "朝向",
    "房屋結構", "裝修", "面積（平方米）", "建成時間", "樓齡", "所在區域",
]
train = pd.read_csv("data_guangzhou.csv", names=train_names, encoding='gb2312')
# NOTE(review): with `names=` pandas treats every line as data, so a header
# row in the CSV (if there is one) would end up as row 0 — the two disabled
# lines below suggest that was considered; confirm against the file.
# train = train.drop(0)
# train = train.dropna()

# Columns read as text are converted to numeric dtypes before analysis.
numeric_columns = [
    "總價（萬元）", "均價（元/平方米）", "面積（平方米）",
    "房間數", "大廳數", "總樓層", "樓齡",
]
for column in numeric_columns:
    train[column] = pd.to_numeric(train[column])

1 數據分析

房價分布

# Distribution of the total listing price, with its skewness printed first.
plt.figure(figsize=(10, 5))
total_price = train["總價（萬元）"]
print("skew: ", total_price.skew())
sns.distplot(total_price, color="b")
# dpi fixes the resolution of the saved image.
plt.savefig('總價（萬元）.png', dpi=200, bbox_inches='tight')
plt.show()

# Price distribution after a natural-log transform.
target = np.log(train["總價（萬元）"]).rename("總價（萬元）的對數")

plt.figure(figsize=(10, 5))
sns.distplot(target, color="b")  # drawn with seaborn
# dpi fixes the resolution of the saved image.
plt.savefig('總價（萬元）-log.png', dpi=200, bbox_inches='tight')

不同因素對房價的影響

# Mean total price per district, with the districts in a fixed display order.
district_order = ["天河", "荔灣", "越秀", "黃埔", "海珠", "白云", "番禺", "南沙", "增城", "花都", "從化"]

# Start a fresh figure so the bars are not drawn on top of whatever figure
# an earlier snippet left open (this matters when run as a plain script).
plt.figure()
g = sns.barplot(x="所在區域", y="總價（萬元）", data=train, color="b", order=district_order)
# dpi fixes the resolution of the saved image.
plt.savefig('所在區域-總價.png', dpi=200, bbox_inches='tight')
plt.show()

# Joint distribution of floor area and total price.
# `jointplot` is a figure-level function that builds its own figure, so no
# `plt.figure()` call is made here (it would only leave an empty figure).
fig1 = sns.jointplot(x="面積（平方米）", y="總價（萬元）", data=train, color="b")
# Save through the returned grid so the right figure is written regardless
# of which figure is currently active; dpi fixes the image resolution.
fig1.savefig('面積-總價.png', dpi=200, bbox_inches='tight')

# Mean total price for each building-structure category.
plt.figure()
fig1 = sns.barplot(
    data=train,
    x="房屋結構",
    y="總價（萬元）",
    color="b",
)
# dpi fixes the resolution of the saved image.
plt.savefig('房屋結構-總價（萬元）.png', dpi=200, bbox_inches='tight')

# Mean price per square metre for each decoration (fit-out) category.
# Open a new figure first, as the neighbouring plots do; otherwise the bars
# are drawn onto whatever figure is still active when run as a script.
plt.figure()
sns.barplot(x="裝修", y="均價（元/平方米）", data=train, color="b")
# NOTE(review): the file name says "total price" but the y axis is the
# average price per square metre — confirm which one was intended.
plt.savefig('裝修-總價.png', dpi=200, bbox_inches='tight')  # dpi fixes the image resolution

2 數據變換

有一些因素無法直接作為輸入變量輸入到數學模型中，需要進行編碼。如裝修情況、房屋結構等。
使用獨熱編碼對這些因素進行處理。編碼前，樓層為“中”、“低”、“高”的文字描述；編碼后，每個取值各占一列，每條記錄只在所屬取值的那一列取1，其余列取0。

朝向按東、西、南、北四個方向分別編碼，一個朝向可以同時包含多個方向。
例如東南朝向：東1、南1、西0、北0。

獨熱編碼的代碼：

# One-hot encode the single-valued categorical columns: one 0/1 column per
# distinct value.
floor = pd.get_dummies(train["所在樓層"])
structure = pd.get_dummies(train["房屋結構"])
fitment = pd.get_dummies(train["裝修"])
location = pd.get_dummies(train["所在區域"])

# An orientation string may name several compass directions at once, so it
# is encoded as four independent 0/1 flags rather than one-hot.
# The frame is built directly on `train.index`, so it always has the same
# length and row labels as `train` and lines up in the `concat` below.
# Missing orientations (NaN) get 0 in every direction.
directions = ["東", "西", "南", "北"]
toward = pd.DataFrame(
    {
        direction: train["朝向"].str.contains(direction, regex=False, na=False).astype(float)
        for direction in directions
    },
    index=train.index,
    columns=directions,
)

# Assemble the modelling table; the target (total price) is the first column.
test = pd.concat(
    [
        train["總價（萬元）"],
        train["房間數"],
        train["大廳數"],
        floor,
        train["總樓層"],
        toward,
        structure,
        fitment,
        train["面積（平方米）"],
        train["樓齡"],
        location,
    ],
    axis=1,
)

3 相關性分析

# Pairwise Pearson correlations between all columns of the modelling table.
corrMat = test.corr()

# Hide the strict upper triangle; the lower triangle and diagonal are drawn.
# The mask is built from the matrix shape only, so a correlation that
# happens to be exactly 0 cannot un-hide a cell.
mask = np.triu(np.ones(corrMat.shape, dtype=bool), k=1)

plt.subplots(figsize=(20, 10))
plt.xticks(rotation=60)  # angle of the tick labels
fig1 = sns.heatmap(corrMat, mask=mask, vmax=.8, square=True, annot=True)
plt.savefig('相關性矩陣.png', dpi=200, bbox_inches='tight')  # dpi fixes the image resolution

# Correlation of every column with the total price, strongest first.
print(corrMat["總價（萬元）"].sort_values(ascending=False))

各因素和房價的皮爾遜相關系數：

# Horizontal bar chart of each factor's correlation with the total price.
# Position 0 is the price's correlation with itself, so the slice starts at 1.
price_corr = corrMat["總價（萬元）"]
fig1 = price_corr[1:31].plot(
    kind="barh",
    color='b',
    fontsize=12,
    figsize=(10, 8),
)
# dpi fixes the resolution of the saved image.
plt.savefig('相關性.png', dpi=200, bbox_inches='tight')

4 房價預測模型

from sklearn import preprocessing
from sklearn import linear_model, svm, gaussian_process
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
import warnings

warnings.filterwarnings('ignore')

# Feature columns: positions 1..30 of the modelling table (position 0 is the
# target, the total price).
cols = test.columns[1:31]
x = test.loc[:, cols].values
y = test['總價（萬元）'].values

# Min-max scale features and target, then hold out 2% of the rows for testing.
x_scaled = preprocessing.MinMaxScaler().fit_transform(x)
y_scaled = preprocessing.MinMaxScaler().fit_transform(y.reshape(-1, 1))
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, y_scaled, test_size=0.02, random_state=42
)

# Candidate regressors, keyed by the label used in the printed output.
clfs = {
    '支持向量機': svm.SVR(),
    '隨機森林': RandomForestRegressor(),
    '貝葉斯嶺回歸': linear_model.BayesianRidge(),
}

for name, model in clfs.items():
    try:
        # The scaler returns an (n, 1) column; the regressors expect a 1-D
        # target, so flatten it explicitly.
        model.fit(X_train, y_train.ravel())
        y_pred = model.predict(X_test)
        y_true = y_test.reshape(len(y_pred))
        # Mean absolute relative error on the scaled target.
        # NOTE(review): the target is min-max scaled, so the smallest price
        # maps to 0 and would make this ratio infinite if it lands in the
        # test split.
        cost = np.sum(abs((y_pred - y_true) / y_true)) / len(y_pred)
        print(name + " cost:" + str(cost))
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(name + " Error:")
        print(str(e))

模型訓練結束之后，看各模型在測試集上的結果：

# One stacked subplot per fitted model: true test targets in red, the
# model's predictions in blue, with the error shown in the title.
plt.figure()  # fresh figure so the panels are not drawn onto an earlier plot
for i, (name, model) in enumerate(clfs.items(), start=1):
    y_pred = model.predict(X_test)
    y_true = y_test.reshape(len(y_pred))
    # Mean absolute relative error on the scaled target.
    cost = np.sum(abs((y_pred - y_true) / y_true)) / len(y_pred)

    plt.subplot(len(clfs), 1, i)
    plt.plot(y_true, color="r", linestyle='-', marker='+', markersize=2, linewidth=0.5)
    plt.plot(y_pred, color='b', linestyle='-', marker='o', markersize=2, linewidth=0.5)
    # Format the number rather than truncating its string form, which
    # mangles values such as 'inf' or ones printed in scientific notation.
    plt.title(f"{name} cost:{cost:.2f}")

# Leave vertical room between the panels so titles do not collide.
plt.subplots_adjust(hspace=0.8, wspace=0.5)
plt.savefig('結果-1.png', dpi=200, bbox_inches='tight')  # dpi fixes the image resolution

隨機森林模型誤差最小。

5 總結

最重要的不是最后的模型訓練，而是在訓練模型之前做的數據分析工作：對不同因素進行分析、做特征處理、做相關性分析等。

請給我點一個贊~

總結

以上是生活随笔為你收集整理的用python-sklearn做广州房价预测——以此为例说明如何使用python做简单的数据分析的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： [数据结构与算法]输出1~10000中的
下一篇： python十进制转化为二进制