

House Price Data Scraping and Analysis



  • 1. Crawling the URLs
  • 2. Scraping listing details from the URLs
  • 3. Data processing
  • 4. Correlation analysis
  • 5. Principal component analysis
  • Result images

The code is uploaded here just for the record; there is plenty of room for improvement.

1. Crawling the URLs

Crawl all the search URLs:

1. Get the district links.
2. From each district link, get the road links.
3. If a road link has more than 5 result pages, drill down further by price band (the code also supports drilling down by floor area and layout).

Every URL collected is stored with a hierarchical level number:

Level no.   Category     URL
1           city         URL
1-1         district     URL 1
1-1-1       road         URL 2
1-1-1-1     price band   URL 3
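The scripts below persist these URLs to MySQL (tables hours_url_6 / hours_url_8 / hours_url_9), but the post never shows the schema. pandas' to_sql creates an equivalent table automatically on the first append; still, a sketch of the resulting layout helps read the code. The column names match what save_mysql writes; the types and lengths are assumptions:

    # Hypothetical layout of the URL tables -- to_sql would create something
    # equivalent automatically on the first append. Types/lengths are guesses.
    import pymysql

    ddl = """
    CREATE TABLE IF NOT EXISTS hours_url_8 (
        name VARCHAR(64),    -- link text (district / road / price-band label)
        url  VARCHAR(255),   -- the search URL itself
        num  VARCHAR(32),    -- hierarchical level number such as '1-1-1'
        page INT             -- number of result pages behind this URL
    )
    """

    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           database='gethtml', charset='utf8')
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()
    conn.close()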

import requests
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings()  # suppress warnings for unverified HTTPS requests
from sqlalchemy import create_engine
import pandas as pd

def gethtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=3, verify=False)
        r.raise_for_status()
        r.encoding = 'utf8'
        return r.text
    except Exception as e:
        print('error of ', e.args)
        return ''

def get_url(url, list_url, M):
    """Collect the level-M filter links from a search page into list_url."""
    html = gethtml(url[1])
    demo = BeautifulSoup(html, 'html.parser')
    div = demo.find_all('div', attrs={'class': 'items'})
    hf = []
    try:
        if M == 1:
            hf = div[0].find_all('a')  # district links
        elif M == 2:
            hf = div[0].find('div', attrs={'class': 'sub-items'}).find_all('a')  # road links
        elif M == 3:
            hf = div[1].find_all('a')  # price bands
        elif M == 4:
            hf = div[2].find_all('a')  # floor-area bands
        elif M == 5:
            hf = div[3].find_all('a')  # layouts
    except Exception as e:
        print('error', e.args)
    # work out how many result pages sit behind this URL
    page = 999
    try:
        page = len(demo.find('div', attrs={'class': 'sale-left'})
                       .find('div', attrs={'class': 'multi-page'})
                       .find_all(['a', 'i'])) - 2
    except:
        try:
            # no pager: count the listings shown after filtering
            num = len(demo.find('div', attrs={'class': 'sale-left'})
                          .find('ul', attrs={'id': 'houselist-mod-new'})
                          .find_all('a'))
            page = 0 if num == 0 else 1
        except:
            page = 0
    if page > 0:
        try:
            count = 0
            for i in hf:
                count = count + 1
                list_url[i.attrs['href']] = (i.text, i.attrs['href'], url[2] + '-' + str(count), page)
        except Exception as e:
            print('error', e.args)

def save_mysql(data, table_url='hours_url_6'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create connection
    df = pd.DataFrame(data).T
    df.columns = ['name', 'url', 'num', 'page']
    pd.io.sql.to_sql(df, table_url, con=conn, if_exists='append', index=False)  # if_exists: 'replace' or 'append'
    conn.dispose()

def main():
    # filter levels on the page: 区域 (district), 售价 (price), 面积 (area), 房型 (layout)
    # list_url = {url: (title, url, url_no, page)}
    url = 'https://xm.anjuke.com/sale/'
    url_no = '1'  # hierarchical level number of the URL
    ls_1 = {}
    list_url = {}
    get_url(['城市', url, url_no], list_url, 1)  # level 1: districts
    save_mysql(list_url)
    ls = list_url.copy()
    for depth in range(2, 6):
        list_url.clear()
        ls_1.clear()
        count = 0
        for i in ls:
            count += 1
            url = ls[i]
            get_url(url, list_url, depth)
            if len(list_url) > 0:
                # a get_id flag column is added to the stored URLs later (see part 2)
                save_mysql(list_url, 'hours_url_8')
                # only URLs with more than 5 result pages need drilling down
                for k in list_url:
                    if list_url[k][3] > 5:
                        ls_1[k] = list_url[k]
                list_url.clear()
            print('\r', 'depth: %s, progress: %s/%s' % (depth, count, len(ls)), end='')
        print('\r', 'depth: %s, URLs to drill into: %s' % (depth, len(ls_1)))
        ls = ls_1.copy()

def main_1():
    # resume run: start from a fixed set of district URLs
    list_url = {
        # 'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
        # 'https://xm.anjuke.com/sale/huli/': ('湖里', 'https://xm.anjuke.com/sale/huli/', '1-2', 8),
        # 'https://xm.anjuke.com/sale/jimei/': ('集美', 'https://xm.anjuke.com/sale/jimei/', '1-3', 8),
        # 'https://xm.anjuke.com/sale/haicang/': ('海沧', 'https://xm.anjuke.com/sale/haicang/', '1-4', 8),
        # 'https://xm.anjuke.com/sale/tongana/': ('同安', 'https://xm.anjuke.com/sale/tongana/', '1-5', 8),
        'https://xm.anjuke.com/sale/xiangana/': ('翔安', 'https://xm.anjuke.com/sale/xiangana/', '1-6', 8),
        'https://xm.anjuke.com/sale/xiamenzhoubian/': ('厦门周边', 'https://xm.anjuke.com/sale/xiamenzhoubian/', '1-7', 8)}
    ls = list_url.copy()
    ls_1 = {}
    list_url.clear()
    for depth in range(2, 6):
        list_url.clear()
        ls_1.clear()
        count = 0
        for i in ls:
            count += 1
            url = ls[i]
            get_url(url, list_url, depth)
            if len(list_url) > 0:
                save_mysql(list_url, 'hours_url_9')
                for k in list_url:
                    if list_url[k][3] > 5:
                        ls_1[k] = list_url[k]
                list_url.clear()
            print('\r', 'depth: %s, progress: %s/%s' % (depth, count, len(ls)), end='')
        print('\r', 'depth: %s, URLs to drill into: %s' % (depth, len(ls_1)))
        ls = ls_1.copy()
main()

2. Scraping listing details from the URLs

Fetch each listing's details from its ID, checking first whether that ID has already been scraped. One gap worth flagging: part 1 stores search-result URLs, while the script below reads listing IDs from a table called hours_url_id that the post never populates; a hedged sketch of that missing bridge follows.
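A minimal sketch of the missing ID-harvesting step, assuming listing links have the form https://xm.anjuke.com/prop/view/<ID> and that hours_url_id has a unique ID column plus a get_id flag (both names match what the script below reads; the regex and table constraints are guesses):

    # Hypothetical bridge between part 1 and part 2: walk a saved search URL,
    # pull the listing IDs out of the result page, and store them with a
    # NULL get_id flag so part 2 will pick them up.
    import re
    import pymysql
    import requests
    from bs4 import BeautifulSoup

    def harvest_ids(search_url):
        headers = {'User-Agent': 'Mozilla/5.0'}
        html = requests.get(search_url, headers=headers, timeout=3).text
        soup = BeautifulSoup(html, 'html.parser')
        ids = set()
        for a in soup.find_all('a', href=True):
            # listing pages are assumed to look like /prop/view/<ID>
            m = re.search(r'/prop/view/([A-Za-z0-9]+)', a['href'])
            if m:
                ids.add(m.group(1))
        return ids

    def save_ids(ids):
        conn = pymysql.connect(host='localhost', user='root', password='root123',
                               database='gethtml', charset='utf8')
        with conn.cursor() as cur:
            # INSERT IGNORE assumes ID is declared UNIQUE/PRIMARY KEY
            cur.executemany(
                'INSERT IGNORE INTO hours_url_id (ID, get_id) VALUES (%s, NULL)',
                [(i,) for i in ids])
        conn.commit()
        conn.close()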

import pandas as pd
import pymysql
from bs4 import BeautifulSoup
import requests
import re
from sqlalchemy import create_engine
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_url(url):
    req = None
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
        req = requests.get(url, headers=headers, timeout=3, verify=False)
        req.raise_for_status()
        req.encoding = 'utf-8'
        return req.text
    except Exception as e:
        error_no = req.status_code if req is not None else 0
        if error_no == 404 or error_no == 503:
            return error_no
        else:
            print('error', e.args)
            return 1

def html_demo(html, list_data):
    pat = re.compile(r'[\:]')
    demo = BeautifulSoup(html, 'html.parser')
    # listing attributes: each houseInfo-detail-item holds a name/value pair
    li = demo.find_all('li', attrs={'class': 'houseInfo-detail-item'})
    for i in li:
        div = i.find_all('div')
        count = 1
        for j in div:
            if count == 1:
                a = ''.join(pat.sub('', j.text.strip()).split())
            elif count == 2:
                b = ''.join(pat.sub('', j.text.strip()).split())
                list_data[a] = b
            else:
                continue
            count += 1
    # agent info
    b_info = demo.find('div', attrs={'class': 'broker-wrap'})
    brokercard_name = b_info.find('div', attrs={'class': 'brokercard-name'}).text.split()[0]
    list_data['销售'] = brokercard_name
    # ratings (label/value pairs in alternating <em> tags)
    em = b_info.find_all('em')
    for i in range(3):
        list_data[pat.sub('', em[i*2].text)] = em[i*2+1].text
    # agency name
    try:
        gs_name = b_info.find('a', attrs={'class': 'text-overflow'}).attrs['title'].split(':')
        list_data[gs_name[0]] = gs_name[1]
    except:
        pass

def save_mysql(df, database='hours_data'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create connection
    pd.io.sql.to_sql(df, database, con=conn, if_exists='append', index=False)  # if_exists: 'replace' or 'append'
    conn.dispose()

def update_id(id_num, database='hours_url_id'):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           charset='utf8', database='gethtml')
    cursor = conn.cursor()
    sql = "update %s set get_id=1 where ID='%s'" % (database, id_num)
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()

def get_id(id_table):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           database='gethtml', charset='utf8')
    cursor = conn.cursor()
    sql = 'select distinct ID from %s where get_id is null' % id_table
    cursor.execute(sql)
    data_id = cursor.fetchall()
    cursor.close()
    conn.close()
    return data_id

def data_exists(table_name, url_no):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           charset='utf8', database='gethtml')
    cursor = conn.cursor()
    sql = 'select ID from %s where ID="%s"' % (table_name, url_no)
    # pymysql's execute() returns the number of matched rows for a SELECT
    count = cursor.execute(sql)
    cursor.close()
    conn.close()
    return count

def main():
    data = get_id('hours_url_id')
    start_url = 'https://xm.anjuke.com/prop/view/'
    count = 0
    n = len(data)
    for id in data:
        count += 1
        print('\r', 'progress: %s/%s' % (count, n), end='')
        # skip IDs whose details are already stored in hours_data
        count_id = data_exists('hours_data', id[0])
        if count_id >= 1:
            update_id(id[0])
            continue
        try:
            list_data = {}  # fresh dict per listing so fields don't leak between rows
            url = start_url + id[0]
            html = get_url(url)
            if html == 1:       # other errors: skip
                continue
            elif html == 404:   # listing gone: drop it
                pass
            elif html == 503:
                print('page requires verification: 503')
                break
            else:
                html_demo(html, list_data)
                list_data['ID'] = id[0]
                df = pd.DataFrame(list_data, index=id)
                save_mysql(df)
                update_id(id[0])
        except Exception as e:
            print('error', url, e.args)
            continue

main()

3. Data processing

Cleaning and numeric encoding of the scraped hours_data table.

import numpy as np
import pandas as pd
import pymysql

def get_hours_data():
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           database='gethtml', charset='utf8')
    sql = 'select * from %s ' % 'hours_data'
    df = pd.read_sql(sql, conn, index_col='ID')
    conn.close()
    return df

def hours_data_extract(df):
    df.drop_duplicates(keep='first', inplace=True)
    df = df[~df.index.duplicated()]
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 建筑面积')
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 房屋单价')
    # repair bad rows: drop floors recorded as totals ('共…层') or basements ('地…')
    df = df[(df.所在楼层.str[:1] != '共') & (df.所在楼层.str[:1] != '地')]
    # df.所在楼层 = df.所在楼层.str.replace(['共90层','共74层'], ['共9层','共7层'])
    # convert columns to numeric values / categorical codes
    df.建筑面积 = df.建筑面积.str[:-3].astype(float)
    df.房屋单价 = df.房屋单价.str[:-4].astype(int)
    df.建造年代 = df.建造年代.str[:-1].astype(int)
    df.所在位置 = df.所在位置.str.split('-').str[0].replace(['思明','湖里','集美','海沧','同安','翔安','厦门周边'], [1,2,3,4,5,6,7])
    df.房屋类型 = df.房屋类型.replace(['普通住宅','平房','公寓','暂无'], [1,2,3,4])
    df[['室', '厅', '卫']] = df['房屋户型'].str.extract(r'(\d+)室(\d+)厅(\d+)卫', expand=True).astype(int)
    df['均面积'] = df['建筑面积'] / (df['室'] + df['厅'])  # area per room
    df['楼高'] = df.loc[:, '所在楼层'].str[4:].str[:-2].astype(int)
    df.楼高 = df.楼高.replace([90, 74], [9, 7])  # fix two known typos in the source data
    df['层级'] = df.loc[:, '所在楼层'].str[:2].replace(['低层','中层','高层'], [1,2,3]).astype(int)
    df.装修程度 = df.装修程度.replace(['精装修','简单装修','豪华装修','毛坯'], [1,2,3,4])
    df.产权年限 = df.产权年限.replace(['70年','50年','40年'], [1,2,3])
    df.配套电梯 = df.配套电梯.replace(['无','有'], [0,1])
    df.房本年限 = df.房本年限.replace(['满五年','满二年','不满二年','暂无'], [1,2,3,4])
    df.产权性质 = df.产权性质.replace(['商品房住宅','商住两用','单位集体自建房','使用权','保障性住房','动迁配套房','其他'], [1,2,3,4,5,6,7])
    df.唯一住房 = df.唯一住房.replace(['是','否','暂无'], [1,0,0])
    df.房屋朝向 = df.房屋朝向.replace(['东','南','西','北','东西','南北','东南','东北','西南','西北'], [1,2,3,4,5,6,7,8,9,10])
    df['price_group'] = df.房屋单价 // 10000  # 10k-yuan price band
    # drop columns that are no longer needed
    df = df.drop(['所属小区','房屋户型','参考月供','所在楼层','参考首付','销售','房源','服务','评价','工商注册名称','一手房源'], axis=1)
    return df

def getdata():
    df = get_hours_data()
    data = hours_data_extract(df)
    return data
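A quick way to sanity-check the module above (saved as hoursdata.py, which the next section imports); a minimal usage sketch, with output depending on whatever survives the drops in hours_data_extract:

    # Load the cleaned frame and eyeball the conversions.
    import hoursdata

    df = hoursdata.getdata()
    print(df.shape)    # rows surviving deduplication and repair
    print(df.dtypes)   # the feature columns should all be numeric by now
    print(df[['建筑面积', '房屋单价', '均面积']].describe())
    print(df['price_group'].value_counts().sort_index())  # listings per 10k-yuan band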

4. Correlation analysis

Explore pairwise correlations in the cleaned data, after trimming outliers in per-room area.

import hoursdata
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

df = hoursdata.getdata()
# keep ordinary commodity housing only (产权性质=1, 房屋类型=1 after encoding)
df = df[(df.产权性质 == 1) & (df.房屋类型 == 1)]
# df[['所在位置']] = df[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
# df[['房屋朝向']] = df[['房屋朝向']].apply(preprocessing.LabelEncoder().fit_transform)

print(df[['均面积', '建筑面积']].describe())
# keep the middle 80% of per-room area to drop outliers
df_des = df.均面积.describe([.10, .90])
df_1 = df[(df['均面积'] >= df_des['10%']) & (df['均面积'] <= df_des['90%'])].copy()
print('****************\n', df_1[['均面积', '建筑面积']].describe())

plt.hist(df_1.建筑面积, bins=np.linspace(0, 500, 100))
plt.hist(df_1.均面积, bins=np.linspace(0, 100, 100))
plt.show()

plt.scatter(df_1.建筑面积, df_1.房屋单价)
plt.xlabel('面积', fontproperties='SimHei')
plt.ylabel('单价', fontproperties='SimHei')
plt.title('价位', fontproperties='SimHei')
plt.show()

plt.scatter(df_1.建筑面积, df_1.卫)
plt.xlabel('面积', fontproperties='SimHei')
plt.ylabel('卫个数', fontproperties='SimHei')
plt.show()

# one pair first, then the full correlation matrix
a = df_1.房屋单价.corr(df_1.建筑面积)
print(a)
cor = df_1.corr()
# keep only moderate-or-stronger correlations (|r| > 0.3) and drop the diagonal
cor = cor[abs(cor) > 0.3]
cor = cor.replace(1.0, None).dropna(axis=0, how='all').dropna(axis=1, how='all')
print(cor[abs(cor) > .30])

plt.scatter(df.建造年代, df.楼高)
plt.ylabel('高度', fontproperties='SimHei')
plt.title('年代与高度', fontproperties='SimHei')
plt.xlim(1970, 2020)
plt.show()

plt.hist(df.建造年代, bins=np.linspace(1970, 2020))
plt.ylabel('在售数量', fontproperties='SimHei')
plt.title('年份出售量', fontproperties='SimHei')
plt.show()
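A side note on the plotting: instead of passing fontproperties='SimHei' to every label, matplotlib can be pointed at a CJK font once via rcParams (assuming SimHei is installed on the machine). A sketch, reusing df_1 from the script above:

    import matplotlib.pyplot as plt

    # Register a CJK-capable font globally instead of per-call fontproperties.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # assumes the SimHei font is installed
    plt.rcParams['axes.unicode_minus'] = False    # keep minus signs renderable with a CJK font

    plt.scatter(df_1.建筑面积, df_1.房屋单价)  # df_1 from the script above
    plt.xlabel('面积')
    plt.ylabel('单价')
    plt.title('价位')
    plt.show()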

5. Principal component analysis

PCA projects the features down to two dimensions for a visual check of how the price bands separate.

import pandas as pd
import pymysql
import numpy as np
import sklearn.decomposition as dp
import matplotlib.pyplot as plt
from sklearn import preprocessing

def get_data(data_table):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           charset='utf8', database='gethtml')
    sql = "select * from %s where 产权性质='商品房住宅'" % data_table
    df = pd.read_sql(sql, conn, index_col='ID')
    conn.close()
    return df

def data_revision(df):
    df = df.drop_duplicates(keep='first')  # drop duplicated rows
    df = df[~df.index.duplicated()]        # drop duplicated IDs
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 建筑面积')
    df['建筑面积'] = df['建筑面积'].str[:-3].astype(float)
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 房屋单价')
    df['房屋单价'] = df['房屋单价'].str[:-4].astype(int)
    df['建造年代'] = df['建造年代'].str[:-1].astype(int)
    df['所在位置'] = df['所在位置'].str.split('-').str[0]
    df = df.drop(['参考月供'], axis=1)
    return df

def data_pca(df):
    y = df['房屋单价']
    X = df[['建造年代', '建筑面积', '所在位置']].copy()
    # label-encode the district names into integers
    X[['所在位置']] = X[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
    # bin the unit price into three groups
    bins = [0, 20000, 50000, 100000]
    labels = [1, 2, 3]
    y = pd.cut(y, bins, right=True, labels=labels)
    pca = dp.PCA(n_components=2)   # project the features down to 2 components
    train_X = pca.fit_transform(X)
    x_ = []
    y_ = []
    for i in range(2, 0, -1):      # note: only price groups 2 and 1 get plotted
        for j in range(len(train_X)):
            if y.iloc[j] == i:
                x_.append(train_X[j][0])
                y_.append(train_X[j][1])
        plt.scatter(x_, y_)
        x_.clear()
        y_.clear()
    plt.show()

def main():
    df = get_data('hours_data')
    data = data_revision(df)
    data_pca(data)

main()
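One caveat about the PCA above: it runs on raw features, and 建造年代 (around 2000), 建筑面积 (around 100) and the label-encoded 所在位置 (single digits) sit on very different scales, so the first component is dominated by whichever column has the largest variance. The usual fix is to standardize first; a minimal sketch:

    # Same 2-component projection, but on standardized features,
    # so no single column dominates the principal components.
    import sklearn.decomposition as dp
    from sklearn.preprocessing import StandardScaler

    def pca_2d_scaled(X):
        X_std = StandardScaler().fit_transform(X)  # zero mean, unit variance per column
        return dp.PCA(n_components=2).fit_transform(X_std)

Swapping train_X = pca.fit_transform(X) in data_pca for train_X = pca_2d_scaled(X) is enough to see the effect.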

Result images

The dataset contains 30,000+ records; a few of the resulting plots are kept for the record.


