

House Price Data Scraping and Analysis



  • 1. Crawling the URLs
  • 2. Scraping listing details from the URLs
  • 3. Data processing
  • 4. Correlation analysis
  • 5. Principal component analysis
  • Result images

The code is uploaded here just for the record; there is plenty of room for improvement.

1. Crawling the URLs

Crawl all the search URLs:

1. Get the district links.
2. From each district link, get the road links.
3. If a road link has more than 5 result pages, drill down further by price band (the code also supports drilling down by floor area and layout).

Every URL collected is stored with a hierarchical level number:

Level no.   Category     URL
1           city         URL
1-1         district     URL 1
1-1-1       road         URL 2
1-1-1-1     price band   URL 3
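The scripts below persist these URLs to MySQL (tables hours_url_6 / hours_url_8 / hours_url_9), but the post never shows the schema. pandas' to_sql creates an equivalent table automatically on the first append; still, a sketch of the resulting layout helps read the code. The column names match what save_mysql writes; the types and lengths are assumptions:

    # Hypothetical layout of the URL tables -- to_sql would create something
    # equivalent automatically on the first append. Types/lengths are guesses.
    import pymysql

    ddl = """
    CREATE TABLE IF NOT EXISTS hours_url_8 (
        name VARCHAR(64),    -- link text (district / road / price-band label)
        url  VARCHAR(255),   -- the search URL itself
        num  VARCHAR(32),    -- hierarchical level number such as '1-1-1'
        page INT             -- number of result pages behind this URL
    )
    """

    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           database='gethtml', charset='utf8')
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()
    conn.close()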

import requests
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings()  # suppress warnings for unverified HTTPS requests
from sqlalchemy import create_engine
import pandas as pd

def gethtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=3, verify=False)
        r.raise_for_status()
        r.encoding = 'utf8'
        return r.text
    except Exception as e:
        print('error of ', e.args)
        return ''

def get_url(url, list_url, M):
    """Collect the level-M filter links from a search page into list_url."""
    html = gethtml(url[1])
    demo = BeautifulSoup(html, 'html.parser')
    div = demo.find_all('div', attrs={'class': 'items'})
    hf = []
    try:
        if M == 1:
            hf = div[0].find_all('a')  # district links
        elif M == 2:
            hf = div[0].find('div', attrs={'class': 'sub-items'}).find_all('a')  # road links
        elif M == 3:
            hf = div[1].find_all('a')  # price bands
        elif M == 4:
            hf = div[2].find_all('a')  # floor-area bands
        elif M == 5:
            hf = div[3].find_all('a')  # layouts
    except Exception as e:
        print('error', e.args)
    # work out how many result pages sit behind this URL
    page = 999
    try:
        page = len(demo.find('div', attrs={'class': 'sale-left'})
                       .find('div', attrs={'class': 'multi-page'})
                       .find_all(['a', 'i'])) - 2
    except:
        try:
            # no pager: count the listings shown after filtering
            num = len(demo.find('div', attrs={'class': 'sale-left'})
                          .find('ul', attrs={'id': 'houselist-mod-new'})
                          .find_all('a'))
            page = 0 if num == 0 else 1
        except:
            page = 0
    if page > 0:
        try:
            count = 0
            for i in hf:
                count = count + 1
                list_url[i.attrs['href']] = (i.text, i.attrs['href'], url[2] + '-' + str(count), page)
        except Exception as e:
            print('error', e.args)

def save_mysql(data, table_url='hours_url_6'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create connection
    df = pd.DataFrame(data).T
    df.columns = ['name', 'url', 'num', 'page']
    pd.io.sql.to_sql(df, table_url, con=conn, if_exists='append', index=False)  # if_exists: 'replace' or 'append'
    conn.dispose()

def main():
    # filter levels on the page: 区域 (district), 售价 (price), 面积 (area), 房型 (layout)
    # list_url = {url: (title, url, url_no, page)}
    url = 'https://xm.anjuke.com/sale/'
    url_no = '1'  # hierarchical level number of the URL
    ls_1 = {}
    list_url = {}
    get_url(['城市', url, url_no], list_url, 1)  # level 1: districts
    save_mysql(list_url)
    ls = list_url.copy()
    for depth in range(2, 6):
        list_url.clear()
        ls_1.clear()
        count = 0
        for i in ls:
            count += 1
            url = ls[i]
            get_url(url, list_url, depth)
            if len(list_url) > 0:
                # a get_id flag column is added to the stored URLs later (see part 2)
                save_mysql(list_url, 'hours_url_8')
                # only URLs with more than 5 result pages need drilling down
                for k in list_url:
                    if list_url[k][3] > 5:
                        ls_1[k] = list_url[k]
                list_url.clear()
            print('\r', 'depth: %s, progress: %s/%s' % (depth, count, len(ls)), end='')
        print('\r', 'depth: %s, URLs to drill into: %s' % (depth, len(ls_1)))
        ls = ls_1.copy()

def main_1():
    # resume run: start from a fixed set of district URLs
    list_url = {
        # 'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
        # 'https://xm.anjuke.com/sale/huli/': ('湖里', 'https://xm.anjuke.com/sale/huli/', '1-2', 8),
        # 'https://xm.anjuke.com/sale/jimei/': ('集美', 'https://xm.anjuke.com/sale/jimei/', '1-3', 8),
        # 'https://xm.anjuke.com/sale/haicang/': ('海沧', 'https://xm.anjuke.com/sale/haicang/', '1-4', 8),
        # 'https://xm.anjuke.com/sale/tongana/': ('同安', 'https://xm.anjuke.com/sale/tongana/', '1-5', 8),
        'https://xm.anjuke.com/sale/xiangana/': ('翔安', 'https://xm.anjuke.com/sale/xiangana/', '1-6', 8),
        'https://xm.anjuke.com/sale/xiamenzhoubian/': ('厦门周边', 'https://xm.anjuke.com/sale/xiamenzhoubian/', '1-7', 8)}
    ls = list_url.copy()
    ls_1 = {}
    list_url.clear()
    for depth in range(2, 6):
        list_url.clear()
        ls_1.clear()
        count = 0
        for i in ls:
            count += 1
            url = ls[i]
            get_url(url, list_url, depth)
            if len(list_url) > 0:
                save_mysql(list_url, 'hours_url_9')
                for k in list_url:
                    if list_url[k][3] > 5:
                        ls_1[k] = list_url[k]
                list_url.clear()
            print('\r', 'depth: %s, progress: %s/%s' % (depth, count, len(ls)), end='')
        print('\r', 'depth: %s, URLs to drill into: %s' % (depth, len(ls_1)))
        ls = ls_1.copy()
main()

2. Scraping listing details from the URLs

Fetch each listing's details from its ID, checking first whether that ID has already been scraped. One gap worth flagging: part 1 stores search-result URLs, while the script below reads listing IDs from a table called hours_url_id that the post never populates; a hedged sketch of that missing bridge follows.
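A minimal sketch of the missing ID-harvesting step, assuming listing links have the form https://xm.anjuke.com/prop/view/<ID> and that hours_url_id has a unique ID column plus a get_id flag (both names match what the script below reads; the regex and table constraints are guesses):

    # Hypothetical bridge between part 1 and part 2: walk a saved search URL,
    # pull the listing IDs out of the result page, and store them with a
    # NULL get_id flag so part 2 will pick them up.
    import re
    import pymysql
    import requests
    from bs4 import BeautifulSoup

    def harvest_ids(search_url):
        headers = {'User-Agent': 'Mozilla/5.0'}
        html = requests.get(search_url, headers=headers, timeout=3).text
        soup = BeautifulSoup(html, 'html.parser')
        ids = set()
        for a in soup.find_all('a', href=True):
            # listing pages are assumed to look like /prop/view/<ID>
            m = re.search(r'/prop/view/([A-Za-z0-9]+)', a['href'])
            if m:
                ids.add(m.group(1))
        return ids

    def save_ids(ids):
        conn = pymysql.connect(host='localhost', user='root', password='root123',
                               database='gethtml', charset='utf8')
        with conn.cursor() as cur:
            # INSERT IGNORE assumes ID is declared UNIQUE/PRIMARY KEY
            cur.executemany(
                'INSERT IGNORE INTO hours_url_id (ID, get_id) VALUES (%s, NULL)',
                [(i,) for i in ids])
        conn.commit()
        conn.close()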

import pandas as pd
import pymysql
from bs4 import BeautifulSoup
import requests
import re
from sqlalchemy import create_engine
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_url(url):
    req = None
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
        req = requests.get(url, headers=headers, timeout=3, verify=False)
        req.raise_for_status()
        req.encoding = 'utf-8'
        return req.text
    except Exception as e:
        error_no = req.status_code if req is not None else 0
        if error_no == 404 or error_no == 503:
            return error_no
        else:
            print('error', e.args)
            return 1

def html_demo(html, list_data):
    pat = re.compile(r'[\:]')
    demo = BeautifulSoup(html, 'html.parser')
    # listing attributes: each houseInfo-detail-item holds a name/value pair
    li = demo.find_all('li', attrs={'class': 'houseInfo-detail-item'})
    for i in li:
        div = i.find_all('div')
        count = 1
        for j in div:
            if count == 1:
                a = ''.join(pat.sub('', j.text.strip()).split())
            elif count == 2:
                b = ''.join(pat.sub('', j.text.strip()).split())
                list_data[a] = b
            else:
                continue
            count += 1
    # agent info
    b_info = demo.find('div', attrs={'class': 'broker-wrap'})
    brokercard_name = b_info.find('div', attrs={'class': 'brokercard-name'}).text.split()[0]
    list_data['销售'] = brokercard_name
    # ratings (label/value pairs in alternating <em> tags)
    em = b_info.find_all('em')
    for i in range(3):
        list_data[pat.sub('', em[i*2].text)] = em[i*2+1].text
    # agency name
    try:
        gs_name = b_info.find('a', attrs={'class': 'text-overflow'}).attrs['title'].split(':')
        list_data[gs_name[0]] = gs_name[1]
    except:
        pass

def save_mysql(df, database='hours_data'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create connection
    pd.io.sql.to_sql(df, database, con=conn, if_exists='append', index=False)  # if_exists: 'replace' or 'append'
    conn.dispose()

def update_id(id_num, database='hours_url_id'):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           charset='utf8', database='gethtml')
    cursor = conn.cursor()
    sql = "update %s set get_id=1 where ID='%s'" % (database, id_num)
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()

def get_id(id_table):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           database='gethtml', charset='utf8')
    cursor = conn.cursor()
    sql = 'select distinct ID from %s where get_id is null' % id_table
    cursor.execute(sql)
    data_id = cursor.fetchall()
    cursor.close()
    conn.close()
    return data_id

def data_exists(table_name, url_no):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           charset='utf8', database='gethtml')
    cursor = conn.cursor()
    sql = 'select ID from %s where ID="%s"' % (table_name, url_no)
    # pymysql's execute() returns the number of matched rows for a SELECT
    count = cursor.execute(sql)
    cursor.close()
    conn.close()
    return count

def main():
    data = get_id('hours_url_id')
    start_url = 'https://xm.anjuke.com/prop/view/'
    count = 0
    n = len(data)
    for id in data:
        count += 1
        print('\r', 'progress: %s/%s' % (count, n), end='')
        # skip IDs whose details are already stored in hours_data
        count_id = data_exists('hours_data', id[0])
        if count_id >= 1:
            update_id(id[0])
            continue
        try:
            list_data = {}  # fresh dict per listing so fields don't leak between rows
            url = start_url + id[0]
            html = get_url(url)
            if html == 1:       # other errors: skip
                continue
            elif html == 404:   # listing gone: drop it
                pass
            elif html == 503:
                print('page requires verification: 503')
                break
            else:
                html_demo(html, list_data)
                list_data['ID'] = id[0]
                df = pd.DataFrame(list_data, index=id)
                save_mysql(df)
                update_id(id[0])
        except Exception as e:
            print('error', url, e.args)
            continue

main()

3. Data processing

Cleaning and numeric encoding of the scraped hours_data table.

import numpy as np
import pandas as pd
import pymysql

def get_hours_data():
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           database='gethtml', charset='utf8')
    sql = 'select * from %s ' % 'hours_data'
    df = pd.read_sql(sql, conn, index_col='ID')
    conn.close()
    return df

def hours_data_extract(df):
    df.drop_duplicates(keep='first', inplace=True)
    df = df[~df.index.duplicated()]
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 建筑面积')
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 房屋单价')
    # repair bad rows: drop floors recorded as totals ('共…层') or basements ('地…')
    df = df[(df.所在楼层.str[:1] != '共') & (df.所在楼层.str[:1] != '地')]
    # df.所在楼层 = df.所在楼层.str.replace(['共90层','共74层'], ['共9层','共7层'])
    # convert columns to numeric values / categorical codes
    df.建筑面积 = df.建筑面积.str[:-3].astype(float)
    df.房屋单价 = df.房屋单价.str[:-4].astype(int)
    df.建造年代 = df.建造年代.str[:-1].astype(int)
    df.所在位置 = df.所在位置.str.split('-').str[0].replace(['思明','湖里','集美','海沧','同安','翔安','厦门周边'], [1,2,3,4,5,6,7])
    df.房屋类型 = df.房屋类型.replace(['普通住宅','平房','公寓','暂无'], [1,2,3,4])
    df[['室', '厅', '卫']] = df['房屋户型'].str.extract(r'(\d+)室(\d+)厅(\d+)卫', expand=True).astype(int)
    df['均面积'] = df['建筑面积'] / (df['室'] + df['厅'])  # area per room
    df['楼高'] = df.loc[:, '所在楼层'].str[4:].str[:-2].astype(int)
    df.楼高 = df.楼高.replace([90, 74], [9, 7])  # fix two known typos in the source data
    df['层级'] = df.loc[:, '所在楼层'].str[:2].replace(['低层','中层','高层'], [1,2,3]).astype(int)
    df.装修程度 = df.装修程度.replace(['精装修','简单装修','豪华装修','毛坯'], [1,2,3,4])
    df.产权年限 = df.产权年限.replace(['70年','50年','40年'], [1,2,3])
    df.配套电梯 = df.配套电梯.replace(['无','有'], [0,1])
    df.房本年限 = df.房本年限.replace(['满五年','满二年','不满二年','暂无'], [1,2,3,4])
    df.产权性质 = df.产权性质.replace(['商品房住宅','商住两用','单位集体自建房','使用权','保障性住房','动迁配套房','其他'], [1,2,3,4,5,6,7])
    df.唯一住房 = df.唯一住房.replace(['是','否','暂无'], [1,0,0])
    df.房屋朝向 = df.房屋朝向.replace(['东','南','西','北','东西','南北','东南','东北','西南','西北'], [1,2,3,4,5,6,7,8,9,10])
    df['price_group'] = df.房屋单价 // 10000  # 10k-yuan price band
    # drop columns that are no longer needed
    df = df.drop(['所属小区','房屋户型','参考月供','所在楼层','参考首付','销售','房源','服务','评价','工商注册名称','一手房源'], axis=1)
    return df

def getdata():
    df = get_hours_data()
    data = hours_data_extract(df)
    return data
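A quick way to sanity-check the module above (saved as hoursdata.py, which the next section imports); a minimal usage sketch, with output depending on whatever survives the drops in hours_data_extract:

    # Load the cleaned frame and eyeball the conversions.
    import hoursdata

    df = hoursdata.getdata()
    print(df.shape)    # rows surviving deduplication and repair
    print(df.dtypes)   # the feature columns should all be numeric by now
    print(df[['建筑面积', '房屋单价', '均面积']].describe())
    print(df['price_group'].value_counts().sort_index())  # listings per 10k-yuan band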

4. Correlation analysis

Explore pairwise correlations in the cleaned data, after trimming outliers in per-room area.

import hoursdata
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

df = hoursdata.getdata()
# keep ordinary commodity housing only (产权性质=1, 房屋类型=1 after encoding)
df = df[(df.产权性质 == 1) & (df.房屋类型 == 1)]
# df[['所在位置']] = df[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
# df[['房屋朝向']] = df[['房屋朝向']].apply(preprocessing.LabelEncoder().fit_transform)

print(df[['均面积', '建筑面积']].describe())
# keep the middle 80% of per-room area to drop outliers
df_des = df.均面积.describe([.10, .90])
df_1 = df[(df['均面积'] >= df_des['10%']) & (df['均面积'] <= df_des['90%'])].copy()
print('****************\n', df_1[['均面积', '建筑面积']].describe())

plt.hist(df_1.建筑面积, bins=np.linspace(0, 500, 100))
plt.hist(df_1.均面积, bins=np.linspace(0, 100, 100))
plt.show()

plt.scatter(df_1.建筑面积, df_1.房屋单价)
plt.xlabel('面积', fontproperties='SimHei')
plt.ylabel('单价', fontproperties='SimHei')
plt.title('价位', fontproperties='SimHei')
plt.show()

plt.scatter(df_1.建筑面积, df_1.卫)
plt.xlabel('面积', fontproperties='SimHei')
plt.ylabel('卫个数', fontproperties='SimHei')
plt.show()

# one pair first, then the full correlation matrix
a = df_1.房屋单价.corr(df_1.建筑面积)
print(a)
cor = df_1.corr()
# keep only moderate-or-stronger correlations (|r| > 0.3) and drop the diagonal
cor = cor[abs(cor) > 0.3]
cor = cor.replace(1.0, None).dropna(axis=0, how='all').dropna(axis=1, how='all')
print(cor[abs(cor) > .30])

plt.scatter(df.建造年代, df.楼高)
plt.ylabel('高度', fontproperties='SimHei')
plt.title('年代与高度', fontproperties='SimHei')
plt.xlim(1970, 2020)
plt.show()

plt.hist(df.建造年代, bins=np.linspace(1970, 2020))
plt.ylabel('在售数量', fontproperties='SimHei')
plt.title('年份出售量', fontproperties='SimHei')
plt.show()
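A side note on the plotting: instead of passing fontproperties='SimHei' to every label, matplotlib can be pointed at a CJK font once via rcParams (assuming SimHei is installed on the machine). A sketch, reusing df_1 from the script above:

    import matplotlib.pyplot as plt

    # Register a CJK-capable font globally instead of per-call fontproperties.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # assumes the SimHei font is installed
    plt.rcParams['axes.unicode_minus'] = False    # keep minus signs renderable with a CJK font

    plt.scatter(df_1.建筑面积, df_1.房屋单价)  # df_1 from the script above
    plt.xlabel('面积')
    plt.ylabel('单价')
    plt.title('价位')
    plt.show()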

5. Principal component analysis

PCA projects the features down to two dimensions for a visual check of how the price bands separate.

import pandas as pd
import pymysql
import numpy as np
import sklearn.decomposition as dp
import matplotlib.pyplot as plt
from sklearn import preprocessing

def get_data(data_table):
    conn = pymysql.connect(host='localhost', user='root', password='root123',
                           charset='utf8', database='gethtml')
    sql = "select * from %s where 产权性质='商品房住宅'" % data_table
    df = pd.read_sql(sql, conn, index_col='ID')
    conn.close()
    return df

def data_revision(df):
    df = df.drop_duplicates(keep='first')  # drop duplicated rows
    df = df[~df.index.duplicated()]        # drop duplicated IDs
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 建筑面积')
    df['建筑面积'] = df['建筑面积'].str[:-3].astype(float)
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count() > 1:
        print('inconsistent units found in 房屋单价')
    df['房屋单价'] = df['房屋单价'].str[:-4].astype(int)
    df['建造年代'] = df['建造年代'].str[:-1].astype(int)
    df['所在位置'] = df['所在位置'].str.split('-').str[0]
    df = df.drop(['参考月供'], axis=1)
    return df

def data_pca(df):
    y = df['房屋单价']
    X = df[['建造年代', '建筑面积', '所在位置']].copy()
    # label-encode the district names into integers
    X[['所在位置']] = X[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
    # bin the unit price into three groups
    bins = [0, 20000, 50000, 100000]
    labels = [1, 2, 3]
    y = pd.cut(y, bins, right=True, labels=labels)
    pca = dp.PCA(n_components=2)   # project the features down to 2 components
    train_X = pca.fit_transform(X)
    x_ = []
    y_ = []
    for i in range(2, 0, -1):      # note: only price groups 2 and 1 get plotted
        for j in range(len(train_X)):
            if y.iloc[j] == i:
                x_.append(train_X[j][0])
                y_.append(train_X[j][1])
        plt.scatter(x_, y_)
        x_.clear()
        y_.clear()
    plt.show()

def main():
    df = get_data('hours_data')
    data = data_revision(df)
    data_pca(data)

main()
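One caveat about the PCA above: it runs on raw features, and 建造年代 (around 2000), 建筑面积 (around 100) and the label-encoded 所在位置 (single digits) sit on very different scales, so the first component is dominated by whichever column has the largest variance. The usual fix is to standardize first; a minimal sketch:

    # Same 2-component projection, but on standardized features,
    # so no single column dominates the principal components.
    import sklearn.decomposition as dp
    from sklearn.preprocessing import StandardScaler

    def pca_2d_scaled(X):
        X_std = StandardScaler().fit_transform(X)  # zero mean, unit variance per column
        return dp.PCA(n_components=2).fit_transform(X_std)

Swapping train_X = pca.fit_transform(X) in data_pca for train_X = pca_2d_scaled(X) is enough to see the effect.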

Result images

The dataset contains 30,000+ records; a few of the resulting plots are kept for the record.


