php抓取统计局区划代码,抓取国家统计局的代码和名称,爬取,区域,划分,及
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
獲取一級代碼、名稱、下一級鏈接
通過設置參數originUrl來調整爬取的年份
def getOneLevelCodeName(originUrl='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html', timeout=30):
    """Fetch the first-level (province) entries from the index page.

    Args:
        originUrl: URL of the index page to scrape. Point it at another
            year's index page to scrape that year instead.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        A list of ``(code, name, next_level_url)`` tuples, one per ``<a>``
        found inside elements with class ``provincetr``. ``code`` is the
        first two characters of the link's href followed by ten zeros,
        ``name`` is the link text prefixed with ``'中國-'``, and
        ``next_level_url`` is the href resolved against ``originUrl``.
    """
    web = requests.get(originUrl, headers=headers, timeout=timeout)  # download the page
    web.encoding = web.apparent_encoding  # decode using the detected charset
    soup = BeautifulSoup(web.text, 'html.parser')

    oneLevelWebUrl = []
    for row in soup.select('.provincetr'):
        for province in row.select('a'):
            href = province['href']
            oneLevelWebUrl.append((
                href[0:2] + '0000000000',
                '中國-' + province.text,
                # Resolve the relative href against the page URL rather than
                # slicing a fixed number of characters off originUrl, which
                # only worked for URLs of exactly the default length.
                urljoin(originUrl, href),
            ))
    return oneLevelWebUrl
獲取二級代碼、名稱、下一級鏈接
#根據一級鏈接,獲取下一級
def getSecodLevelCodeName(proLevelName=None, url='None', timeout=30):
    """Fetch the second-level entries listed on a first-level page.

    Args:
        proLevelName: Full name of the parent entry; used as the prefix of
            every returned name.
        url: URL of the parent's page. The string ``'None'`` means "no page".
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        ``None`` when ``proLevelName`` is ``None`` or ``url`` is ``'None'``.
        Otherwise a list of ``(code, name, next_level_url)`` tuples, one per
        element with class ``citytr``: ``code`` is the first 12 characters of
        the row text, ``name`` is ``proLevelName + '-' +`` the remaining text,
        and ``next_level_url`` is the row's first link resolved against ``url``.
    """
    if proLevelName is None or url == 'None':
        return None

    web = requests.get(url, headers=headers, timeout=timeout)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.citytr'):
        code, name = tag.text[0:12], tag.text[12:]
        # A generic "municipal district" row gets the parent's own short name
        # prepended. Both the Simplified and the Traditional spelling are
        # accepted. NOTE(review): the source page is presumably Simplified
        # Chinese, in which case the original Traditional-only comparison
        # could never match -- confirm against the live page.
        if name in ('市辖区', '市轄區'):
            name = proLevelName.split('-')[-1] + name
        # urljoin replaces a fixed-width slice of `url` that only worked for
        # URLs of exactly the default length.
        link = urljoin(url, tag.select('a')[0]['href'])
        retList.append((code, proLevelName + '-' + name, link))
    return retList
獲取三級代碼、名稱、下一級鏈接
#根據二級鏈接,獲取下一級
def getThirdLevelCodeName(proLevelName=None, url='None', timeout=30):
    """Fetch the third-level entries listed on a second-level page.

    Args:
        proLevelName: Full name of the parent entry; used as the prefix of
            every returned name.
        url: URL of the parent's page. The string ``'None'`` means "no page".
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        ``None`` when ``proLevelName`` is ``None`` or ``url`` is ``'None'``.
        Otherwise a list of ``(code, name, next_level_url)`` tuples, one per
        element with class ``countytr``. Rows that carry a link get
        ``proLevelName + '-' + row_name`` and the link resolved against
        ``url``. Rows without a link get the parent's short name prepended to
        ``row_name`` and the string ``'None'`` as their URL.
    """
    if proLevelName is None or url == 'None':
        return None

    web = requests.get(url, headers=headers, timeout=timeout)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.countytr'):
        code, name = tag.text[0:12], tag.text[12:]
        try:
            href = tag.select('a')[0]['href']
        except (IndexError, KeyError):
            # The row has no usable link. Only these two lookup failures are
            # caught, so unrelated errors are no longer silently swallowed.
            retList.append((code, proLevelName + '-' + proLevelName.split('-')[-1] + name, 'None'))
        else:
            # urljoin replaces a fixed-width slice of `url` that only worked
            # for URLs of exactly the default length.
            retList.append((code, proLevelName + '-' + name, urljoin(url, href)))
    return retList
獲取四級代碼、名稱、下一級鏈接
#根據三級鏈接,獲取下一級
def getFourthLevelCodeName(proLevelName=None, url='None', timeout=30):
    """Fetch the fourth-level entries listed on a third-level page.

    Args:
        proLevelName: Full name of the parent entry; used as the prefix of
            every returned name.
        url: URL of the parent's page. The string ``'None'`` means "no page".
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        ``None`` when ``proLevelName`` is ``None`` or ``url`` is ``'None'``.
        Otherwise a list of ``(code, name, next_level_url)`` tuples, one per
        element with class ``towntr``: ``code`` is the first 12 characters of
        the row text, ``name`` is ``proLevelName + '-' +`` the remaining text,
        and ``next_level_url`` is the row's first link resolved against ``url``.

    Raises:
        IndexError: If a ``towntr`` row contains no ``<a>`` element.
    """
    if proLevelName is None or url == 'None':
        return None

    web = requests.get(url, headers=headers, timeout=timeout)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.towntr'):
        # urljoin replaces a fixed-width slice of `url` that only worked for
        # URLs of exactly the default length.
        link = urljoin(url, tag.select('a')[0]['href'])
        retList.append((tag.text[0:12], proLevelName + '-' + tag.text[12:], link))
    return retList
轉為DataFrame,輸出excel文件
# Turn the scraped rows of each level into DataFrames and stack them into one
# table. `oneLevel` is used as a single list of rows; `secondLevel`,
# `thirdLevel` and `fourthLevel` are iterated and each element becomes its own
# DataFrame before concatenation.
# The bare names below are expression statements with no effect in a script;
# presumably they are notebook-cell leftovers used to display each frame.
pd_oneLevel = pd.DataFrame(oneLevel)
pd_oneLevel

pd_secondLevel = pd.concat(list(map(pd.DataFrame, secondLevel)))
pd_secondLevel

pd_thirdLevel = pd.concat(list(map(pd.DataFrame, thirdLevel)))
pd_thirdLevel

pd_fourthLevel = pd.concat(list(map(pd.DataFrame, fourthLevel)))
pd_fourthLevel

# Stack the four levels in order and renumber the index from zero.
pd_allLevel = pd.concat(
    [
        pd_oneLevel,
        pd_secondLevel,
        pd_thirdLevel,
        pd_fourthLevel,
    ],
    ignore_index=True,
)
pd_allLevel.columns = ['區劃代碼', '名稱', '下一級網址']
pd_allLevel
保存到當前目錄
import os

# Write the combined table to an Excel file in the current working directory,
# without the DataFrame index column.
# The original built the directory by resolving the literal string '__file__'
# and slicing off its last eight characters; os.getcwd() states that directly.
# NOTE(review): the file name says 2018 while getOneLevelCodeName defaults to
# the 2019 index page -- confirm which year is intended.
pd_allLevel.to_excel(os.path.join(os.getcwd(), '2018區劃代碼及名稱.xlsx'), index=False)
創作挑戰賽新人創作獎勵來咯,堅持創作打卡瓜分現金大獎
總結
以上是生活随笔為你收集整理的php抓取统计局区划代码,抓取国家统计局的代码和名称,爬取,区域,划分,及的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 怎么查询开户行
- 下一篇: php ip处理函数,PHP取ip地址函