Python爬取京东商品信息以及评论存进MySQL
生活随笔
收集整理的這篇文章主要介紹了
Python爬取京东商品信息以及评论存进MySQL
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
目錄
構建mysql數據表
第一版:
第二版 :
第三版:
總結:
構建mysql數據表
問題:使用SQL alchemy時,非主鍵不能設置為自增長,但是我想讓這個非主鍵僅僅是為了作為索引,autoincrement=True無效,該怎么實現讓它自增長呢?
"""SQLAlchemy models and session setup for the JD crawler (module ``jd_mysqldb``).

Known quirk (from the article): SQLAlchemy will not auto-increment a
non-primary-key column, so ``id`` is declared as part of a composite primary
key together with ``sku_id`` purely to obtain an auto-increment index column.
"""
from sqlalchemy import String, Integer, Text, Column
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from sqlalchemy.ext.declarative import declarative_base

# NOTE(review): credentials are hard-coded; move them to configuration for real use.
engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/jdcrawl?charset=utf8",
    pool_size=200,
    max_overflow=300,
    echo=False
)

BASE = declarative_base()  # declarative base class shared by both models


class Goods(BASE):
    """One crawled product row."""
    __tablename__ = 'goods'
    id = Column(Integer(), primary_key=True, autoincrement=True)           # surrogate auto-increment id
    sku_id = Column(String(200), primary_key=True, autoincrement=False)    # JD SKU, part of composite PK
    name = Column(String(200))
    price = Column(String(200))
    comments_num = Column(Integer)
    shop = Column(String(200))
    link = Column(String(200))


class Comments(BASE):
    """One crawled comment row, keyed by SKU plus an auto-increment id."""
    __tablename__ = 'comments'
    id = Column(Integer(), primary_key=True, autoincrement=True, nullable=False)
    sku_id = Column(String(200), primary_key=True, autoincrement=False)
    comments = Column(Text())


BASE.metadata.create_all(engine)   # create both tables on import
Session = sessionmaker(engine)
sess_db = scoped_session(Session)  # thread-local session used by the crawler scripts
# --- 第一版 (Version 1) follows in the article ---
問題:爬取幾頁評論后就會爬取到空白頁,添加refer后依舊如此
嘗試解決方法:將獲取評論地方的線程池改為單線程,并每獲取一頁評論增加延時1s
# NOTE: do not crawl too fast!!! JD starts serving empty comment pages otherwise.
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv, json, re
import threadpool
import time
from jd_mysqldb import Goods, Comments, sess_db

# Shared request headers. NOTE(review): this dict is mutated (Referer/Connection)
# from pooled worker threads below, which is a data race when several products
# are crawled concurrently -- consider a per-session copy; confirm before reuse.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num = 0           # total number of products stored
comments_num = 0  # total number of comments stored


# Fetch product info + SkuId from one search page
def getIndex(url):
    """Crawl one search-result page: store each product, then fan comment-page
    fetches out to a small thread pool."""
    session = requests.Session()
    session.headers = headers
    global num
    res = session.get(url, headers=headers)
    print(res.status_code)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    items = soup.select('li.gl-item')
    for item in items[:3]:  # limit to 3 products for testing
        title = item.select_one('.p-name a em').text.strip().replace(' ', '')
        price = item.select_one('.p-price strong').text.strip().replace('¥', '')
        try:
            shop = item.select_one('.p-shopnum a').text.strip()  # selector for book listings
        except Exception:  # narrowed from a bare except
            shop = item.select_one('.p-shop a').text.strip()     # selector for other listings
        link = parse.urljoin('https://', item.select_one('.p-img a').get('href'))
        SkuId = re.search(r'\d+', link).group()  # raw string: avoids invalid-escape warning
        comments_num = getCommentsNum(SkuId, session)
        print(SkuId, title, price, shop, link, comments_num)
        print("開始存入數據庫...")
        try:
            IntoGoods(SkuId, title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
        print("正在獲取評論...")
        # fetch page 0 only to read the total comment-page count
        url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        res2 = session.get(url1, headers=headers)
        res2.encoding = res2.apparent_encoding
        json_data = json.loads(res2.text)
        max_page = json_data['maxPage']  # empirically capped at 100 pages, 10 comments each
        args = []
        for i in range(0, max_page):
            # this endpoint returns plain JSON
            url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            # the jQuery-callback variant returns JSONP and would need unwrapping:
            # url2_2=f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            args.append(([session, SkuId, url2], None))
        pool2 = threadpool.ThreadPool(2)                     # 2 worker threads
        reque2 = threadpool.makeRequests(getComments, args)  # build the tasks
        for r in reque2:
            pool2.putRequest(r)                              # submit to the pool
        pool2.wait()


# Total comment count for one SKU
def getCommentsNum(SkuId, sess):
    """Return the total comment count for one SKU, or the string 'Error' on failure."""
    headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res = sess.get(url, headers=headers)
    try:
        res.encoding = res.apparent_encoding
        json_data = json.loads(res.text)  # JSON text -> dict
        num = json_data['CommentsCount'][0]['CommentCount']
        return num
    except Exception:  # narrowed from a bare except; still deliberately best-effort
        return 'Error'


# Fetch one page of comments
def getComments(sess, SkuId, url2):
    """Fetch one comment page and insert every comment into the database."""
    global comments_num
    print(url2)
    headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
    res2 = sess.get(url2, headers=headers)
    res2.encoding = 'gbk'
    json_data = res2.text
    '''
    # if url2_2 (the JSONP variant) were used, the payload must be extracted first:
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data = res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    try:
        comments = dict_data['comments']
        for item in comments:
            comment = item['content'].replace('\n', '')
            # print(comment)
            comments_num += 1
            try:
                IntoComments(SkuId, comment)
            except Exception as e:
                print(e)
                sess_db.rollback()
    except Exception:  # narrowed from a bare except; pages without 'comments' are skipped
        pass


# Store one product row
def IntoGoods(SkuId, title, price, shop, link, comments_num):
    """Insert one product; the caller performs rollback on failure."""
    goods_data = Goods(sku_id=SkuId, name=title, price=price,
                       comments_num=comments_num, shop=shop, link=link)
    sess_db.add(goods_data)
    sess_db.commit()


# Store one comment row
def IntoComments(SkuId, comment):
    """Insert one comment; the caller performs rollback on failure."""
    comments_data = Comments(sku_id=SkuId, comments=comment)
    sess_db.add(comments_data)
    sess_db.commit()


if __name__ == '__main__':
    start_time = time.time()
    urls = []
    KEYWORD = parse.quote(input("請輸入要查詢的關鍵詞:"))
    for i in range(1, 2):  # one search page for testing
        url = f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url, ], None))  # argument shape required by threadpool
    pool = threadpool.ThreadPool(2)               # pool with 2 threads
    reque = threadpool.makeRequests(getIndex, urls)
    for r in reque:
        pool.putRequest(r)                        # submit to the pool
    pool.wait()                                   # wait for all tasks to finish
    print("共獲取{}件商品,獲得{}條評論,耗時{}".format(num, comments_num, time.time() - start_time))
# --- 第二版 (Version 2) follows in the article ---
經測試,的確不會出現空白頁的情況
進一步優化:同時獲取2個以上商品的評論
# NOTE: do not crawl too fast!!! JD starts serving empty comment pages otherwise.
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv, json, re
import threadpool
import time
from jd_mysqldb import Goods, Comments, sess_db

# Shared request headers; Referer/Connection are rewritten per product below.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num = 0           # total number of products stored
comments_num = 0  # total number of comments stored


# Fetch product info + SkuId from one search page
def getIndex(url):
    """Crawl one search-result page; comments are fetched single-threaded with a
    1-second delay per page (this version avoids the blank-page problem)."""
    session = requests.Session()
    session.headers = headers
    global num
    res = session.get(url, headers=headers)
    print(res.status_code)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    items = soup.select('li.gl-item')
    for item in items[:2]:  # limit to 2 products for testing
        title = item.select_one('.p-name a em').text.strip().replace(' ', '')
        price = item.select_one('.p-price strong').text.strip().replace('¥', '')
        try:
            shop = item.select_one('.p-shopnum a').text.strip()  # selector for book listings
        except Exception:  # narrowed from a bare except
            shop = item.select_one('.p-shop a').text.strip()     # selector for other listings
        link = parse.urljoin('https://', item.select_one('.p-img a').get('href'))
        SkuId = re.search(r'\d+', link).group()  # raw string: avoids invalid-escape warning
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        comments_num = getCommentsNum(SkuId, session)
        print(SkuId, title, price, shop, link, comments_num)
        print("開始將商品存入數據庫...")
        try:
            IntoGoods(SkuId, title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
        print("正在獲取評論...")
        # fetch page 0 only to read the total comment-page count
        url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
        res2 = session.get(url1, headers=headers)
        res2.encoding = res2.apparent_encoding
        json_data = json.loads(res2.text)
        max_page = json_data['maxPage']  # empirically capped at 100 pages, 10 comments each
        print("{}評論共{}頁".format(SkuId, max_page))
        if max_page == 0:
            IntoComments(SkuId, '0')  # sentinel row for products with no comments
        else:
            for i in range(0, max_page):
                # this endpoint returns plain JSON
                url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
                # the jQuery-callback variant returns JSONP and would need unwrapping:
                # url2_2=f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
                print("開始獲取第{}頁評論:{}".format(i + 1, url2))
                getComments(session, SkuId, url2)
                time.sleep(1)  # throttle: one comment page per second


# Total comment count for one SKU
def getCommentsNum(SkuId, sess):
    """Return the total comment count for one SKU, or the string 'Error' on failure."""
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res = sess.get(url)
    try:
        res.encoding = res.apparent_encoding
        json_data = json.loads(res.text)  # JSON text -> dict
        num = json_data['CommentsCount'][0]['CommentCount']
        return num
    except Exception:  # narrowed from a bare except; still deliberately best-effort
        return 'Error'


# Fetch one page of comments
def getComments(sess, SkuId, url2):
    """Fetch one comment page and insert every comment into the database."""
    global comments_num
    res2 = sess.get(url2)
    res2.encoding = res2.apparent_encoding
    json_data = res2.text
    '''
    # if url2_2 (the JSONP variant) were used, the payload must be extracted first:
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data = res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    comments = dict_data['comments']
    for item in comments:
        comment = item['content'].replace('\n', '')
        # print(comment)
        comments_num += 1
        try:
            IntoComments(SkuId, comment)
        except Exception as e:
            print(e)
            sess_db.rollback()


# Store one product row
def IntoGoods(SkuId, title, price, shop, link, comments_num):
    """Insert one product; the caller performs rollback on failure."""
    goods_data = Goods(sku_id=SkuId, name=title, price=price,
                       comments_num=comments_num, shop=shop, link=link)
    sess_db.add(goods_data)
    sess_db.commit()


# Store one comment row
def IntoComments(SkuId, comment):
    """Insert one comment; the caller performs rollback on failure."""
    comments_data = Comments(sku_id=SkuId, comments=comment)
    sess_db.add(comments_data)
    sess_db.commit()


if __name__ == '__main__':
    start_time = time.time()
    urls = []
    KEYWORD = parse.quote(input("請輸入要查詢的關鍵詞:"))
    for i in range(1, 2):  # one search page for testing
        url = f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url, ], None))  # argument shape required by threadpool
    pool = threadpool.ThreadPool(2)               # pool with 2 threads
    reque = threadpool.makeRequests(getIndex, urls)
    for r in reque:
        pool.putRequest(r)                        # submit to the pool
    pool.wait()                                   # wait for all tasks to finish
    print("共獲取{}件商品,獲得{}條評論,耗時{}".format(num, comments_num, time.time() - start_time))
# --- 第三版 (Version 3) follows in the article ---
不行,又出現空白頁了!
# NOTE: do not crawl too fast!!! JD starts serving empty comment pages otherwise.
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv, json, re
import threadpool
import time
from jd_mysqldb import Goods, Comments, sess_db

# Shared request headers. NOTE(review): mutated per product and read from the
# 3-thread comment pool -- a data race; the article observes blank pages return
# in this version, which may be related. Confirm before reuse.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num = 0           # total number of products stored
comments_num = 0  # total number of comments stored


# Fetch product info + SkuId from one search page
def getIndex(url):
    """Crawl one search-result page; store products first, then fetch comments
    for up to 3 SKUs concurrently via a thread pool."""
    global num
    skuids = []  # (SkuId, session) task args for the comment pool
    session = requests.Session()
    session.headers = headers
    res = session.get(url, headers=headers)
    print(res.status_code)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    items = soup.select('li.gl-item')
    for item in items[:3]:  # limit to 3 products for testing
        title = item.select_one('.p-name a em').text.strip().replace(' ', '')
        price = item.select_one('.p-price strong').text.strip().replace('¥', '')
        try:
            shop = item.select_one('.p-shopnum a').text.strip()  # selector for book listings
        except Exception:  # narrowed from a bare except
            shop = item.select_one('.p-shop a').text.strip()     # selector for other listings
        link = parse.urljoin('https://', item.select_one('.p-img a').get('href'))
        SkuId = re.search(r'\d+', link).group()  # raw string: avoids invalid-escape warning
        skuids.append(([SkuId, session], None))
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        comments_num = getCommentsNum(SkuId, session)  # comment count
        print(SkuId, title, price, shop, link, comments_num)
        print("開始將商品存入數據庫...")
        try:
            IntoGoods(SkuId, title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
    print("開始獲取評論并存入數據庫...")
    pool2 = threadpool.ThreadPool(3)  # fetch comments for up to 3 SKUs at once
    task = threadpool.makeRequests(getComments, skuids)
    for r in task:
        pool2.putRequest(r)
    pool2.wait()


# Fetch all comment pages for one SKU
def getComments(SkuId, sess):
    """Walk every comment page of one SKU, single-threaded with a 1 s delay per page."""
    # fetch page 0 only to read the total comment-page count
    url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
    res2 = sess.get(url1, headers=headers)
    res2.encoding = res2.apparent_encoding
    json_data = json.loads(res2.text)
    max_page = json_data['maxPage']  # empirically capped at 100 pages, 10 comments each
    print("{}評論共{}頁".format(SkuId, max_page))
    if max_page == 0:
        IntoComments(SkuId, '0')  # sentinel row for products with no comments
    else:
        for i in range(0, max_page):
            # this endpoint returns plain JSON
            url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            # the jQuery-callback variant returns JSONP and would need unwrapping:
            # url2_2=f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            print("開始獲取第{}頁評論:{}".format(i + 1, url2))
            getComments_one(sess, SkuId, url2)
            time.sleep(1)  # throttle: one comment page per second


# Total comment count for one SKU
def getCommentsNum(SkuId, sess):
    """Return the total comment count for one SKU, or the string 'Error' on failure."""
    url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res = sess.get(url)
    try:
        res.encoding = res.apparent_encoding
        json_data = json.loads(res.text)  # JSON text -> dict
        num = json_data['CommentsCount'][0]['CommentCount']
        return num
    except Exception:  # narrowed from a bare except; still deliberately best-effort
        return 'Error'


# Fetch one page of comments
def getComments_one(sess, SkuId, url2):
    """Fetch one comment page and insert every comment into the database."""
    global comments_num
    res2 = sess.get(url2)
    res2.encoding = res2.apparent_encoding
    json_data = res2.text
    '''
    # if url2_2 (the JSONP variant) were used, the payload must be extracted first:
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data = res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    comments = dict_data['comments']
    for item in comments:
        comment = item['content'].replace('\n', '')
        # print(comment)
        comments_num += 1
        try:
            IntoComments(SkuId, comment)
        except Exception as e:
            print(e)
            print("rollback!")
            sess_db.rollback()


# Store one product row
def IntoGoods(SkuId, title, price, shop, link, comments_num):
    """Insert one product; the caller performs rollback on failure."""
    goods_data = Goods(sku_id=SkuId, name=title, price=price,
                       comments_num=comments_num, shop=shop, link=link)
    sess_db.add(goods_data)
    sess_db.commit()


# Store one comment row
def IntoComments(SkuId, comment):
    """Insert one comment; the caller performs rollback on failure."""
    comments_data = Comments(sku_id=SkuId, comments=comment)
    sess_db.add(comments_data)
    sess_db.commit()


if __name__ == '__main__':
    start_time = time.time()
    urls = []
    KEYWORD = parse.quote(input("請輸入要查詢的關鍵詞:"))
    for i in range(1, 2):  # one search page for testing
        url = f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url, ], None))  # argument shape required by threadpool
    pool = threadpool.ThreadPool(2)               # pool with 2 threads
    reque = threadpool.makeRequests(getIndex, urls)
    for r in reque:
        pool.putRequest(r)                        # submit to the pool
    pool.wait()                                   # wait for all tasks to finish
    print("共獲取{}件商品,獲得{}條評論,耗時{}".format(num, comments_num, time.time() - start_time))
# --- 總結 (Summary) follows in the article ---
京東的反爬有點強,如果不想爬取到空白頁,只能用單線程加延時一條一條的爬
?
總結
以上是生活随笔為你收集整理的Python爬取京东商品信息以及评论存进MySQL的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Python爬取房天下租房信息实战
- 下一篇: HTTP14种常见状态码详解——来自《h