Python爬虫实战糗事百科实例
爬取糗事百科段子,假設(shè)頁(yè)面的URL是 http://www.qiushibaike.com/8hr/page/1
要求:
-
使用requests獲取頁(yè)面信息,用XPath / re 做數(shù)據(jù)提取
-
獲取每個(gè)帖子里的
用戶(hù)頭像鏈接、用戶(hù)姓名、段子內(nèi)容、點(diǎn)贊次數(shù)和評(píng)論次數(shù) -
保存到 json 文件內(nèi)
參考代碼
# qiushibaike.py
# Single-process qiushibaike spider: fetch one listing page and use XPath to
# extract, for each post, the avatar URL, username, joke text, vote count and
# comment count. Rewritten from the original Python 2 tutorial code
# (print statements, `except Exception, e`, `.encode('utf-8')`) to runnable
# Python 3.

# Desktop browser headers so the site serves the normal (non-mobile) page.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'),
    'Accept-Language': 'zh-CN,zh;q=0.8',
}


def build_url(page):
    """Return the listing URL for 1-based page number *page*."""
    return 'http://www.qiushibaike.com/8hr/page/' + str(page)


def parse_posts(html_text):
    """Parse a listing page's HTML and return a list of post dicts.

    Each dict has keys: imgUrl, username, content, vote, comments.
    The XPath expressions match the page layout this tutorial targets; a post
    missing one of the fields raises IndexError, handled by the caller.

    :param html_text: raw HTML text of one listing page
    :return: list of dicts, one per post
    """
    from lxml import etree  # local import: module stays importable without lxml

    html = etree.HTML(html_text)
    posts = []
    for site in html.xpath('//div[contains(@id,"qiushi_tag")]'):
        posts.append({
            'imgUrl': site.xpath('./div/a/img/@src')[0],
            'username': site.xpath('./div/a/@title')[0],
            'content': site.xpath('.//div[@class="content"]/span')[0].text.strip(),
            'vote': site.xpath('.//i')[0].text,      # vote count
            'comments': site.xpath('.//i')[1].text,  # comment count
        })
    return posts


def main(page=1):
    """Fetch one listing page and print every extracted post."""
    import requests  # local import: module stays importable without requests

    try:
        response = requests.get(build_url(page), headers=HEADERS)
        for post in parse_posts(response.text):
            print(post['imgUrl'], post['username'], post['content'],
                  post['vote'], post['comments'])
    except Exception as e:  # tutorial-style catch-all: report and carry on
        print(e)


if __name__ == '__main__':
    main()
演示效果
?
多線(xiàn)程糗事百科案例
案例要求參考上面糗事百科單進(jìn)程案例
Queue(隊(duì)列對(duì)象)
Queue是Python 2中的標(biāo)準(zhǔn)庫(kù)(Python 3 中改名為 queue),可以直接import Queue引用;隊(duì)列是線(xiàn)程間最常用的交換數(shù)據(jù)的形式
python下多線(xiàn)程的思考
對(duì)于資源,加鎖是個(gè)重要的環(huán)節(jié)。因?yàn)閜ython原生的list,dict等,都是not thread safe的。而Queue,是線(xiàn)程安全的,因此在滿(mǎn)足使用條件下,建議使用隊(duì)列
-
初始化: class Queue.Queue(maxsize) FIFO 先進(jìn)先出
-
包中的常用方法:
-
Queue.qsize() 返回隊(duì)列的大小
-
Queue.empty() 如果隊(duì)列為空,返回True,反之False
-
Queue.full() 如果隊(duì)列滿(mǎn)了,返回True,反之False
-
Queue.full 與 maxsize 大小對(duì)應(yīng)
-
Queue.get([block[, timeout]])獲取隊(duì)列,timeout等待時(shí)間
-
Queue.put(item[, block[, timeout]]) 寫(xiě)入隊(duì)列,timeout等待時(shí)間
-
創(chuàng)建一個(gè)“隊(duì)列”對(duì)象
- import Queue
- myqueue = Queue.Queue(maxsize = 10)
-
將一個(gè)值放入隊(duì)列中
- myqueue.put(10)
-
將一個(gè)值從隊(duì)列中取出
- myqueue.get()
多線(xiàn)程示意圖
?
# -*- coding:utf-8 -*-
"""Multi-threaded qiushibaike scraper.

Crawl threads (thread_crawl) take page numbers from a page queue, download
each listing page and push the raw HTML onto data_queue.  Parser threads
(Thread_Parser) take HTML off data_queue, extract the posts with XPath and
append them as JSON lines to an output file shared under a lock.

Rewritten from the original Python 2 tutorial code to Python 3:
print(), ``except ... as e``, the ``queue`` module, and text-mode JSON
output (the original wrote utf-8 *bytes* to a text file, which raises
TypeError on Python 3).
"""
import json
import threading
import time
from queue import Queue, Empty


class thread_crawl(threading.Thread):
    """Downloader thread: fetches listing pages until the page queue is empty."""

    def __init__(self, threadID, q):
        threading.Thread.__init__(self)
        self.threadID = threadID  # label used in log output
        self.q = q                # queue of page numbers to fetch

    def run(self):
        print("Starting " + self.threadID)
        self.qiushi_spider()
        print("Exiting ", self.threadID)

    def qiushi_spider(self):
        """Drain the page queue, pushing each page's raw HTML onto data_queue."""
        import requests  # local import: module stays importable without requests

        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/52.0.2743.116 Safari/537.36'),
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        while True:
            if self.q.empty():
                break
            page = self.q.get()
            print('qiushi_spider=', self.threadID, ',page=', str(page))
            url = 'http://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            # Retry a few times so one flaky request does not lose the page;
            # the bounded counter also prevents an infinite retry loop.
            attempts = 4
            while attempts > 0:
                attempts -= 1
                try:
                    content = requests.get(url, headers=headers)
                    data_queue.put(content.text)
                    break
                except Exception as e:
                    print('qiushi_spider', e)
            else:
                # while/else runs only when all retries were exhausted.
                # (The original `if timeout < 0` check could never fire:
                # the counter stopped at 0.)
                print('timeout', url)


class Thread_Parser(threading.Thread):
    """Parser thread: turns raw page HTML into JSON lines in the output file."""

    def __init__(self, threadID, queue, lock, f):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.queue = queue  # queue of raw HTML strings (data_queue)
        self.lock = lock    # guards the shared output file and `total`
        self.f = f          # shared output file, opened by main()

    def run(self):
        print('starting ', self.threadID)
        global total, exitFlag_Parser
        while not exitFlag_Parser:
            try:
                # Block with a short timeout instead of the original
                # busy-poll (`get(False)` + bare `except: pass`), so the
                # thread is idle while waiting yet still notices
                # exitFlag_Parser promptly.
                item = self.queue.get(True, 1)
            except Empty:
                continue
            self.parse_data(item)
            self.queue.task_done()
            print('Thread_Parser=', self.threadID, ',total=', total)
        print('Exiting ', self.threadID)

    def parse_data(self, item):
        """Extract posts from one page's HTML and append them as JSON lines.

        :param item: raw HTML text of one listing page
        """
        from lxml import etree  # local import: module stays importable without lxml

        global total
        try:
            html = etree.HTML(item)
            for site in html.xpath('//div[contains(@id,"qiushi_tag")]'):
                try:
                    imgUrl = site.xpath('.//img/@src')[0]
                    title = site.xpath('.//h2')[0].text
                    content = site.xpath('.//div[@class="content"]/span')[0].text.strip()
                    vote = None
                    comments = None
                    try:
                        vote = site.xpath('.//i')[0].text
                        comments = site.xpath('.//i')[1].text
                    except IndexError:
                        pass  # some posts have no vote/comment counters
                    result = {
                        'imgUrl': imgUrl,
                        'title': title,
                        'content': content,
                        'vote': vote,
                        'comments': comments,
                    }
                    with self.lock:
                        # Text-mode write; file opened with encoding='utf-8'.
                        self.f.write(json.dumps(result, ensure_ascii=False) + "\n")
                except Exception as e:
                    print('site in result', e)
        except Exception as e:
            print('parse_data', e)
        with self.lock:
            total += 1  # pages parsed (not individual posts)


data_queue = Queue()     # raw HTML pages awaiting parsing
exitFlag_Parser = False  # set True by main() once every page has been parsed
lock = threading.Lock()  # guards the output file and `total`
total = 0                # number of pages parsed so far


def main():
    global exitFlag_Parser
    # utf-8 text mode so ensure_ascii=False output round-trips correctly.
    output = open('qiushibaike.json', 'a', encoding='utf-8')

    # Seed page numbers 1-10.
    pageQueue = Queue(50)
    for page in range(1, 11):
        pageQueue.put(page)

    # Start the downloader threads.
    crawlthreads = []
    for threadID in ["crawl-1", "crawl-2", "crawl-3"]:
        thread = thread_crawl(threadID, pageQueue)
        thread.start()
        crawlthreads.append(thread)

    # Start the parser threads.
    parserthreads = []
    for threadID in ["parser-1", "parser-2", "parser-3"]:
        thread = Thread_Parser(threadID, data_queue, lock, output)
        thread.start()
        parserthreads.append(thread)

    # Crawlers exit on their own once pageQueue is drained; join() replaces
    # the original CPU-burning spin-wait on pageQueue.empty().
    for t in crawlthreads:
        t.join()

    # All pages are queued now; wait until each queued item has been parsed
    # (the parsers call task_done() per item), then tell the parsers to exit.
    data_queue.join()
    exitFlag_Parser = True
    for t in parserthreads:
        t.join()

    print("Exiting Main Thread")
    output.close()


if __name__ == '__main__':
    main()
?
?
總結(jié)
以上是生活随笔為你收集整理的Python爬虫实战糗事百科实例的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
- 上一篇: Python:数据提取之JSON与Jso
- 下一篇: Python:Selenium和Phan