Crawler Boot Camp, Day 1: My First Crawler!!!
```python
'''
After untold hardships, I finally finished writing my first crawler. Wahaha!
Scrapes the jokes from Qiushibaike's text section, looping over multiple pages.
'''
import urllib.request
import ssl
import re
import pickle  # imported but never used

weburl = "https://www.douban.com/"  # leftover test URL; never used below

# Set request headers so the site sees an ordinary browser
headers = {
    'Accept': 'text/html, application/xhtml+xml, */*',
    # 'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2545.400',
    'DNT': '1',
    'Connection': 'Keep-Alive',
    'Host': 'www.qiushibaike.com'
}


def jokeCrawker(url):
    # Create an unverified SSL context (certificate checks are skipped)
    context = ssl._create_unverified_context()
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req, context=context)
    # data = str(response.read())
    data = response.read().decode("utf-8")
    # '.' does not match '\n' unless you compile(pat, re.S) -- this cost me ages, unbelievable
    pat = '<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
    re_joke = re.compile(pat, re.S)
    jokeList = re_joke.findall(data)
    jokeDict = {}
    for div in jokeList:
        # name: who posted the joke
        pat = r'<h2>(.*?)</h2>'
        re_n = re.compile(pat, re.S)
        name = re_n.findall(div)[0]
        # words: the joke text itself
        pat = '<div class="content">\n<span>(.*?)</span>'
        re_w = re.compile(pat, re.S)
        words = re_w.findall(div)[0]
        # Strip runs of newlines, <br/>, and anything that is not a digit,
        # a Chinese character, or Chinese punctuation
        pat = '\\n{2,}|<br/>|[^\d\u4e00-\u9fa5(\u3002|\uff1f|\uff01|\uff0c|\u3001|' \
              '\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|' \
              '\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|' \
              '\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5)*]'
        word = re.sub(pat, '', words)
        jokeDict[name] = word
    with open("F:/糗事段子/file/qiushi.txt", "a+", encoding="utf-8") as fp:
        # Could also collect everything into one dict to detect duplicates and then
        # write it in one go with 'w', but that would use too much memory.
        # Could also index it later to check for overwrites (unlikely in the short term).
        for k, v in jokeDict.items():
            info = str(k + "說:" + v + "\n")
            fp.write(info)
    return True


for i in range(1, 11):  # page numbers start at 1; range(10) would request page 0
    webur2 = "https://www.qiushibaike.com/text/page/" + str(i) + "/"
    jokeCrawker(webur2)
```
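That `re.S` remark is worth demonstrating on its own, since it is such a common trap. A minimal standalone sketch (separate from the crawler) showing the difference:

```python
import re

html = '<div class="content">\n<span>first line\nsecond line</span></div>'

# Without re.S, '.' refuses to match '\n', so the lazy group can never
# span the newline inside the <span> and nothing is found.
print(re.findall('<span>(.*?)</span>', html))        # []

# With re.S (re.DOTALL), '.' also matches '\n'.
print(re.findall('<span>(.*?)</span>', html, re.S))  # ['first line\nsecond line']
```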
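A side note on `ssl._create_unverified_context()`: the leading underscore marks it as a private helper. If you want the same effect through the public API, here is a sketch of the equivalent setup (it still skips certificate checks, so it is only appropriate for throwaway experiments like this one):

```python
import ssl

# Public-API equivalent of ssl._create_unverified_context():
# start from a normal context, then disable the checks.
# check_hostname must be turned off before verify_mode is relaxed.
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
```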
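The comments in the write block mention collecting everything into one dict to filter duplicates before writing. Here is a lighter sketch of that idea (the names `load_seen` and `append_new` are illustrative, not from the original): rebuild the set of authors already on disk once, then let the `'a+'` appends skip anything seen before, without ever holding every joke in memory:

```python
import os

OUT_PATH = "F:/糗事段子/file/qiushi.txt"  # same output file as the crawler

def load_seen(path):
    # Rebuild the set of authors already written, by splitting each
    # stored line on the same "說:" separator the crawler uses.
    seen = set()
    if os.path.exists(path):
        with open(path, encoding="utf-8") as fp:
            for line in fp:
                name, sep, _ = line.partition("說:")
                if sep:
                    seen.add(name)
    return seen

def append_new(jokeDict, path, seen):
    # Append only entries whose author has not been stored yet.
    with open(path, "a+", encoding="utf-8") as fp:
        for k, v in jokeDict.items():
            if k not in seen:
                fp.write(k + "說:" + v + "\n")
                seen.add(k)
```

Like the original `jokeDict`, this keys on the author's name, so each author keeps at most one joke.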
Reposted from: https://www.cnblogs.com/854594834-YT/p/10539711.html