
A Python Spider for Scraping Qiushibaike Posts and Images


The script writes the post text and image links to a database;
it also downloads the images into local date-named folders and records each image's local relative path in the database as well.
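
For reference, the resulting layout looks roughly like this (the file name below is hypothetical; in practice it is whatever the last path segment of the image URL happens to be):

QBImage/
    2014-12-07/
        example.jpg

and the corresponding database row would store location = "2014-12-07/example.jpg".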

As long as MySQL is installed, just replace the few configuration values at the top of the script with your own and you're set.
You can then set up a scheduled task, and it will crawl essentially all the content (comments and user information excluded).
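
For example, a crontab entry like the following would crawl once an hour (the script path here is an assumption; point it at wherever you saved the file):

0 * * * * python /Users/admin/code/pythonCode/QB/qiushibaike.py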

Enough talk; here's the code:

#!/usr/bin/env python
# encoding: utf-8
# author: zengqiu

import urllib2
import urllib
from bs4 import BeautifulSoup
import MySQLdb
import datetime
import re
import urlparse
import os

# MySQL connection settings -- replace these with your own.
mysql_host = "localhost"
mysql_port = 3306
mysql_user = "root"
mysql_password = "test"
mysql_db_name = "qiushibaike"
mysql_table_name = "qiushibaike"

# Local directory where downloaded images are stored, grouped by date.
image_path = "/Users/admin/code/pythonCode/QB/QBImage"


def spider(url):
    """Fetch one listing page and return a list of dicts with
    'content', 'date' and (optionally) 'image' keys."""
    req = urllib2.Request(url)
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
    req.add_header('Cache-Control', 'max-age=0')
    req.add_header('Connection', 'keep-alive')
    req.add_header('Referer', url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1660.0 Safari/537.36')
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response.read(), "html.parser")
    results = []
    for content in soup.findAll("div", "content", title=True):
        result = {}
        result['content'] = content.text
        # The next div with class "thumb" holds the post's image, if any.
        thumb = content.findNext("div")
        if thumb['class'] == [u'thumb']:
            for attr in thumb.a.img.attrs:
                if attr == "src":
                    result['image'] = thumb.a.img[attr]
        # The post's timestamp lives in the content div's title attribute.
        for attr in content.attrs:
            if attr == "title":
                result['date'] = content[attr]
        results.append(result)
    return results


def create_database(database):
    conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, port=mysql_port)
    cur = conn.cursor()
    sql = "create database %s" % database
    try:
        cur.execute(sql)
        conn.commit()
    except:
        conn.rollback()
    conn.close()


def create_table(table):
    conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, db=mysql_db_name, port=mysql_port, charset="utf8")
    cur = conn.cursor()
    # The UNIQUE constraint on (content prefix, date) lets "insert ignore"
    # silently skip posts that were already crawled.
    sql = ("CREATE TABLE %s (`id` int(11) NOT NULL AUTO_INCREMENT, "
           "`content` varchar(10000) NULL, `image` varchar(1000) NULL, "
           "`date` datetime NULL, `location` varchar(1000) NULL, "
           "CONSTRAINT entry UNIQUE (`content`(200), `date`), "
           "PRIMARY KEY (`id`)) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8 "
           "COLLATE=utf8_general_ci") % table
    try:
        cur.execute(sql)
        conn.commit()
    except:
        conn.rollback()
    conn.close()


def insert(table, date, content, image="", location=""):
    conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, db=mysql_db_name, port=mysql_port, charset="utf8")
    cur = conn.cursor()
    sql = "insert ignore into " + table + "(date, content, image, location) values(%s, %s, %s, %s)"
    params = (datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S'), content, image, location)
    try:
        cur.execute(sql, params)
        conn.commit()
    except:
        conn.rollback()
    conn.close()


def download(url, path):
    # Use the last path segment of the image URL as the local file name.
    filename = re.split('/', urlparse.urlparse(url).path)[-1]
    filepath = os.path.join(path, filename)
    if not os.path.isfile(filepath):
        urllib.urlretrieve(url, filepath)
    return filename


def makedir(path):
    if not os.path.exists(path):
        os.makedirs(path)


def run():
    page = 1
    enable = True
    # Create the database and table on the first run.
    try:
        conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_password, db=mysql_db_name, port=mysql_port, charset="utf8")
        conn.close()
    except:
        create_database(mysql_db_name)
        create_table(mysql_table_name)
    while enable:
        print "page is %d" % page
        url = "http://www.qiushibaike.com/8hr/page/%d" % page
        results = spider(url)
        if results:
            for result in results:
                if 'image' in result:
                    # Images go into a folder named after the post date,
                    # e.g. QBImage/2014-12-07/.
                    subpath = re.split(' ', result['date'])[0]
                    newpath = os.path.join(image_path, subpath)
                    makedir(newpath)
                    try:
                        filename = download(result['image'], newpath)
                        location = os.path.join(subpath, filename)
                        insert(mysql_table_name, result['date'], result['content'], result['image'], location)
                    except:
                        # The original printed `filename` here, which may be
                        # unbound if download() itself raised.
                        print result['image'] + " download failed"
                else:
                    insert(mysql_table_name, result['date'], result['content'])
            page += 1
        else:
            # An empty page means we have reached the end.
            enable = False


def main():
    print 'Please use it as ./qiushibaike.py'
    run()


if __name__ == '__main__':
    main()
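
On the first run the script creates the qiushibaike database and table automatically; after that, just run it with python qiushibaike.py (or ./qiushibaike.py after chmod +x) and it pages through the listing until a page returns no results.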


This is adapted from https://github.com/zengqiu/spider/blob/master/qiushibaike.py. The original code failed to run on my Mac, and I had to fix a few bugs to get it working. It was my first attempt at modifying Python code, and it took some fiddling.

What's left is to build a local web page that reads the database so you can browse each day's Qiushibaike content by date. It will probably have to be done in PHP; if anyone has time to build it, I'd be very grateful.
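
In the meantime, here is a minimal command-line sketch (my own addition, not from the original post) for browsing one day's posts straight from the database; it assumes the same MySQL settings as the spider above:

#!/usr/bin/env python
# encoding: utf-8
# Minimal sketch: print one day's posts from the qiushibaike table.
# Usage: python browse.py 2014-12-07
import sys
import MySQLdb

def browse(day):
    conn = MySQLdb.connect(host="localhost", user="root", passwd="test",
                           db="qiushibaike", port=3306, charset="utf8")
    cur = conn.cursor()
    # date() strips the time part so we match everything posted that day.
    cur.execute("select date, content, location from qiushibaike "
                "where date(date) = %s order by date", (day,))
    for date, content, location in cur.fetchall():
        print date
        print content.encode('utf-8')
        if location:
            print "image:", location
        print '-' * 40
    conn.close()

if __name__ == '__main__':
    browse(sys.argv[1])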

Reference: http://www.v2ex.com/t/131750



Reposted from: https://my.oschina.net/ioslighter/blog/357376
