Multi-threaded Baidu Netdisk Crawler: Complete Python Source Code

Published: 2024/7/23
This article presents the complete Python source code of a multi-threaded Baidu Netdisk (Baidu Yun) crawler, collected and organized here and shared as a reference.
#coding: utf8
# Multi-threaded Baidu Netdisk (yun.baidu.com) crawler; Python 2 source (print statements, Queue, MySQLdb).
import re                       # regular expression module
import urllib2                  # component for fetching URLs (imported in the original but unused below)
import time
from Queue import Queue
import threading, errno, datetime
import json
import requests                 # Requests is an Apache2 Licensed HTTP library
import MySQLdb as mdb

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = ''

# Regular expressions for pulling parameters back out of the request URLs
re_start = re.compile(r'start=(\d+)')      # \d matches any digit 0-9; the trailing + means one or more digits, e.g. 21312314
re_uid = re.compile(r'query_uk=(\d+)')     # query uk (user id)
re_urlid = re.compile(r'&urlid=(\d+)')     # url record id

ONEPAGE = 20        # items per page
ONESHAREPAGE = 20   # share links per page

# The album-list URL is missing
URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}'   # share list
URL_FOLLOW = 'http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}'           # follow (subscription) list
URL_FANS = 'http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}'               # fans list

QNUM = 1000
hc_q = Queue(20)      # request queue
hc_r = Queue(QNUM)    # response queue
success = 0
failed = 0

def req_worker(inx):                        # request worker
    s = requests.Session()                  # one session per thread
    while True:
        req_item = hc_q.get()               # take one request item
        req_type = req_item[0]              # request type: share, follow or fans
        url = req_item[1]                   # url to fetch
        r = s.get(url)                      # fetch the data
        hc_r.put((r.text, url))             # put the response text and its url into the response queue
        print "req_worker#", inx, url       # inx: thread index; url: the url just fetched

def response_worker():                      # response processing
    dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
    dbcurr = dbconn.cursor()
    dbcurr.execute('SET NAMES utf8')
    dbcurr.execute('set global wait_timeout=60000')
    # the lines above are database setup
    while True:
        """
        # regex notes
        match()    determines whether the RE matches at the very beginning of the string
        search()   scans the string for a position where the RE matches
        findall()  finds all substrings matching the RE and returns them as a list
        finditer() finds all substrings matching the RE and returns them as an iterator
        example Baidu page link: http://pan.baidu.com/share/link?shareid=3685432306&uk=1798788396&from=hotrec
        uk is in fact the user id
        """
        metadata, effective_url = hc_r.get()   # metadata is the r.text from above; effective_url is its url
        #print "response_worker:", effective_url
        try:
            tnow = int(time.time())                        # current unix time
            id = re_urlid.findall(effective_url)[0]        # url record id parsed from the url
            start = re_start.findall(effective_url)[0]     # start offset parsed from the url
            if True:
                if 'getfollowlist' in effective_url:       # type = 1, the follow list
                    follows = json.loads(metadata)         # parse the response text as json
                    uid = re_uid.findall(effective_url)[0] # uk of the queried user
                    if "total_count" in follows.keys() and follows["total_count"] > 0 and str(start) == "0":   # only expand paging on the first page
                        for i in range((follows["total_count"] - 1) / ONEPAGE):   # queue one urlids row per remaining page
                            try:
                                dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 1, 0)' % (uid, str(ONEPAGE * (i + 1)), str(ONEPAGE)))
                                # store a url record: uk is the user id, start is the offset to fetch from, status=0 means not processed yet
                            except Exception as ex:
                                print "E1", str(ex)
                                pass
                    if "follow_list" in follows.keys():    # users this user follows
                        for item in follows["follow_list"]:
                            try:
                                dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)' % (item['follow_uk'], item['follow_uname'], str(tnow)))
                                # store the followed user's id, name and insertion time
                            except Exception as ex:
                                print "E13", str(ex)
                                pass
                    else:
                        print "delete 1", uid, start
                        dbcurr.execute('delete from urlids where uk=%s and type=1 and start>%s' % (uid, start))
                elif 'getfanslist' in effective_url:       # type = 2, the fans list
                    fans = json.loads(metadata)
                    uid = re_uid.findall(effective_url)[0]
                    if "total_count" in fans.keys() and fans["total_count"] > 0 and str(start) == "0":
                        for i in range((fans["total_count"] - 1) / ONEPAGE):
                            try:
                                dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 2, 0)' % (uid, str(ONEPAGE * (i + 1)), str(ONEPAGE)))
                            except Exception as ex:
                                print "E2", str(ex)
                                pass
                    if "fans_list" in fans.keys():
                        for item in fans["fans_list"]:
                            try:
                                dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)' % (item['fans_uk'], item['fans_uname'], str(tnow)))
                            except Exception as ex:
                                print "E23", str(ex)
                                pass
                    else:
                        print "delete 2", uid, start
                        dbcurr.execute('delete from urlids where uk=%s and type=2 and start>%s' % (uid, start))
                else:                                      # type = 0, the share list
                    shares = json.loads(metadata)
                    uid = re_uid.findall(effective_url)[0]
                    if "total_count" in shares.keys() and shares["total_count"] > 0 and str(start) == "0":
                        for i in range((shares["total_count"] - 1) / ONESHAREPAGE):
                            try:
                                dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 0, 0)' % (uid, str(ONESHAREPAGE * (i + 1)), str(ONESHAREPAGE)))
                            except Exception as ex:
                                print "E3", str(ex)
                                pass
                    if "records" in shares.keys():
                        for item in shares["records"]:
                            try:
                                dbcurr.execute('INSERT INTO share(userid, filename, shareid, status) VALUES(%s, "%s", %s, 0)' % (uid, item['title'], item['shareid']))   # item['title'] happens to be the file name
                            except Exception as ex:
                                #print "E33", str(ex), item
                                pass
                    else:
                        print "delete 0", uid, start
                        dbcurr.execute('delete from urlids where uk=%s and type=0 and start>%s' % (uid, str(start)))
            dbcurr.execute('delete from urlids where id=%s' % (id, ))
            dbconn.commit()
        except Exception as ex:
            print "E5", str(ex), id
    dbcurr.close()
    dbconn.close()    # close the database (never reached; the loop above runs forever)

def worker():
    global success, failed
    dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
    dbcurr = dbconn.cursor()
    dbcurr.execute('SET NAMES utf8')
    dbcurr.execute('set global wait_timeout=60000')
    # database setup as above
    while True:
        #dbcurr.execute('select * from urlids where status=0 order by type limit 1')
        dbcurr.execute('select * from urlids where status=0 and type>0 limit 1')   # type>0: follow/fans lists, not share lists
        d = dbcurr.fetchall()   # take one pending row at a time
        #print d
        if d:                                   # a pending row exists
            id = d[0][0]                        # url record id
            uk = d[0][1]                        # user id (uk)
            start = d[0][2]
            limit = d[0][3]
            type = d[0][4]                      # which list type
            dbcurr.execute('update urlids set status=1 where id=%s' % (str(id),))   # mark the row as visited
            url = ""
            if type == 0:      # share
                url = URL_SHARE.format(uk=uk, start=start, id=id).encode('utf-8')    # format the share-list url
                # query_uk: uk, the user id
                # start: offset
                # urlid: id, the url record id
            elif type == 1:    # follow
                url = URL_FOLLOW.format(uk=uk, start=start, id=id).encode('utf-8')   # format the follow-list url
            elif type == 2:    # fans
                url = URL_FANS.format(uk=uk, start=start, id=id).encode('utf-8')     # format the fans-list url
            if url:
                hc_q.put((type, url))   # queue the request; type says which kind of list it is
                # the urls above return json for shares, follows or fans respectively
            #print "processed", url
        else:
            # otherwise expand the crawl tree one level: turn newly found users into new url records
            dbcurr.execute('select * from user where status=0 limit 1000')
            d = dbcurr.fetchall()
            if d:
                for item in d:
                    try:
                        dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 0, 0)' % (item[1], str(ONESHAREPAGE)))
                        # uk is in fact the user id
                        # start=0: fetch from the first record
                        #dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 1, 0)' % (item[1], str(ONEPAGE)))
                        dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 2, 0)' % (item[1], str(ONEPAGE)))
                        dbcurr.execute('update user set status=1 where userid=%s' % (item[1],))   # mark the user as expanded
                        # this queues the user's share and fans lists (the follow-list line is commented out above)
                    except Exception as ex:
                        print "E6", str(ex)
            else:
                time.sleep(1)
        dbconn.commit()
    dbcurr.close()
    dbconn.close()

def main():
    print 'starting at:', datetime.datetime.now()
    for item in range(16):
        t = threading.Thread(target=req_worker, args=(item,))
        t.setDaemon(True)
        t.start()                 # start 16 request threads
    s = threading.Thread(target=worker, args=())
    s.setDaemon(True)
    s.start()                     # start the worker thread
    response_worker()             # response_worker runs in the main thread
    print 'all Done at:', datetime.datetime.now()

if __name__ == '__main__':
    main()
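The post does not include the MySQL schema the script expects, so it cannot be run as-is. Below is a minimal setup sketch, not part of the original source: the database name (baiduyun) and the table and column names are taken from the connect/INSERT/SELECT statements above, while the column types and sizes, and the seed uk value (the example uk from the code comments), are assumptions that may need adjusting.

#coding: utf8
# Setup sketch: assumed schema, inferred from the queries in the crawler above.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', '', charset='utf8')
curr = conn.cursor()
curr.execute('CREATE DATABASE IF NOT EXISTS baiduyun DEFAULT CHARACTER SET utf8')
curr.execute('USE baiduyun')
# Pending request records: which uk to fetch, from which offset, and which list type
# (0 = share, 1 = follow, 2 = fans); status 0 = not yet fetched. The id column must come
# first, because worker() reads rows positionally (d[0][0] is id, d[0][1] is uk, ...).
curr.execute('''CREATE TABLE IF NOT EXISTS urlids (
    id INT PRIMARY KEY AUTO_INCREMENT,
    uk BIGINT, start INT, limited INT, type INT, status INT)''')
# Discovered users; status 0 = not yet expanded into new urlids rows.
curr.execute('''CREATE TABLE IF NOT EXISTS user (
    id INT PRIMARY KEY AUTO_INCREMENT,
    userid BIGINT, username VARCHAR(255), files INT, status INT,
    downloaded INT, lastaccess INT)''')
# Collected share records.
curr.execute('''CREATE TABLE IF NOT EXISTS share (
    id INT PRIMARY KEY AUTO_INCREMENT,
    userid BIGINT, filename VARCHAR(255), shareid BIGINT, status INT)''')
# Seed the crawl with one uk; 1798788396 is just the example value from the code comments.
curr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(1798788396, "seed", 0, 0, 0, 0)')
conn.commit()
conn.close()

Run once, this creates the database and seeds the user table; worker() then picks up the seeded user, writes urlids rows for its share and fans lists, and the 16 req_worker threads start fetching.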

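If you want to see what one of these endpoints returns before wiring up MySQL, the sketch below (not from the original post) fetches a single page of one user's share list and prints the fields the crawler relies on (total_count, records, title, shareid). The uk is the example value from the code comments, and the yun.baidu.com endpoint may by now require extra headers or cookies, or may no longer respond at all.

#coding: utf8
# Standalone check of the getsharelist endpoint used by the crawler (assumes it still responds).
import json
import requests

uk = 1798788396   # example uk taken from the comment in the crawler code
url = ('http://yun.baidu.com/pcloud/feed/getsharelist'
       '?auth_type=1&start=0&limit=20&query_uk=%d&urlid=0' % uk)
r = requests.get(url)
data = json.loads(r.text)
print "total_count:", data.get("total_count")
for record in data.get("records", []):
    print record.get("shareid"), record.get("title")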

Summary

The above is the full content of the multi-threaded Baidu Netdisk crawler's complete Python source code; hopefully it helps you solve the problems you are facing.
