百度爬虫（Baidu crawler）
获取人名（step 1: collect celebrity names）
# coding: utf-8
# Scrape celebrity names from manmankan.com category pages and append them,
# one per line, to Name.txt (consumed by the photo-crawler script below).
import requests
from lxml import etree

# Empty mapping disables any proxy; fill in when running behind one.
proxies = {}


def getName(link):
    """Fetch one listing page and append the names found to Name.txt.

    Returns the number of names extracted; 0 signals a failed or empty
    page, which callers can use as a pagination stop condition.
    """
    print(link)
    NameList = []
    try:
        # The GET itself must be inside the try: a connection error here
        # previously crashed the whole crawl instead of being skipped.
        r = requests.get(link, proxies=proxies)
        # The site serves GBK-encoded pages; requests guesses wrong without this.
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        NameList = html.xpath('//div[@class="i_cont_s"]/a/text()')
        if NameList:
            with open('Name.txt', 'a+') as f:
                # Trailing '\n' keeps the last name of this batch from
                # fusing with the first name of the next appended batch.
                f.write('\n'.join(NameList) + '\n')
    except Exception:
        # Best-effort crawl: log the failing URL and keep going.
        print('-----------: ', link)
    return len(NameList)


Links = [
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/riben/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/hanguo/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/riben/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/hanguo/',
]
for link in Links:
    getName(link)
# 爬照片 —— next section: the photo crawler.
# coding: utf-8
"""Download celebrity photos from Baidu image search, one folder per name.

Reads names from Name.txt, skips those already recorded in ok.txt, and
fans the downloads out over a small multiprocessing pool.
"""
import os
import multiprocessing

import requests

# NOTE(review): corporate proxy credentials are hard-coded in source —
# move them to an environment variable or a config file kept out of VCS.
proxies = {"http": "http://d84105117:@DLB940920@proxycn2.huawei.com:8080/", #注意最后的'/'一定要有
           "https": "http://d84105117:@DLB940920@proxycn2.huawei.com:8080/"
           }


def _build_params(keyword, pages):
    """Build the acjson query dicts for the first *pages* result pages (30 hits each)."""
    params = []
    for i in range(0, 30 * pages, 30):
        params.append({'tn': 'resultjson_com', 'ipn': 'rj', 'ct': 201326592,
                       'is': '', 'fp': 'result', 'queryWord': keyword,
                       'cl': 2, 'lm': -1, 'ie': 'utf-8', 'oe': 'utf-8',
                       'adpicid': '', 'st': -1, 'z': '', 'ic': 0,
                       'word': keyword, 's': '', 'se': '', 'tab': '',
                       'width': '', 'height': '', 'face': 0, 'istype': 2,
                       'qc': '', 'nc': 1, 'fr': '', 'pn': i, 'rn': 30,
                       'gsm': '1e', '1536131285172': ''})
    return params


def getManyPages(keyword, pages):
    """Query Baidu image search for *keyword*; return one 'data' list per page."""
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for param in _build_params(keyword, pages):
        try:
            # One request per page — the original issued two (one just to
            # print the URL, a second one for the JSON body).
            resp = requests.get(url, params=param, proxies=proxies)
            print(resp.url)
            urls.append(resp.json().get('data'))
        except Exception as e:
            # Best-effort: log and skip pages that fail or return non-JSON.
            print('page failed for %s: %s' % (keyword, e))
    return urls


def getImg(dataList, localPath):
    """Download every thumbURL in *dataList* into *localPath* as 0.jpg, 1.jpg, ..."""
    if not os.path.exists(localPath):  # 新建文件夾
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        if not page:
            # .json().get('data') can yield None/empty — skip such pages.
            continue
        for item in page:
            thumb = item.get('thumbURL')
            if thumb is None:
                continue  # 圖片鏈接不存在
            print('正在下載:%s' % thumb)
            # NOTE(review): verify=False disables TLS verification
            # (presumably needed behind the intercepting proxy) — confirm.
            ir = requests.get(thumb, proxies=proxies, timeout=15, verify=False)
            # Context manager closes the handle (the original leaked it).
            with open(localPath + '%d.jpg' % x, 'wb') as f:
                f.write(ir.content)
            x += 1


def spider(keyword):
    """Download 3 pages of images for *keyword*, then record it in ok.txt."""
    print('正在處理', keyword)
    dataList = getManyPages(keyword, 3)  # 參數1:關鍵字,參數2:要下載的頁數
    getImg(dataList, keyword + '/')      # 參數2:指定保存的路徑
    with open('ok.txt', 'a+') as f:
        f.write(keyword + '\n')


if __name__ == '__main__':
    with open('Name.txt', 'r+') as f:
        NameList = f.read().splitlines()
    with open('ok.txt', 'r+') as f:
        OkList = f.read().splitlines()
    done = set(OkList)  # O(1) membership instead of a list scan per name
    pool = multiprocessing.Pool(processes=4)
    for keyword in NameList:
        if keyword in done:
            print(keyword + ' is already ok,continue-----')
            continue
        pool.apply_async(spider, args=(keyword, ))
    pool.close()
    pool.join()
# 圖片裁剪 —— next section: the face-cropping script.
# coding: utf-8
"""Crop faces out of the downloaded photos with MTCNN.

For every image under D:\code\python\china\<name>\ the single detected
face box is expanded by half its width/height on each side, clamped to
the image, and the crop is saved under D:\data\<name>\ with the same file
name.  Images with zero or several faces, or crops not larger than
100x100, are skipped.

MTCNN: https://github.com/pangyupo/mxnet_mtcnn_face_detection
Reference: https://blog.csdn.net/qq_32166627/article/details/60882964
"""
import os
import time  # kept from the original import block (currently unused here)

import cv2
import mxnet as mx
import numpy as np
from mtcnn_detector import MtcnnDetector

detector = MtcnnDetector(model_folder='model', ctx=mx.cpu(0),
                         num_worker=4, accurate_landmark=False)

base_dirs = r'D:\code\python\china' + '\\'
for dirs in os.listdir(base_dirs):
    savedir = r'D:\data' + '\\' + dirs
    if not os.path.exists(savedir):  # 新建文件夾
        os.mkdir(savedir)
    base_dir = base_dirs + dirs + '\\'
    for image_name in os.listdir(base_dir):
        imagecp = base_dir + image_name
        print(imagecp)
        # imdecode + np.fromfile instead of cv2.imread so Windows paths
        # containing Chinese characters load correctly.
        img = cv2.imdecode(np.fromfile(imagecp, dtype=np.uint8), 1)
        results = detector.detect_face(img)
        # Only keep images where exactly one face was found.
        if results is None or len(results[0]) != 1:
            continue
        b = results[0][0]
        print(b)
        try:
            # Half-sizes are computed from the ORIGINAL box before any
            # coordinate is moved, matching the original update order.
            half_h = (b[3] - b[1]) / 2
            half_w = (b[2] - b[0]) / 2
            b[1] = max(b[1] - half_h, 0)
            b[0] = max(b[0] - half_w, 0)
            b[2] = min(b[2] + half_w, img.shape[1])
            b[3] = min(b[3] + half_h, img.shape[0])
            crop = img[int(b[1]):int(b[3]), int(b[0]):int(b[2])]
            # Discard crops that are too small to be useful.
            if crop.shape[0] > 100 and crop.shape[1] > 100:
                diss = savedir + '\\' + image_name
                # imencode + tofile for the same non-ASCII-path reason.
                cv2.imencode('.jpg', crop)[1].tofile(diss)
        except Exception as e:
            # Best-effort: one bad image must not stop the whole batch,
            # but log it instead of swallowing silently.
            print('failed on %s: %s' % (imagecp, e))
總結
- 上一篇: 浅谈前端开发必备知识点及未来发展方向
- 下一篇: 计算机网络家庭网络设计,为家庭设计一个局