
Baidu Crawler



Fetching Names

# coding: utf-8
import requests
from lxml import etree
from lxml.etree import HTMLParser

proxies = {}

#r=requests.get('http://www.baidu.com',proxies=proxies)

'''
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
session = requests.session()
response = session.get("https://www.zhihu.com", headers=headers, proxies=proxies, verify=False)
#proxy.huawei.com
'''

#NameLists=[]
def getName(link):
    # Fetch one listing page and append every name found on it to Name.txt.
    print(link)
    NameList = []
    r = requests.get(link, proxies=proxies)
    try:
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        NameList = html.xpath('//div[@class="i_cont_s"]/a/text()')
        #print(NameList)
        with open('Name.txt', 'a+') as f:
            f.write('\n'.join(NameList))
    except:
        print('-----------: ', link)
    return len(NameList)

# A-Z index pages with pagination (kept commented out, as in the original run)
'''
baselink = 'http://www.manmankan.com/dy2013/mingxing/'
for i in range(ord('A'), ord('Z') + 1):
    link = baselink + chr(i) + '/'
    getName(link)
    page = 2
    while 1:
        slink = link + 'index_' + str(page) + '.shtml'
        lens = getName(slink)
        page += 1
        if lens < 1:
            break
'''

# Category pages: actors and singers by region
Links = [
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/riben/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/hanguo/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/riben/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/hanguo/'
]
for link in Links:
    getName(link)
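
For clarity, here is a minimal, self-contained sketch of the extraction step inside getName(). The HTML snippet is a made-up stand-in for the real manmankan.com markup, which is assumed to wrap each name in a div with class i_cont_s:

# coding: utf-8
# Minimal sketch of the XPath extraction used by getName(); the sample HTML
# below is invented for illustration only.
from lxml import etree

sample = '''
<html><body>
  <div class="i_cont_s"><a href="/star/1.shtml">Name A</a></div>
  <div class="i_cont_s"><a href="/star/2.shtml">Name B</a></div>
</body></html>
'''

html = etree.HTML(sample)
names = html.xpath('//div[@class="i_cont_s"]/a/text()')
print(names)   # ['Name A', 'Name B']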

Crawling Photos

# coding: utf-8
'''
with open('Name.txt', 'r+') as f:
    NameList = f.read().splitlines()
print(len(NameList))
'''
import requests
import os
import multiprocessing

proxies = {
    "http": "http://d84105117:@DLB940920@proxycn2.huawei.com:8080/",   # note: the trailing '/' is required
    "https": "http://d84105117:@DLB940920@proxycn2.huawei.com:8080/"
}

def getManyPages(keyword, pages):
    # Build one request per result page (30 images per page) against Baidu's
    # image-search JSON endpoint and collect the 'data' arrays it returns.
    params = []
    for i in range(0, 30 * pages, 30):
        params.append({
            'tn': 'resultjson_com', 'ipn': 'rj', 'ct': 201326592, 'is': '', 'fp': 'result',
            'queryWord': keyword, 'cl': 2, 'lm': -1, 'ie': 'utf-8', 'oe': 'utf-8',
            'adpicid': '', 'st': -1, 'z': '', 'ic': 0, 'word': keyword, 's': '', 'se': '',
            'tab': '', 'width': '', 'height': '', 'face': 0, 'istype': 2, 'qc': '', 'nc': 1,
            'fr': '', 'pn': i, 'rn': 30, 'gsm': '1e', '1536131285172': ''
        })
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for param in params:
        try:
            r = requests.get(url, params=param, proxies=proxies)
            print(r.url)
            urls.append(r.json().get('data'))
        except Exception as e:
            pass
    return urls

def getImg(dataList, localPath):
    if not os.path.exists(localPath):   # create the folder if it does not exist
        os.mkdir(localPath)
    x = 0
    for batch in dataList:
        if not batch:                   # skip pages whose 'data' field came back empty
            continue
        for i in batch:
            if i.get('thumbURL') != None:
                print('Downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'), proxies=proxies, timeout=15, verify=False)
                open(localPath + '%d.jpg' % x, 'wb').write(ir.content)
                x += 1
            else:
                pass
                #print('image link not available')

def spider(keyword):
    print('Processing', keyword)
    dataList = getManyPages(keyword, 3)   # arg 1: keyword; arg 2: number of pages to fetch
    getImg(dataList, keyword + '/')       # arg 2: directory to save into
    with open('ok.txt', 'a+') as f:
        f.write(keyword + '\n')

if __name__ == '__main__':
    with open('Name.txt', 'r+') as f:
        NameList = f.read().splitlines()
    with open('ok.txt', 'r+') as f:       # ok.txt must already exist; it records finished keywords
        OkList = f.read().splitlines()
    pool = multiprocessing.Pool(processes=4)
    for keyword in NameList:
        if keyword in OkList:
            print(keyword + ' is already ok, continue-----')
            continue
        pool.apply_async(spider, args=(keyword,))
    pool.close()
    pool.join()
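
Before letting the pool loose on the whole name list, it can help to dry-run a single keyword. The sketch below is hypothetical: it assumes getManyPages() and getImg() from the script above are pasted alongside it (or importable), and the keyword and page count are arbitrary examples.

# coding: utf-8
# Hypothetical single-keyword dry run, without the multiprocessing pool.
keyword = 'test keyword'                           # arbitrary example keyword
dataList = getManyPages(keyword, 1)                # fetch a single page (30 results)
dataList = [batch for batch in dataList if batch]  # drop empty result pages
for batch in dataList:
    for item in batch:
        print(item.get('thumbURL'))                # thumbnail URLs Baidu returned
getImg(dataList, keyword + '/')                    # download into ./<keyword>/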

Image Cropping

# coding: utf-8
import mxnet as mx
from mtcnn_detector import MtcnnDetector
import cv2
import os
import time
import numpy as np

detector = MtcnnDetector(model_folder='model', ctx=mx.cpu(0), num_worker=4, accurate_landmark=False)

base_dirs = r'D:\code\python\china\\'
dirlist = os.listdir(base_dirs)
for dirs in dirlist:
    savedir = r'D:\data\\' + dirs
    if not os.path.exists(savedir):   # create the folder if it does not exist
        os.mkdir(savedir)
    #else:
    #    continue
    base_dir = base_dirs + dirs + '\\'
    index = 0
    imagelist = os.listdir(base_dir)
    while index < len(imagelist):
        imagecp = base_dir + imagelist[index]
        print(imagecp)
        # imdecode + np.fromfile handles non-ASCII file paths on Windows
        img = cv2.imdecode(np.fromfile(imagecp, dtype=np.uint8), 1)
        results = detector.detect_face(img)
        if results is not None and len(results[0]) == 1:   # keep only single-face images
            total_boxes = results[0]
            points = results[1]
            #draw = img.copy()
            b = total_boxes[0]
            print(b)
            try:
                # enlarge the detected box by half its height/width on each side,
                # then clamp it to the image borders
                bound0 = (b[3] - b[1]) / 2
                bound1 = (b[2] - b[0]) / 2
                b[1] -= bound0
                b[0] -= bound1
                if b[1] < 0:
                    b[1] = 0
                if b[0] < 0:
                    b[0] = 0
                b[2] += bound1
                b[3] += bound0
                if b[2] > img.shape[1]:
                    b[2] = img.shape[1]
                if b[3] > img.shape[0]:
                    b[3] = img.shape[0]
                imageok = img[int(b[1]):int(b[3]), int(b[0]):int(b[2])]
                if imageok.shape[0] > 100 and imageok.shape[1] > 100:   # drop crops that are too small
                    diss = savedir + '\\' + imagelist[index]
                    #cv2.imwrite(diss, imageok)
                    cv2.imencode('.jpg', imageok)[1].tofile(diss)   # tofile also handles non-ASCII paths
                    cv2.rectangle(img, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))
                    #cv2.imshow("img", img)
                    #key = cv2.waitKey(0)
            except:
                pass
        index += 1

# Alternative / debugging snippets kept from the original, commented out:
'''
        cv2.rectangle(draw, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))
        for p in points:
            for i in range(5):
                cv2.circle(draw, (p[i], p[i + 5]), 1, (0, 0, 255), 2)
        cv2.imshow("img", draw)
        key = cv2.waitKey(0)
'''
'''
        if results is not None and len(results[0]) == 1:
            try:
                b = results[0][0]
                imageok = img[int(b[1]) - 50:int(b[0]) + 50, int(b[3]) - 50:int(b[2]) + 50]
                if imageok.shape[0] < 100 or imageok.shape[1] < 100:
                    continue
                diss = savedir + '\\' + imagelist[index]
                cv2.imwrite(diss, imageok)
                # cv2.rectangle(img, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (255, 255, 255))
                #cv2.imshow("img", imageok)
                #key = cv2.waitKey(0)
            except:
                pass
'''
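
The box arithmetic above is easy to misread, so here is the same crop logic pulled into a stand-alone helper as a minimal sketch: the MTCNN box [x1, y1, x2, y2] is grown by half its width and height on every side, then clamped to the image borders. The name expand_box is introduced here only for illustration.

# coding: utf-8
# Hypothetical helper mirroring the crop logic in the script above.
def expand_box(b, img_shape):
    h_half = (b[3] - b[1]) / 2               # half of the detected box height
    w_half = (b[2] - b[0]) / 2               # half of the detected box width
    x1 = max(b[0] - w_half, 0)               # grow left, clamp at 0
    y1 = max(b[1] - h_half, 0)               # grow up, clamp at 0
    x2 = min(b[2] + w_half, img_shape[1])    # grow right, clamp at image width
    y2 = min(b[3] + h_half, img_shape[0])    # grow down, clamp at image height
    return int(x1), int(y1), int(x2), int(y2)

# Example: a 100x120 box near the top-left corner of a 640x480 image (shape (480, 640, 3)).
x1, y1, x2, y2 = expand_box([30, 20, 130, 140], (480, 640, 3))
print(x1, y1, x2, y2)   # 0 0 180 200
# The crop img[y1:y2, x1:x2] is then the enlarged face region.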

MTCNN:
https://github.com/pangyupo/mxnet_mtcnn_face_detection
Reference:
https://blog.csdn.net/qq_32166627/article/details/60882964
