百度爬虫（Baidu crawler）
获取人名（step 1: collect celebrity names）
# coding: utf-8
# Scrape celebrity names from manmankan.com category pages and append them,
# one per line, to Name.txt (consumed by the photo-crawler script below).
import requests
from lxml import etree

# Empty mapping disables any proxy; fill in when running behind one.
proxies = {}


def getName(link):
    """Fetch one listing page and append the names found to Name.txt.

    Returns the number of names extracted; 0 signals a failed or empty
    page, which callers can use as a pagination stop condition.
    """
    print(link)
    NameList = []
    try:
        # The GET itself must be inside the try: a connection error here
        # previously crashed the whole crawl instead of being skipped.
        r = requests.get(link, proxies=proxies)
        # The site serves GBK-encoded pages; requests guesses wrong without this.
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        NameList = html.xpath('//div[@class="i_cont_s"]/a/text()')
        if NameList:
            with open('Name.txt', 'a+') as f:
                # Trailing '\n' keeps the last name of this batch from
                # fusing with the first name of the next appended batch.
                f.write('\n'.join(NameList) + '\n')
    except Exception:
        # Best-effort crawl: log the failing URL and keep going.
        print('-----------: ', link)
    return len(NameList)


Links = [
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/riben/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/yanyuan/hanguo/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/neidi/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/xianggang/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/taiwan/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/riben/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/oumei/',
    'http://www.manmankan.com/dy2013/mingxing/geshou/hanguo/',
]
for link in Links:
    getName(link)
# 爬照片 —— next section: the photo crawler.
# coding: utf-8
"""Download celebrity photos from Baidu image search, one folder per name.

Reads names from Name.txt, skips those already recorded in ok.txt, and
fans the downloads out over a small multiprocessing pool.
"""
import os
import multiprocessing

import requests

# NOTE(review): corporate proxy credentials are hard-coded in source —
# move them to an environment variable or a config file kept out of VCS.
proxies = {"http": "http://d84105117:@DLB940920@proxycn2.huawei.com:8080/", #注意最后的'/'一定要有
           "https": "http://d84105117:@DLB940920@proxycn2.huawei.com:8080/"
           }


def _build_params(keyword, pages):
    """Build the acjson query dicts for the first *pages* result pages (30 hits each)."""
    params = []
    for i in range(0, 30 * pages, 30):
        params.append({'tn': 'resultjson_com', 'ipn': 'rj', 'ct': 201326592,
                       'is': '', 'fp': 'result', 'queryWord': keyword,
                       'cl': 2, 'lm': -1, 'ie': 'utf-8', 'oe': 'utf-8',
                       'adpicid': '', 'st': -1, 'z': '', 'ic': 0,
                       'word': keyword, 's': '', 'se': '', 'tab': '',
                       'width': '', 'height': '', 'face': 0, 'istype': 2,
                       'qc': '', 'nc': 1, 'fr': '', 'pn': i, 'rn': 30,
                       'gsm': '1e', '1536131285172': ''})
    return params


def getManyPages(keyword, pages):
    """Query Baidu image search for *keyword*; return one 'data' list per page."""
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for param in _build_params(keyword, pages):
        try:
            # One request per page — the original issued two (one just to
            # print the URL, a second one for the JSON body).
            resp = requests.get(url, params=param, proxies=proxies)
            print(resp.url)
            urls.append(resp.json().get('data'))
        except Exception as e:
            # Best-effort: log and skip pages that fail or return non-JSON.
            print('page failed for %s: %s' % (keyword, e))
    return urls


def getImg(dataList, localPath):
    """Download every thumbURL in *dataList* into *localPath* as 0.jpg, 1.jpg, ..."""
    if not os.path.exists(localPath):  # 新建文件夾
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        if not page:
            # .json().get('data') can yield None/empty — skip such pages.
            continue
        for item in page:
            thumb = item.get('thumbURL')
            if thumb is None:
                continue  # 圖片鏈接不存在
            print('正在下載:%s' % thumb)
            # NOTE(review): verify=False disables TLS verification
            # (presumably needed behind the intercepting proxy) — confirm.
            ir = requests.get(thumb, proxies=proxies, timeout=15, verify=False)
            # Context manager closes the handle (the original leaked it).
            with open(localPath + '%d.jpg' % x, 'wb') as f:
                f.write(ir.content)
            x += 1


def spider(keyword):
    """Download 3 pages of images for *keyword*, then record it in ok.txt."""
    print('正在處理', keyword)
    dataList = getManyPages(keyword, 3)  # 參數1:關鍵字,參數2:要下載的頁數
    getImg(dataList, keyword + '/')      # 參數2:指定保存的路徑
    with open('ok.txt', 'a+') as f:
        f.write(keyword + '\n')


if __name__ == '__main__':
    with open('Name.txt', 'r+') as f:
        NameList = f.read().splitlines()
    with open('ok.txt', 'r+') as f:
        OkList = f.read().splitlines()
    done = set(OkList)  # O(1) membership instead of a list scan per name
    pool = multiprocessing.Pool(processes=4)
    for keyword in NameList:
        if keyword in done:
            print(keyword + ' is already ok,continue-----')
            continue
        pool.apply_async(spider, args=(keyword, ))
    pool.close()
    pool.join()
# 圖片裁剪 —— next section: the face-cropping script.
# coding: utf-8
"""Crop faces out of the downloaded photos with MTCNN.

For every image under D:\code\python\china\<name>\ the single detected
face box is expanded by half its width/height on each side, clamped to
the image, and the crop is saved under D:\data\<name>\ with the same file
name.  Images with zero or several faces, or crops not larger than
100x100, are skipped.

MTCNN: https://github.com/pangyupo/mxnet_mtcnn_face_detection
Reference: https://blog.csdn.net/qq_32166627/article/details/60882964
"""
import os
import time  # kept from the original import block (currently unused here)

import cv2
import mxnet as mx
import numpy as np
from mtcnn_detector import MtcnnDetector

detector = MtcnnDetector(model_folder='model', ctx=mx.cpu(0),
                         num_worker=4, accurate_landmark=False)

base_dirs = r'D:\code\python\china' + '\\'
for dirs in os.listdir(base_dirs):
    savedir = r'D:\data' + '\\' + dirs
    if not os.path.exists(savedir):  # 新建文件夾
        os.mkdir(savedir)
    base_dir = base_dirs + dirs + '\\'
    for image_name in os.listdir(base_dir):
        imagecp = base_dir + image_name
        print(imagecp)
        # imdecode + np.fromfile instead of cv2.imread so Windows paths
        # containing Chinese characters load correctly.
        img = cv2.imdecode(np.fromfile(imagecp, dtype=np.uint8), 1)
        results = detector.detect_face(img)
        # Only keep images where exactly one face was found.
        if results is None or len(results[0]) != 1:
            continue
        b = results[0][0]
        print(b)
        try:
            # Half-sizes are computed from the ORIGINAL box before any
            # coordinate is moved, matching the original update order.
            half_h = (b[3] - b[1]) / 2
            half_w = (b[2] - b[0]) / 2
            b[1] = max(b[1] - half_h, 0)
            b[0] = max(b[0] - half_w, 0)
            b[2] = min(b[2] + half_w, img.shape[1])
            b[3] = min(b[3] + half_h, img.shape[0])
            crop = img[int(b[1]):int(b[3]), int(b[0]):int(b[2])]
            # Discard crops that are too small to be useful.
            if crop.shape[0] > 100 and crop.shape[1] > 100:
                diss = savedir + '\\' + image_name
                # imencode + tofile for the same non-ASCII-path reason.
                cv2.imencode('.jpg', crop)[1].tofile(diss)
        except Exception as e:
            # Best-effort: one bad image must not stop the whole batch,
            # but log it instead of swallowing silently.
            print('failed on %s: %s' % (imagecp, e))
總結
- 上一篇: 浅谈前端开发必备知识点及未来发展方向
- 下一篇: 计算机网络家庭网络设计,为家庭设计一个局