Scraping Moko MM photos with a Python script (re)
It's been a long time since my last post; I've been buried in risk-control work, and once that settles down I'll write the material up so we can dig into it together.
Over the past couple of days I found time to write two Python crawler scripts: one based on re, the other on XPath.
Straight to the code — the re-based version:
spider.py
# -*- coding:utf-8 -*-
import urllib.request
import re
import tool
import os
import http.cookiejar


# Scraper for MOKO_MM
class Spider:
    def __init__(self):
        self.siteURL = 'http://www.moko.cc/focus|list.action'
        self.tool = tool.Tool()

    # Fetch the HTML of one index page
    def getPage(self, pageIndex):
        url = self.siteURL + "?type=4&curPage=" + str(pageIndex)
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8')

    # Extract every model's info from an index page, as a list
    def getContents(self, pageIndex):
        page = self.getPage(pageIndex)
        pattern = re.compile(
            '<div class="subMainContent".*?<a href="(.*?)".*?subFocus-07.*?'
            '<img src="(.*?)".*?subFocus-08.*?<h1>(.*?)</h1>', re.S)
        items = re.findall(pattern, page)
        # item[0] detail-page link, item[1] thumbnail, item[2] title
        contents = []
        for item in items:
            # sanitize the title so it can be used as a folder name
            contents.append([item[0], item[1],
                             item[2].replace(" ", "-").replace("|", "-")
                                    .replace(".", "-").replace(":", "-")])
        return contents

    # Fetch a model's personal detail page
    def getDetailPage(self, infoURL):
        def makeMyOpener(head={
                'accept-encoding': 'deflate, sdch',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cookie': 'JSESSIONID=58C82905AD36B5DFA8D4F1C98A2559DC; Hm_lvt_8d82e75c6168ba4bc0135a08edae2a2e=1488505496; Hm_lpvt_8d82e75c6168ba4bc0135a08edae2a2e',
                'Referer': 'https://mm.taobao.com/687471686.htm',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}):
            # carry a fresh cookie jar plus browser-like headers
            cookie = http.cookiejar.CookieJar()
            opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
            opener.addheaders = list(head.items())
            return opener

        oper = makeMyOpener()
        uop = oper.open(infoURL)
        return uop.read().decode('utf-8')

    # Extract the text profile from a detail page
    def getBrief(self, page):
        pattern = re.compile(
            '<div class="infoShow-12".*?<p align="center".*?<strong>(.*?)</strong>'
            '.*?<strong>(.*?)</strong>.*?<strong>(.*?)</strong>.*?<strong>(.*?)</strong>', re.S)
        items = re.findall(pattern, page)
        # item[0] homepage, item[1] avatar, item[2] name
        contents = []
        for item in items:
            contents.append([item[0], item[1], item[2]])
        return contents
        # alternative: return only the first match, cleaned up by tool.Tool
        # result = re.search(pattern, page)
        # return self.tool.replace(result.group(1))

    # Extract all photo URLs from a detail page
    def getAllImg(self, page):
        # grab the block of the profile page that holds the photos
        pattern = re.compile('<div class="infoShow-12">(.*?)<div class="infoShow-13">', re.S)
        content = re.search(pattern, page)
        # then pull the image URLs out of that block
        patternImg = re.compile('<img.*?src="(.*?)"', re.S)
        return re.findall(patternImg, content.group(1))

    # Save a set of photos
    def saveImgs(self, images, name):
        number = 1
        print("Found", len(images), "photos for", name)
        for imageURL in images:
            fTail = imageURL.split('.').pop()
            if len(fTail) > 3:
                fTail = "jpg"
            fileName = name + "/" + str(number) + "." + fTail
            self.saveImg(imageURL, fileName)
            number += 1

    # Save the avatar
    def saveIcon(self, iconURL, name):
        fTail = iconURL.split('.').pop()
        fileName = name + "/icon." + fTail
        self.saveImg(iconURL, fileName)

    # Save the text profile
    def saveBrief(self, content, name):
        fileName = name + "/" + name + ".txt"
        print("Saving profile to", fileName)
        with open(fileName, "w+") as f:
            f.write(content)

    # Save the raw detail page into the model's folder, and append
    # the name to url.txt so saveAll can pick it up later
    def saveToLocal(self, Li, name):
        fileName = name + "/" + "urlPage.txt"
        print("Saving URL page:", fileName)
        with open(fileName, "w") as f:
            f.write(Li)
        with open('url.txt', 'a') as url:
            url.write(name + " ")
        print(name, "appended!\n")

    # Download a single image to fileName
    def saveImg(self, imageURL, fileName):
        try:
            u = urllib.request.urlopen(imageURL)
            data = u.read()
            with open(fileName, 'wb') as f:
                f.write(data)
            print("Saved image", fileName)
        except urllib.request.URLError as e:
            print(e.reason)

    # Create a directory if it does not exist yet
    def mkdir(self, path):
        path = path.strip()
        if not os.path.exists(path):
            print("Created folder", path)
            os.makedirs(path)
            return True
        else:
            print("Folder", path, "already exists")
            return False

    # Save the info of one index page of models
    def savePageInfo(self, pageIndex):
        contents = self.getContents(pageIndex)
        for item in contents:
            # item[0] detail URL, item[1] avatar URL, item[2] name
            print("Found", item[2], "- saving")
            detailURL = "http://www.moko.cc" + str(item[0])
            print("Detail page:", detailURL)
            detailPage = self.getDetailPage(detailURL)
            self.mkdir(item[2])
            images = self.getAllImg(detailPage)
            self.saveImgs(images, item[2])
            # optional extras:
            # brief = self.getBrief(detailPage)
            # self.saveBrief(brief, item[2])
            # self.saveToLocal(detailPage, item[2])
            self.saveIcon("https:" + str(item[1]), item[2])

    # Delete the old name list, if any
    def deleteOldTxt(self):
        filename = 'url.txt'
        if os.path.exists(filename):
            os.remove(filename)
            print("\nOld list found and deleted; starting the crawl\n")

    # Crawl index pages start..end and save everything found
    def savePagesInfo(self, start, end):
        for i in range(start, end + 1):
            print("Scanning page", i)
            self.savePageInfo(i)

    # Read the name list written by saveToLocal
    def openNameList(self):
        with open("url.txt", "r") as f:
            for line in f:
                # \s matches spaces and tabs; \s+ means at least one
                return re.split(r'\s+', line.strip())

    # Re-download photos for every saved urlPage.txt
    def saveAll(self):
        for name in self.openNameList():
            print("Now saving photos for", name)
            with open(name + "/urlPage.txt", "r") as f:
                urlContent = f.read()
            images = self.getAllImg(urlContent)
            self.saveImgs(images, name)


# Pass the start and end page numbers; (1, 10) crawls index pages 1 to 10
spider = Spider()
spider.deleteOldTxt()
spider.savePagesInfo(1, 10)
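The whole scraper leans on one technique: a single dotall regex (re.S) whose lazy groups walk the page in document order and capture the link, thumbnail, and title in one findall pass. Here is a minimal, self-contained sketch of what getContents does, run against an invented HTML fragment (the markup below is made up for illustration; the real moko.cc listing differed in detail):

# -*- coding:utf-8 -*-
import re

# An invented listing fragment, shaped like the markup the regex expects.
page = '''
<div class="subMainContent">
  <a href="/post/123.html">entry</a>
  <div class="subFocus-07"><img src="//img.example.com/thumb.jpg"></div>
  <div class="subFocus-08"><h1>Demo Title</h1></div>
</div>
'''

# re.S makes . match newlines, so the lazy .*? groups can span lines.
pattern = re.compile(
    '<div class="subMainContent".*?<a href="(.*?)".*?subFocus-07.*?'
    '<img src="(.*?)".*?subFocus-08.*?<h1>(.*?)</h1>', re.S)

for link, thumb, title in re.findall(pattern, page):
    print(link, thumb, title)
# prints: /post/123.html //img.example.com/thumb.jpg Demo Title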
tool.py
# -*- coding:utf-8 -*-
import re


# Helper class that strips page tags from scraped HTML
class Tool:
    # remove img tags, runs of 1-7 spaces, and &nbsp; entities
    removeImg = re.compile(r'<img.*?>| {1,7}|&nbsp;')
    # remove hyperlink tags
    removeAddr = re.compile(r'<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD = re.compile(r'<td>')
    # turn <br> and <br><br> into \n
    replaceBR = re.compile(r'<br><br>|<br>')
    # strip any remaining tags
    removeExtraTag = re.compile(r'<.*?>')
    # collapse runs of blank lines
    removeNoneLine = re.compile(r'\n+')
    # drop leftover spaces
    removeSpace = re.compile(r' ')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        x = re.sub(self.removeSpace, "", x)
        # strip() removes leading/trailing whitespace
        return x.strip()
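The XPath twin of this script isn't shown in this post. For comparison, below is a minimal sketch of how the same listing step could look with lxml, assuming container and class names that mirror the regex above (get_contents and the selectors inside it are my illustration, not the original script, and the live page structure may well have differed):

# -*- coding:utf-8 -*-
from lxml import html

def get_contents(page_source):
    # Parse once, then query the DOM instead of matching raw text.
    tree = html.fromstring(page_source)
    contents = []
    for block in tree.xpath('//div[@class="subMainContent"]'):
        link = block.xpath('.//a/@href')
        thumb = block.xpath('.//*[contains(@class, "subFocus-07")]//img/@src')
        title = block.xpath('.//*[contains(@class, "subFocus-08")]//h1/text()')
        if link and thumb and title:
            contents.append([link[0], thumb[0], title[0].strip()])
    return contents

Unlike the regex version, a DOM query like this is tolerant of attribute reordering, whitespace, and small nesting changes that would silently break the lazy-group pattern.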