當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

Pyppeteer 使用笔记

發布時間：2024/1/23 编程问答 23 豆豆

生活随笔收集整理的這篇文章主要介紹了 Pyppeteer 使用笔记，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

https://mp.weixin.qq.com/s/Iz-DY1UrSfVFRFh5CyHl3Q

Puppeteer 是 Google 基于 Node.js 開發的一個工具，有了它我們可以通過 JavaScript 來控制 Chrome 瀏覽器的一些操作，當然也可以用在網絡爬蟲上，其 API 極其完善，功能非常強大。而 Pyppeteer 又是什么呢？它實際上是 Puppeteer 的 Python 版本實現，但它不是 Google 開發的，而是一位來自日本的工程師依據 Puppeteer 的一些功能開發出來的非官方版本。

GitHub 地址: https://github.com/miyakogi/pyppeteer

官方文檔: https://miyakogi.github.io/pyppeteer/reference.html

基礎用法

import asyncio
from pyppeteer import launch


async def main():
    """Walk through the basic Pyppeteer page APIs against toutiao.com.

    Launches a visible browser, loads the page, then demonstrates waiting,
    scrolling, taking a screenshot, reading cookies/HTML/title, evaluating
    JavaScript and extracting element properties. Everything is printed;
    nothing is returned. The browser is closed even if a step fails.
    """
    # headless=False opens a visible browser window.
    # Pyppeteer accepts launch options as a dict or as keyword arguments;
    # Puppeteer only accepts a dict.
    # To run a specific Chromium build, pass its path explicitly:
    # exepath = r'C:\Users\Administrator\AppData\Local\pyppeteer\pyppeteer\local-chromium\575458\chrome-win32/chrome.exe'
    # browser = await launch({'executablePath': exepath, 'headless': False, 'slowMo': 30})
    browser = await launch(
        # headless=False,
        {'headless': False}
    )
    try:
        page = await browser.newPage()
        # Set the page viewport size.
        await page.setViewport(viewport={'width': 1280, 'height': 800})
        # Toggle JavaScript; with enabled=False the page is not rendered by JS.
        await page.setJavaScriptEnabled(enabled=True)
        # Navigation timeout is given in milliseconds (1000 ms here).
        res = await page.goto('https://www.toutiao.com/', options={'timeout': 1000})
        resp_headers = res.headers  # response headers
        resp_status = res.status  # response status code

        # Waiting, method 1: a fixed delay.
        await asyncio.sleep(2)
        # Waiting, method 2: poll until the element exists. The short sleep
        # keeps the loop from hammering the browser connection.
        while not await page.querySelector('.t'):
            await asyncio.sleep(0.1)

        # Scroll to the bottom of the page.
        await page.evaluate('window.scrollBy(0, document.body.scrollHeight)')
        await asyncio.sleep(2)
        # Save a screenshot.
        await page.screenshot({'path': 'toutiao.png'})
        # Print the page cookies.
        print(await page.cookies())

        # Print the full HTML of the page.
        print(await page.content())

        # Run a JS *function* in the page (force_expr=False). The newlines
        # inside the string matter: each `//` comment ends at its line break.
        dimensions = await page.evaluate(pageFunction='''() => {
            return {
                width: document.documentElement.clientWidth,  //頁面寬度
                height: document.documentElement.clientHeight,  //頁面高度
                deviceScaleFactor: window.devicePixelRatio,  //像素比 1.0000000149011612
            }
        }''', force_expr=False)
        print(dimensions)

        # Run a JS *expression* (force_expr=True) to get only the text.
        content = await page.evaluate(pageFunction='document.body.textContent', force_expr=True)
        print(content)

        # Print the current page title.
        print(await page.title())

        # Pyppeteer offers three ways to locate elements:
        #   Page.querySelector()     CSS selector, first match
        #   Page.querySelectorAll()  CSS selector, all matches
        #   Page.xpath()             XPath expression
        # Shorthand names: Page.J(), Page.JJ() and Page.Jx()
        element = await page.querySelector(".feed-infinite-wrapper > ul>li")  # first match only
        print(element)
        # Read the element's text by evaluating JS with the handle as argument.
        content = await page.evaluate('(element) => element.textContent', element)
        print(content)

        # elements = await page.xpath('//div[@class="title-box"]/a')
        elements = await page.querySelectorAll(".title-box a")
        for item in elements:
            # getProperty returns a JSHandle, e.g.
            # <pyppeteer.execution_context.JSHandle object at 0x000002220E7FE518>
            print(await item.getProperty('textContent'))
            # jsonValue() converts the handle into a Python value.
            title_str = await (await item.getProperty('textContent')).jsonValue()
            title_link = await (await item.getProperty('href')).jsonValue()
            print(title_str)
            print(title_link)
    finally:
        # Always close the browser so no Chromium process is left behind.
        await browser.close()


asyncio.get_event_loop().run_until_complete(main())


# --- Second example: fetch several pages concurrently ----------------------
import asyncio
import pyppeteer
from collections import namedtuple

Response = namedtuple("rs", "title url html cookies headers history status")


async def get_html(url):
    """Load *url* in a headless browser, print what was fetched, return the title.

    Args:
        url: address of the page to open.

    Returns:
        The page title as reported by ``page.title()``.
    """
    browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        res = await page.goto(url, options={'timeout': 3000})
        data = await page.content()
        title = await page.title()
        resp_cookies = await page.cookies()  # cookies
        resp_headers = res.headers  # response headers
        resp_status = res.status  # response status code
        print(data)
        print(title)
        print(resp_headers)
        print(resp_status)
        return title
    finally:
        # One browser is launched per call; close it so it does not leak.
        await browser.close()


if __name__ == '__main__':
    url_list = [
        "https://www.toutiao.com/",
        "http://jandan.net/ooxx/page-8#comments",
        "https://www.12306.cn/index/",
    ]
    task = [get_html(url) for url in url_list]

    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*task))
    for res in results:
        print(res)

# Sample of the dict printed for `res.headers`.
headers = {
    'date': 'Sun, 28 Apr 2019 06:50:20 GMT',
    'server': 'Cmcc',
    'x-frame-options': 'SAMEORIGIN\nSAMEORIGIN',
    'last-modified': 'Fri, 26 Apr 2019 09:58:09 GMT',
    'accept-ranges': 'bytes',
    'cache-control': 'max-age=43200',
    'expires': 'Sun, 28 Apr 2019 18:50:20 GMT',
    'vary': 'Accept-Encoding,User-Agent',
    'content-encoding': 'gzip',
    'content-length': '19823',
    'content-type': 'text/html',
    'connection': 'Keep-alive',
    'via': '1.1 ID-0314217270751344 uproxy-17',
}

模擬輸入

# 模擬輸入賬號密碼 {'delay': rand_int()} 為輸入時間 await page.type('#TPL_username_1', "sadfasdfasdf") await page.type('#TPL_password_1', "123456789", )await page.waitFor(1000) await page.click("#J_SubmitStatic")

使用 tkinter 獲取屏幕高度寬度

def screen_size():
    """Return the screen size as a ``(width, height)`` tuple, read via tkinter.

    A temporary Tk root is created only to query the screen and is destroyed
    before returning.
    """
    import tkinter

    root = tkinter.Tk()
    try:
        return root.winfo_screenwidth(), root.winfo_screenheight()
    finally:
        # quit() only stops a running mainloop; destroy() actually tears the
        # root window down so it is not left behind.
        root.destroy()

爬取京東商城

import requests
from bs4 import BeautifulSoup
from pyppeteer import launch
import asyncio


def screen_size():
    """Return the screen size as a ``(width, height)`` tuple, read via tkinter."""
    import tkinter

    tk = tkinter.Tk()
    width = tk.winfo_screenwidth()
    height = tk.winfo_screenheight()
    tk.quit()
    return width, height


async def main(url):
    """Open one JD search-result page and return the products listed on it.

    Args:
        url: fully formatted search URL.

    Returns:
        A list of dicts with the keys ``title``, ``detail_url``,
        ``promo_words``, ``p_commit`` and ``price``.
    """
    # NOTE(review): the snippet was truncated between screen_size() and the
    # setViewport call; the launch/newPage/screen_size lines below are
    # reconstructed from the other examples — confirm the launch options.
    browser = await launch({'headless': False, 'args': ['--no-sandbox']})
    try:
        page = await browser.newPage()
        width, height = screen_size()
        await page.setViewport(viewport={"width": width, "height": height})
        await page.setJavaScriptEnabled(enabled=True)
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299')
        await page.goto(url)
        # await asyncio.sleep(2)
        # Scroll to the bottom, then give the page a moment before reading it.
        await page.evaluate('window.scrollBy(0, document.body.scrollHeight)')
        await asyncio.sleep(1)
        # content = await page.content()
        li_list = await page.xpath('//*[@id="J_goodsList"]/ul/li')
        # print(li_list)

        item_list = []
        for li in li_list:
            a = await li.xpath('.//div[@class="p-img"]/a')
            a_ = await li.xpath('.//div[@class="p-commit"]/strong/a')
            i = await li.xpath('./div/div[3]/strong/i')
            em = await li.xpath('./div/div[4]/a/em')
            if not (a and a_ and i and em):
                # Some <li> entries may lack these nodes (the BeautifulSoup
                # variant at the bottom guards the same case); skip them
                # instead of failing on [0].
                continue

            detail_url = await (await a[0].getProperty("href")).jsonValue()
            promo_words = await (await a[0].getProperty("title")).jsonValue()
            p_commit = await (await a_[0].getProperty("textContent")).jsonValue()
            price = await (await i[0].getProperty("textContent")).jsonValue()
            title = await (await em[0].getProperty("textContent")).jsonValue()
            item = {
                "title": title,
                "detail_url": detail_url,
                "promo_words": promo_words,
                'p_commit': p_commit,
                'price': price,
            }
            item_list.append(item)
            # print(item)
        # print(content)
        return item_list
    finally:
        await page_close(browser)


async def page_close(browser):
    """Close every open page of *browser*, then the browser itself."""
    for _page in await browser.pages():
        await _page.close()
    await browser.close()


msg = "手機"
url = "https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq={}&cid2=653&cid3=655&page={}"

task_list = []
for i in range(1, 6):
    page = i * 2 - 1
    # Format into a new name: rebinding `url` would destroy the template and
    # make every later iteration request the first page again.
    page_url = url.format(msg, msg, page)
    task_list.append(main(page_url))

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*task_list))
# print(results, len(results))
for i in results:
    print(i, len(i))

# Alternative: parse the page HTML with BeautifulSoup instead of xpath.
# soup = BeautifulSoup(content, 'lxml')
# div = soup.find('div', id='J_goodsList')
# for i, li in enumerate(div.find_all('li', class_='gl-item')):
#     if li.select('.p-img a'):
#         print(li.select('.p-img a')[0]['href'], i)
#         print(li.select('.p-price i')[0].get_text(), i)
#         print(li.select('.p-name em')[0].text, i)
#     else:
#         print("#" * 200)
#         print(li)

抓取淘寶網

taobao.py

import asyncio
import time
from pyppeteer.launcher import launch
from alifunc import mouse_slide, input_time_random
from exe_js import js1, js3, js4, js5


def screen_size():
    """Return the screen size as a ``(width, height)`` tuple, read via tkinter."""
    import tkinter

    tk = tkinter.Tk()
    width = tk.winfo_screenwidth()
    height = tk.winfo_screenheight()
    tk.quit()
    return width, height


async def main(username, pwd, url):
    """Log in to Taobao with a visible browser and print the session cookies.

    Args:
        username: account name typed into ``#TPL_username_1``.
        pwd: password typed into ``#TPL_password_1``.
        url: login page URL.
    """
    # All options go in one dict: a keyword `args=` would replace the dict's
    # 'args' entry when pyppeteer merges them, silently dropping --no-sandbox.
    browser = await launch({
        'headless': False,
        'args': ['--no-sandbox', '--window-size=1366,768'],
        'userDataDir': './userdata',
    })
    page = await browser.newPage()
    width, height = screen_size()
    await page.setViewport(viewport={"width": width, "height": height})
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299')

    await page.goto(url)
    # Run the navigator-patching scripts from exe_js in the page.
    await page.evaluate(js1)
    await page.evaluate(js3)
    await page.evaluate(js4)
    await page.evaluate(js5)

    pwd_login = await page.querySelector('.J_Quick2Static')
    # print(await (await pwd_login.getProperty('textContent')).jsonValue())
    await pwd_login.click()

    # Type with a per-key delay (milliseconds).
    await page.type('#TPL_username_1', username, {'delay': input_time_random() - 50})
    await page.type('#TPL_password_1', pwd, {'delay': input_time_random()})

    await page.screenshot({'path': './headless-test-result.png'})
    # asyncio.sleep instead of time.sleep: a blocking sleep would stall the
    # event loop that drives the browser connection.
    await asyncio.sleep(2)

    slider = await page.Jeval('#nocaptcha', 'node => node.style')  # is a slider captcha present?
    if slider:
        print('出現滑塊情況判定')
        await page.screenshot({'path': './headless-login-slide.png'})
        flag = await mouse_slide(page=page)
        if flag:
            print(page.url)
            await page.keyboard.press('Enter')
            await get_cookie(page)
    else:
        await page.keyboard.press('Enter')
        await page.waitFor(20)
        await page.waitForNavigation()
        try:
            error = await page.Jeval('.error', 'node => node.textContent')
        except Exception as e:
            # Jeval failed (e.g. no '.error' element): treat as "no error shown".
            error = None
            print(e, "錯啦")
        if error:
            print('確保賬戶安全重新入輸入')
        else:
            print(page.url)
            # The session now carries the login cookies; further navigation
            # can continue from here.
            # await get_search(page)
            await get_cookie(page)
    await page_close(browser)


async def page_close(browser):
    """Close every open page of *browser*, then the browser itself."""
    for _page in await browser.pages():
        await _page.close()
    await browser.close()


async def get_search(page):
    """Placeholder for crawling search results with the logged-in *page*."""
    # https://s.taobao.com/search?q={query}&p4ppushleft=1%2C48&s={offset: 44 items per page, first page 0, second page 44}&sort=sale-desc
    await asyncio.sleep(5)
    # print(await page.content())


async def get_cookie(page):
    """Print and return the cookies of *page* as a ``name=value;`` string."""
    cookies_list = await page.cookies()
    cookies = ''.join(
        '{0}={1};'.format(cookie.get('name'), cookie.get('value'))
        for cookie in cookies_list
    )
    print(cookies)
    # The string can be stored in a cookie pool and reused to crawl search
    # results without logging in again.
    return cookies


if __name__ == '__main__':
    username = 'username'
    pwd = 'password'
    url = "https://login.taobao.com/member/login.jhtml?spm=a21bo.2017.754894437.1.5af911d9qqVAb1&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F"
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(username, pwd, url))

exe_js.py

# JavaScript snippets evaluated in the page before logging in.

# Make navigator.webdriver report false.
js1 = '''() => {Object.defineProperties(navigator,{webdriver:{get: () => false}})}'''

# Debug helper: show the current value of navigator.webdriver.
js2 = '''() => {alert (window.navigator.webdriver)}'''

# Define window.navigator.chrome. The line breaks are required: without them
# the `// etc.` comment would swallow the closing braces of the function.
js3 = '''() => {
    window.navigator.chrome = {
        runtime: {},
        // etc.
    };
}'''

# Make navigator.languages return a fixed list.
js4 = '''() =>{ Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});}'''

# Make navigator.plugins return a non-empty list.
js5 = '''() =>{ Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5,6],});}'''

alifunc.py

import asyncio
import random
import time

# NOTE: `retrying.retry` was previously applied to mouse_slide, but that
# decorator inspects the *return value of the call*, which for an
# `async def` is a coroutine object and never None — so no retry ever
# happened. The retry loop is now done explicitly inside mouse_slide.


def retry_if_result_none(result):
    """Return True when *result* is None, i.e. the attempt should be retried."""
    return result is None


async def _slide_once(page):
    """Drag the slider once; return 1 if the page reports success, else None."""
    await asyncio.sleep(3)
    try:
        await page.hover('#nc_1_n1z')
        await page.mouse.down()
        # NOTE(review): pyppeteer's Mouse.move documents a `steps` option;
        # confirm that 'delay' has any effect here.
        await page.mouse.move(2000, 0, {'delay': random.randint(1000, 2000)})
        await page.mouse.up()
    except Exception as e:
        print(e, ' :slide login False')
        return None

    await asyncio.sleep(3)
    slider_again = await page.Jeval('.nc-lang-cnt', 'node => node.textContent')
    # NOTE(review): this literal must match the text rendered by the page
    # exactly — verify the characters against the live page.
    if slider_again != '驗證通過':
        return None
    await page.screenshot({'path': './headless-slide-result.png'})
    print('驗證通過')
    return 1


async def mouse_slide(page=None, max_attempts=3):
    """Drag the slider captcha, retrying while an attempt fails.

    Args:
        page: pyppeteer page showing the slider.
        max_attempts: upper bound on the number of drag attempts.

    Returns:
        1 once the page reports the verification passed, or None when every
        attempt failed.
    """
    for _ in range(max_attempts):
        outcome = await _slide_once(page)
        if not retry_if_result_none(outcome):
            return outcome
    return None


def input_time_random():
    """Return a random per-key typing delay between 100 and 151 (inclusive)."""
    return random.randint(100, 151)

利用獲取到的cookie 爬取搜索內容

import json
import requests
import re

# Session cookie string obtained through pyppeteer after logging in.
# Ideally keep a pool of such cookies and pick one at random per request.
cookie = '_tb_token_=edd7e354dee53;t=fed8f4ca1946ca1e73223cfae04bc589;sg=20f;cna=2uJSFdQGmDMCAbfFWXWAC4Jv;cookie2=1db6cd63ad358170ea13319f7a862c33;_l_g_=Ug%3D%3D;v=0;unb=3150916610;skt=49cbfd5e01d1b550;cookie1=BxVRmD3sh19TaAU6lH88bHw5oq%2BgcAGcRe229Hj5DTA%3D;csg=cf45a9e2;uc3=vt3=F8dByEazRMnQZDe%2F9qI%3D&id2=UNGTqfZ61Z3rsA%3D%3D&nk2=oicxO%2BHX4Pg%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D;existShop=MTU1Njg3MDM3MA%3D%3D;tracknick=%5Cu7433150322;lgc=%5Cu7433150322;_cc_=V32FPkk%2Fhw%3D%3D;mt=ci=86_1;dnk=%5Cu7433150322;_nk_=%5Cu7433150322;cookie17=UNGTqfZ61Z3rsA%3D%3D;tg=0;enc=tThHs6Sn3BAl8v1fu3J4tMpgzA1n%2BLzxjib0vDAtGsXJCb4hqQZ7Z9fHIzsN0WghdcKEsoeKz6mBwPUpyzLOZw%3D%3D;JSESSIONID=B3F383B3467EC60F8CA425935232D395;l=bBMspAhrveV5732DBOCanurza77OSIRYYuPzaNbMi_5pm6T_G4QOlC03xF96VjfRswYBqh6Mygv9-etuZ;hng=CN%7Czh-CN%7CCNY%7C156;isg=BLi41Q8PENDal3xUVsA-aPbfiWaKiRzB6vcTu_IpBPOmDVj3mjHsO86vxUQYW9SD;uc1=cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=W5iHLLyFeYZ1WM9hVnmS&cookie15=UIHiLt3xD8xYTw%3D%3D&existShop=false&pas=0&cookie14=UoTZ4ttqLhxJww%3D%3D&tag=8&lng=zh_CN;thw=cn;x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0;swfstore=34617;'
headers = {
    'cookie': cookie,
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}

# No trailing space in the URL: it would be sent as part of the `sort` value.
# The timeout keeps the script from hanging on an unresponsive server.
rep = requests.get('https://s.taobao.com/search?q=手機&p4ppushleft=1%2C48&s=0&sort=sale-desc', headers=headers, timeout=10)
rep.encoding = 'utf-8'
res = rep.text
print(res)

# The result data is embedded in the page as `g_page_config = {...};`
# followed by `g_srp_loadCss`.
r = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
res = r.findall(res)
if not res:
    # Without a match there is nothing to parse; fail with a clear message
    # instead of an IndexError on res[0].
    raise SystemExit('g_page_config not found in the response; the cookie may no longer be valid')

data = res[0].strip().rstrip(';')
dic_data = json.loads(data)
auctions = dic_data.get('mods')['itemlist']['data']['auctions']

# print(auctions,len(auctions))
# Print a single sample item (the first entry is skipped).
for item in auctions[1:]:
    print(item)
    break

針對iframe 的操作

page.frames 可以獲取頁面中所有 iframe 的列表；需要先判斷要操作的是哪一個 iframe，之後對它的操作方式跟操作 page 一樣。

from pyppeteer import launch
import asyncio


async def main(url):
    """Open *url*, switch to the login form inside its second frame, then close.

    Args:
        url: address of the page containing the iframe.

    Errors raised while working with the frame are printed, not propagated.
    The browser and its pages are always closed before returning.
    """
    w = await launch({'headless': False, 'args': ['--no-sandbox'], })
    try:
        page = await w.newPage()
        await page.setViewport({"width": 1366, 'height': 800})
        await page.goto(url)
        try:
            await asyncio.sleep(1)
            # page.frames lists every frame of the page; inspect the printed
            # list to find out which index is the one to operate on.
            frame = page.frames
            print(frame)
            title = await frame[1].title()
            print(title)
            await asyncio.sleep(1)
            # A frame is queried and driven the same way as a page.
            login = await frame[1].querySelector('#switcher_plogin')
            print(login)
            await login.click()

            await asyncio.sleep(20)
        except Exception as e:
            print(e, "EEEEEEEEE")
    finally:
        # Clean up even when newPage()/goto() fail, so no browser is leaked.
        for _page in await w.pages():
            await _page.close()
        await w.close()


asyncio.get_event_loop().run_until_complete(main("https://i.qq.com/?rd=1"))
# asyncio.get_event_loop().run_until_complete(main("https://www.gushici.com/"))

創作挑戰賽新人創作獎勵來咯，堅持創作打卡瓜分現金大獎

總結

以上是生活随笔為你收集整理的Pyppeteer 使用笔记的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： Burpsuite技巧之MD5加密密码爆
下一篇： GO恶意样本实例分析