當前位置：首頁 > 编程语言 > python >内容正文

python

python爬取国内代理ip_python爬虫实战：爬取西刺代理的代理ip（二）

發布時間：2023/12/10 python 27 豆豆

生活随笔收集整理的這篇文章主要介紹了《python爬取国内代理ip_python爬虫实战：爬取西刺代理的代理ip（二）》，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

爬蟲實戰(二)：爬取西刺代理的代理ip

對于剛入門的同學來說，本次實戰稍微有點難度，但是簡單的爬取圖片、文本之類的又沒營養，所以這次我選擇了爬取西刺代理的ip地址，爬取的代理ip也能在以后的學習中用到

本次實戰用到的知識點比較多，主要包括：

requests.Session()自動保存cookie

利用抓包工具獲取cookie；

BeautifulSoup和xpath匹配html文檔中的標簽

subprocess測試ip并獲取運行時間及匹配的丟包數

代碼如下：

"""

案例名稱：學習使用ip代理池

需求：從網上找一個代理ip的網站，然后獲取網站上的

100個ip，組成代理ip池，然后隨機抽取其中一個ip，

并對該ip進行連通性測試，如果該ip可以，我們可以將

該ip作為代理ip來使用

思路：

1，先獲取西刺代理網站上的ip(100)

2, 隨機抽取其中一個ip，并檢測其連通性

3，如果該ip可用，則可以作為代理ip使用

編碼：

測試：

"""

import requests

from bs4 import BeautifulSoup

from lxml import etree

import subprocess as sp

import random

import re

def get_proxys(page):
    """Fetch one listing page of the proxy site and extract its proxies.

    Args:
        page: Page number; it is formatted into the listing URL with ``%d``.

    Returns:
        A list of strings of the form ``"protocol#ip#port"`` (protocol
        lower-cased), one per table row that could be parsed. The list is
        empty when the page has no table with ``id="ip_list"``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # URL of the listing page to scrape.
    target_url = 'http://www.xicidaili.com/nn/%d' % page
    target_headers = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Referer': 'http://www.xicidaili.com/nn/',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }

    # A Session stores cookies automatically, so they need no manual
    # handling; the with-block releases its connections when done.
    with requests.Session() as session:
        # Without a timeout a stalled server would block the script forever.
        target_response = session.get(url=target_url,
                                      headers=target_headers,
                                      timeout=10)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        target_response.raise_for_status()
        target_response.encoding = 'utf-8'
        target_html = target_response.text

    # Locate the proxy table directly instead of re-parsing its string form.
    table = BeautifulSoup(target_html, 'lxml').find('table', id='ip_list')
    if table is None:
        return []

    proxys_list = []
    # The first row is always skipped, as the original code did
    # (presumably it is the table header).
    for row in table.find_all('tr')[1:]:
        dom = etree.HTML(str(row))
        # Columns 2, 3 and 6 are read as ip, port and protocol.
        ip = dom.xpath('//td[2]')
        port = dom.xpath('//td[3]')
        protocol = dom.xpath('//td[6]')
        # Skip rows that lack the expected cells rather than crashing.
        if not (ip and port and protocol):
            continue
        fields = (protocol[0].text, ip[0].text, port[0].text)
        # ``.text`` is None for a cell with no leading text.
        if not all(fields):
            continue
        proxys_list.append(fields[0].lower() + "#" + fields[1] + "#" + fields[2])
    return proxys_list

def check_ip(ip, lose_time, waste_time):
    """Ping a proxy address and report its average round-trip time.

    Args:
        ip: IP address of the proxy to test. Anything that is not a literal
            IPv4/IPv6 address is rejected without running a command.
        lose_time: Compiled pattern whose first group captures the number
            of lost packets in the ping output.
        waste_time: Compiled pattern whose first group captures the average
            round-trip time (in ms) in the ping output.

    Returns:
        The average time as an int, or the sentinel 1000 when the address
        is invalid, ``ping`` cannot be run, more than 2 packets were lost,
        or no average could be found in the output.
    """
    import ipaddress

    host = str(ip).strip()
    try:
        # The address comes from a scraped web page, so validate it before
        # it gets anywhere near a command line (blocks shell/option injection).
        ipaddress.ip_address(host)
    except ValueError:
        return 1000

    # Windows ping syntax: -n is the echo count, -w the per-reply timeout.
    # NOTE(review): on Windows -w is in milliseconds, so 3 looks very short
    # - confirm whether 3 seconds (3000) was intended.
    cmd = ["ping", "-n", "3", "-w", "3", host]
    try:
        # Argument list + no shell: the host is never interpreted by a shell.
        # run() waits for the process and closes its pipes.
        completed = sp.run(cmd,
                           stdin=sp.DEVNULL,
                           stdout=sp.PIPE,
                           stderr=sp.DEVNULL)
    except OSError:
        # ping is missing or not executable: treat the proxy as unusable.
        return 1000

    # Decode the console output; undecodable bytes must not abort the check.
    out = completed.stdout.decode('GBK', errors='replace')

    lost_matches = lose_time.findall(out)
    # No loss figure in the output counts as all 3 packets lost.
    lose = int(lost_matches[0]) if lost_matches else 3
    if lose > 2:
        return 1000

    average = waste_time.findall(out)
    if not average:
        return 1000
    return int(average[0])

def initpattern():
    """Compile the regular expressions used to parse ``ping`` output.

    Returns:
        A tuple ``(lose_time, waste_time)``:
        lose_time -- pattern whose group 1 is the number of lost packets.
        waste_time -- pattern whose group 1 is the average time in ms.
    """
    # Raw strings keep ``\d`` from being read as an invalid string escape.
    # The Traditional form is kept from the original; the Simplified form
    # (expected from a zh-CN Windows console - verify on the target machine)
    # and the English label are accepted as well.
    lose_time = re.compile(r"(?:丟失|丢失|Lost) = (\d+)", re.IGNORECASE)
    # Average round-trip time, e.g. "平均 = 35ms" / "Average = 35ms".
    waste_time = re.compile(r"(?:平均|Average) = (\d+)ms", re.IGNORECASE)
    return lose_time, waste_time

if __name__ == '__main__':
    # Build the patterns used to parse ping output.
    lose_time, waste_time = initpattern()
    # Scrape the first listing page for candidate proxies.
    proxys_list = get_proxys(1)

    split_proxy = None
    # Try random candidates until one answers within 200 ms. Looping on the
    # pool itself (not ``while True``) stops cleanly once every candidate
    # has been rejected, instead of crashing in random.choice([]).
    while proxys_list:
        proxy = random.choice(proxys_list)
        # Rejected or accepted, the candidate leaves the pool.
        proxys_list.remove(proxy)
        # Entries look like "protocol#ip#port".
        candidate = proxy.split('#')
        average_time = check_ip(candidate[1], lose_time, waste_time)
        if average_time <= 200:
            split_proxy = candidate
            break
        print("ip鏈接超時，重新獲取中...")

    if split_proxy is None:
        raise SystemExit("沒有可用的代理ip")

    # Mapping in the shape requests expects for ``proxies``: {scheme: "ip:port"}.
    proxys_dict = {split_proxy[0]: split_proxy[1] + ":" + split_proxy[2]}
    print("使用代理:", proxys_dict)

今天的代碼有點難以理解，但是只要按照代碼的步驟及規範一步步去讀，理解起來并不難。小伙伴們加油，我自己也加油！

總結

以上是生活随笔為你收集整理的python爬取国内代理ip_python爬虫实战：爬取西刺代理的代理ip（二）的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：多个圆点，鼠标选取两个，求两个点的距离，
下一篇：还不会财务管理分析？Python爬取全网