當前位置：首頁 > 编程语言 > python >内容正文

python

python 携程_python 携程爬虫开发笔记

發布時間：2025/3/15 python 31 豆豆

生活随笔收集整理的這篇文章主要介紹了 python 携程_python 携程爬虫开发笔记，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

前言

最近購買了《Python3 爬蟲、數據清洗與可視化實戰》，剛好適逢暑假，就嘗試從攜程頁面對廣州的周邊游產品進行爬蟲數據捕捉。

因為才學Python不夠一個星期，對Python的命名規範還是不太了解，只能套用之前iOS開發的命名規範，有不足之處請多多指點。

一、前期

1.主要用到的庫

from bs4 import BeautifulSoup

import time

import re #正則表達式

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains #瀏覽器操作

import xlrd

import xlwt

from xlutils.copy import copy

import os

BeautifulSoup：用于對標簽等數據進行定位和抓取

selenium：用于啟動瀏覽器和對頁面進行自動操作

time：暫停等待操作

xlrd、xlwt、xlutils：對數據結果進行Excel讀寫保存操作

2.核心思路

1，跳進出發點的周邊游頁面(廣州)

2，在首頁捕捉推薦的熱門目的地和熱點景點，進行保存

3，針對目的地地點進行遍歷搜索所展示的旅游產品

4，產品數據參數抓取

5，數據保存

6，退出瀏覽器

二、代碼

1.啟動瀏覽器

def setupDriverSetting(url='https://weekend.ctrip.com/around/'):
    """Start a Chrome WebDriver session and load the entry page.

    The new session is stored in the module-level global ``driver``.

    Args:
        url: Address to open once the browser has started. Defaults to the
            desktop "around" landing page that was previously hard-coded.
    """
    global driver

    # Alternative entry points kept for reference:
    #   client-id (cookie) endpoint:
    #     http://m.ctrip.com/restapi/soa2/10290/createclientid?systemcode=09&createtype=3&conte
    #   mobile listing page:
    #     https://m.ctrip.com/webapp/vacations/tour/list?tab=64&kwd=%E7%8F%A0%E6%B5%B7&salecity=32&searchtype=tour&sctiy=32

    # Chrome is the browser in use. To run Firefox with a custom user agent
    # instead, build a webdriver.FirefoxProfile(), set the preference
    # "general.useragent.override" (e.g. to
    # "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0")
    # and pass the profile to webdriver.Firefox().
    driver = webdriver.Chrome()

    driver.get(url)

用webdriver啟動Chrome或者fireFox，并跳進首頁URL

2.選擇出發點城市

def select_StartPlace(startPlace):
    """Pick the departure city from the city selector and open its page.

    Uses the module-level global ``driver``. Every link inside the
    ``CitySelect`` lists is compared with ``startPlace``; the first exact
    text match is opened via its ``href``.

    Args:
        startPlace: Visible text of the city link to select.

    Returns:
        None. If no link matches, the browser stays on the current page.
    """
    # Imported locally so this function does not depend on a new
    # file-level import.
    from selenium.common.exceptions import TimeoutException

    # Click the departure-city view to open the selector.
    driver.find_element(By.XPATH, "//*[@id='CitySelect']").click()

    # Walk every city list inside the selector.
    cityGroups = driver.find_elements(By.XPATH, "//*[@id='CitySelect']/dd/ul")
    for group in cityGroups:
        for cityLink in group.find_elements(By.TAG_NAME, "a"):
            if cityLink.text != startPlace:
                continue

            cityUrl = cityLink.get_attribute('href')
            print("找到目標城市:" + cityUrl)
            driver.get(cityUrl)
            time.sleep(2)
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//*[@id='SearchText']")))
            except TimeoutException:
                print('出發地頁面加載不成功')

            # The navigation above invalidates every element collected from
            # the previous page, so stop instead of continuing the loops.
            return

主要是用find_element_by_xpath尋找目標城市進行選擇篩選，然后跳到城市專頁

3.搜索目的地

def finAllDestinationPage():
    """Collect the recommended destinations shown on the current page.

    Reads the ``side_jmp_dest`` element inside ``J_sub_circum`` through the
    module-level global ``driver`` and parses its inner HTML.

    Returns:
        dict mapping the ``<h4>`` text of each ``<li>`` group to the list of
        ``.string`` values of the ``<a>`` tags in that group.
    """
    # The element id decides the product scope (around-city tours,
    # overseas tours).
    destScope = driver.find_element(By.ID, "J_sub_circum")
    print(destScope.text)

    destPanel = destScope.find_element(By.CLASS_NAME, "side_jmp_dest")
    soup = BeautifulSoup(destPanel.get_attribute('innerHTML'), 'lxml')

    allDestinationListDic = {}
    for group in soup.find_all('li'):
        heading = group.h4
        if heading is None:
            # An <li> without a heading cannot be keyed; skip it rather than
            # fail with AttributeError.
            continue
        allDestinationListDic[heading.string] = [
            anchor.string for anchor in group.find_all('a')
        ]

    return allDestinationListDic

搜索所有可推薦目的地和景點，并用字典保存

4.旅游產品列表頁

def jump_destinationPage(startPlace, destination):
    """Search for a destination and walk through every result page.

    Uses the module-level global ``driver``. The destination is typed into
    the search box, the departure city is re-selected, the total page count
    is read from the pager text, and each product on every page is collected
    with ``collectCurrentPageEachData``.

    Args:
        startPlace: Departure city, passed on to ``reSelect_StartPlace``.
        destination: Text typed into the search box.

    Returns:
        The global ``driver``.
    """
    # Imported locally so this function does not depend on a new
    # file-level import.
    from selenium.common.exceptions import TimeoutException

    # Locate the search bar.
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
    except TimeoutException:
        print('查找不到搜索欄')
    else:
        # Report success only when the wait actually succeeded.
        print('本地頁面加載完畢')

    driver.find_element(By.XPATH, "//input[@id='SearchText']").send_keys(destination)
    print("輸入目的地：" + destination)
    driver.find_element(By.XPATH, "//*[@id='SearchBtn']").click()
    print("點擊搜索按鈕結束")
    time.sleep(2)

    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='js-dpSearcher']")))
    except TimeoutException:
        print('產品列表頁加載不成功')
    else:
        print('產品列表頁加載完畢')

    # Select the departure city once more, in case it was lost.
    reSelect_StartPlace(startPlace)

    # Read the pager text and drop its last character before parsing.
    pageHtml = driver.find_element(By.XPATH, "//*[@id='_sort']/div/span")
    print(pageHtml.text)
    pageNumStr = pageHtml.text[:-1]
    print("獲取的num:" + pageNumStr)

    # The second number in the pager text is used as the total page count.
    pageNumS = re.findall(r'\d+', pageNumStr)
    if len(pageNumS) >= 2:
        pageNum = int(pageNumS[1])
    else:
        # NOTE(review): assumes a pager without two numbers means a single
        # page of results; confirm against the live page.
        print("Could not parse the page count; assuming a single page")
        pageNum = 1
    print(pageNum)

    # NOTE(review): tourProductList is filled but neither returned nor saved
    # by this function; the return value is kept as-is for existing callers.
    tourProductList = []
    for pageIndex in range(pageNum):
        itemList = showCurrentPageAllData()

        # Collect every product on the current page.
        for itemIndex in range(len(itemList)):
            tourProductList.append(collectCurrentPageEachData(itemIndex))

        nextPage = pageIndex + 2
        if nextPage > pageNum:
            # The last page was just scraped; there is no further page to open.
            break

        # Jump to the next page through the page-number input.
        pageInput = driver.find_element(By.XPATH, "//input[@id='ipt_page_txt']")
        pageInput.clear()
        pageInput.send_keys(str(nextPage))
        driver.find_element(By.XPATH, "//*[@id='ipt_page_btn']").click()
        print("點擊下一頁結束->" + str(nextPage) + "頁")
        time.sleep(2)

    return driver

跳進產品頁，并根據標簽，抓取總頁數，在遍歷所有旅游產品后，再跳到下一頁進行循環遍歷

5.產品數據抓取

def collectCurrentPageEachData(itemNum):
    """Parse one product card on the current result page.

    Uses the module-level global ``driver`` to fetch all elements with class
    ``product_box`` and parses the inner HTML of the one at ``itemNum``.

    Args:
        itemNum: Zero-based index into the ``product_box`` elements.

    Returns:
        dict with the keys '名稱' (name), '鏈接' (link), '類型' (type),
        '價格' (price), '供應商' (supplier), '評分' (rating) and '人數'
        (traveller count). Rating and traveller count fall back to ``''``
        when they cannot be found.
    """
    itemList = driver.find_elements(By.CLASS_NAME, "product_box")
    itemHtml = itemList[itemNum].get_attribute('innerHTML')
    item = BeautifulSoup(itemHtml, "lxml")

    # Product name.
    titleNameHtml = item.find('h2', class_='product_title')
    productName = titleNameHtml.get_text()
    print("-------" + productName)

    # Product link: only a protocol-relative href ("//host/path") needs a
    # scheme; anything else is left untouched instead of being truncated.
    productLink = titleNameHtml.a['href']
    if productLink.startswith("//"):
        productLink = "https:" + productLink
    print("link:" + productLink)

    # Product type.
    productTypeStr = item.find('em').get_text()
    print("type:" + productTypeStr)

    # Product price: purely numeric prices are formatted with two decimals.
    priceStr = item.find('span', class_='sr_price').strong.get_text()
    if priceStr.isdigit():
        priceStr = "%.2f" % float(priceStr)
    print("price:" + priceStr)

    # Product supplier: when the title contains the label, the first four
    # characters are dropped.
    productRetailStr = item.find('p', class_='product_retail')['title']
    if "供應商" in productRetailStr:
        productRetailStr = productRetailStr[4:]
    print("retail:" + productRetailStr)

    # Product rating (optional on the card).
    try:
        gradeStr = item.find('p', class_='grade').strong.get_text()
        print("grade:" + gradeStr)
    except AttributeError:
        # find() or .strong returned None: no rating on this card.
        print('查找不到評分')
        gradeStr = ''

    # Traveller count (optional on the card).
    try:
        commentStr = item.find('div', class_='comment').em.get_text()
        commentNum = int(re.findall(r'\d+', commentStr)[0])
        print("comment:", commentNum)
    except (AttributeError, IndexError):
        # Missing element, or no digits in the text.
        print('查找不到出游人數')
        commentNum = ''

    return {
        '名稱': productName,
        '鏈接': productLink,
        '類型': productTypeStr,
        '價格': priceStr,
        '供應商': productRetailStr,
        '評分': gradeStr,
        '人數': commentNum,
    }

在產品頁面上獲取所有可見信息，并返回

6.數據保存

class ExcelFileManager:
    """Helpers for creating an .xls workbook and appending rows to it.

    Both helpers are static: they take no instance state and can be called
    on the class (as before) or on an instance.
    """

    @staticmethod
    def creatExcelFile(fileName, sheetName, headRowList):
        """Create (or reopen) ``<cwd>/<fileName>.xls`` and write a header row.

        Args:
            fileName: Workbook name without the ``.xls`` extension.
            sheetName: Sheet to add; reused if it already exists.
            headRowList: Header cell values written to row 0.
        """
        # The workbook lives in the current working directory.
        filePath = os.path.join(os.getcwd(), fileName + '.xls')

        # Reuse the existing workbook when it can be read, else start fresh.
        try:
            workbook = copy(xlrd.open_workbook(filePath))
        except (OSError, xlrd.XLRDError):
            workbook = xlwt.Workbook()
            print("新建文件")

        # Add the sheet, or fetch it when it is already present.
        try:
            sheet1 = workbook.add_sheet(sheetName, cell_overwrite_ok=True)
        except Exception:
            # xlwt signals a duplicate sheet name with a plain Exception,
            # so nothing narrower can be caught here.
            sheet1 = workbook.get_sheet(sheetName)

        # Header style.
        head_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on',num_format_str='#,##0.00')
        for column, title in enumerate(headRowList):
            sheet1.write(0, column, title, head_style)

        print(filePath)
        workbook.save(filePath)

    @staticmethod
    def addDataToExcelFile(fileName, sheetName, dataList):
        """Append rows below the existing content of a sheet.

        Args:
            fileName: Workbook name without the ``.xls`` extension.
            sheetName: Existing sheet to append to.
            dataList: Iterable of rows. A row is either a sequence of cell
                values or a dict, whose values are written in insertion
                order (the row shape returned by
                ``collectCurrentPageEachData``).
        """
        filePath = os.path.join(os.getcwd(), fileName + '.xls')
        source = xlrd.open_workbook(filePath)

        # Number of rows already present; new rows start right after them.
        firstFreeRow = source.sheet_by_name(sheetName).nrows

        new_File = copy(source)
        sheet = new_File.get_sheet(sheetName)
        try:
            for rowOffset, row in enumerate(dataList):
                cells = row.values() if isinstance(row, dict) else row
                for column, value in enumerate(cells):
                    sheet.write(firstFreeRow + rowOffset, column, value)
        except Exception as e:
            # Best effort: report the problem and still save the rows that
            # were written. xlwt raises plain Exception for many write errors.
            print(e)

        new_File.save(filePath)

Excel文件創建與保存數據。不得不說，Python對Excel的支持不是很友好：xlrd和xlwt僅分別支持讀和寫，不支持在原有Excel文件上增加sheet或添加數據等操作，需要再借助第三方庫xlutils先複製工作簿後再寫入。

三、抓取結果：

1530848043475.jpg

總結

以上是生活随笔為你收集整理的python 携程_python 携程爬虫开发笔记的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： STM32H743+CubeMX-QSP
下一篇： c++ 计算正弦的近似值_一篇文章搞懂正