Scraping novel information with a Python crawler


1. Open the novel's main page (the example below uses a novel I picked at random online) and extract its name, author, and description.

2. Collect the full chapter list; the key piece is each chapter's link address (href). (See the sketch after this list for the basic fetch-and-parse pattern these steps rely on.)

3. Download each chapter's page via its address and parse out the content.

4. Print the parsed content and write it to a file or a database.
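Steps 1-3 all reduce to the same pattern: fetch a page with requests, build a document tree with lxml, and run XPath queries against it. Here is a minimal sketch of that pattern, assuming the dianxs.com page layout used in the full code below:

import requests
from lxml import html

url = 'https://www.dianxs.com/book/64554/'
headers = {'user-agent': 'Mozilla/5.0'}

# Step 1: fetch the novel's main page and build an lxml document tree
response = requests.get(url=url, headers=headers)
tree = html.fromstring(response.text)

# Name and description sit inside <div class="info">
novel = tree.xpath('//div[@class="info"]/h1/text()')
introduce = tree.xpath('//div[@class="info"]/p[@class="introduce"]/text()')

# Step 2: the chapter links; each href is visited in turn in step 3
hrefs = tree.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
print(novel, introduce, len(hrefs))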


The v1 example code is below (for learning and exchange only):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author  : xiaofeng
@Time    : 2018/12/26 11:41
@Desc    : Less interests,More interest.
@Project : python_appliction
@FileName: dianxs.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import requests
import time
from lxml import html
import os

"""
A simple crawler for novel chapter content ('殿行說小說網')
"""


class Dianxs():

    # constructor
    def __init__(self, host, url, headers, path):
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path

    """
    Download and parse the novel's main page: print its metadata
    and collect the chapter list
    """

    def download_page(self):
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            time.sleep(1)  # be polite: one request per second
            address = self.host + href
            self.parse_html(address)

    """
    Parse a chapter page
    @:param address  chapter page URL
    """

    def parse_html(self, address):
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # retry on a bad response; return so the failed response below
            # is not parsed (the original fell through and parsed it anyway)
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) joins the list into one string; list(string) would split it back
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    """
    Write a chapter to the output file
    """

    def write_to_file(self, title, content):
        flag = os.path.exists(self.path)
        if not flag:
            # 'w' creates the file if it does not exist yet
            f = open(self.path, 'w')
            f.close()
        # 'with' closes the file automatically; 'a' appends
        with open(self.path, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    path = 'G:/test/novel.txt'
    app = Dianxs(host, url, headers, path)
    app.download_page()


Console output: (screenshot omitted)

Contents written to the file: (screenshot omitted)
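One rough edge worth calling out: in the original parse_html, a non-200 response triggered an unconditional recursive retry, which both kept parsing the failed response and could recurse without limit if the server kept erroring. The listing above adds a return after the retry; a bounded retry loop is the more robust shape. A minimal sketch (fetch_with_retry is a hypothetical helper, and the retry count and delay are arbitrary choices):

import time

import requests


def fetch_with_retry(url, headers, retries=3, delay=2):
    # Try the request up to `retries` times, sleeping between attempts,
    # and give up with None instead of recursing forever.
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print('attempt', attempt + 1, 'failed:', e)
        time.sleep(delay)
    return None

parse_html could then call this helper and simply skip a chapter when it returns None, rather than stalling the whole crawl.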

v2: starting from the site's home page, crawl depth-first through every category in the navigation bar and every novel listed under each category.

The improved v2 example is as follows:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author  : xiaofeng
@Time    : 2018/12/26 11:41
@Desc    : Less interests,More interest.
@Project : python_appliction
@FileName: dianxs2.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import os

import requests
from lxml import html

"""
A simple crawler for novel chapter content ('殿行說小說網')
"""


class Dianxs():

    # constructor
    def __init__(self, host, url, headers, path):
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path
        self.novel_name = ''

    """
    Crawl the category list from the main site's navigation bar
    """

    def nav_page(self):
        print('------------------殿行說----------------------------')
        response = requests.get(url=self.host, headers=self.headers)
        text = html.fromstring(response.text)
        nav_list = text.xpath('//ul[@class="nav"]/li/a/text()')
        nav_href_list = text.xpath('//ul[@class="nav"]/li/a/@href')
        # drop the first entry (the home link), which is not a category
        nav_list.pop(0)
        nav_href_list.pop(0)
        print(nav_list)
        print(nav_href_list)
        i = 0
        for nav_item in nav_href_list:
            address = self.host + nav_item
            nav_title = nav_list[i]
            self.nav_item(address, nav_title)
            i += 1

    """
    Crawl every novel listed under one category
    """

    def nav_item(self, url, nav_title):
        response = requests.get(url=url, headers=self.headers)
        text = html.fromstring(response.text)
        novel_list = text.xpath('//div[@class="panel new-xs-list w300 w265 fr simple"]/ul/li//span[@class="xs-name"]/a/text()')
        novel_list_href = text.xpath('//div[@class="panel new-xs-list w300 w265 fr simple"]/ul/li//span[@class="xs-name"]/a/@href')
        print('--------------------', nav_title, '-----------------')
        print(novel_list)
        print(novel_list_href)
        print('\n')
        for nov_item in novel_list_href:
            self.url = self.host + nov_item
            self.download_page()

    """
    Download and parse a novel's main page: print its metadata
    and crawl every chapter
    """

    def download_page(self):
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            # time.sleep(1)
            address = self.host + href
            self.novel_name = novel
            self.parse_html(address)

    """
    Parse a chapter page
    @:param address  chapter page URL
    """

    def parse_html(self, address):
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # retry on a bad response; return so the failed response below
            # is not parsed (the original fell through and parsed it anyway)
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) joins the list into one string; list(string) would split it back
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    """
    Write a chapter to its own file under the novel's directory
    """

    def write_to_file(self, title, content):
        file_path = self.path + ''.join(self.novel_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        file_name = file_path + '/' + title + '.txt'
        flag = os.path.exists(file_name)
        if not flag:
            # 'w' creates the file if it does not exist yet
            try:
                f = open(file_name, 'w')
                f.close()
            except Exception as e:
                print(e)
                # chapters that failed here could be collected for later handling
                # todo
        # 'with' closes the file automatically; 'a' appends
        with open(file_name, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    path = 'G:/殿興說/'
    app = Dianxs(host, url, headers, path)
    app.nav_page()
    # app.download_page()
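Since v2 visits every category and the same novel can appear under more than one of them, a visited set keeps the crawler from downloading a book twice. A minimal standalone sketch (crawl_category and crawl_novel are hypothetical stand-ins for nav_item and download_page, and the second book URL is made up for illustration):

visited = set()  # novel URLs that have already been crawled


def crawl_novel(address):
    # stand-in for Dianxs.download_page(); just reports the URL here
    print('crawling', address)


def crawl_category(host, hrefs):
    for href in hrefs:
        address = host + href
        if address in visited:
            continue  # already handled via another category
        visited.add(address)
        crawl_novel(address)


# A book listed in two categories is only fetched once
crawl_category('https://www.dianxs.com', ['/book/64554/', '/book/70001/'])
crawl_category('https://www.dianxs.com', ['/book/64554/'])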
