當前位置：首頁 > 编程语言 > python >内容正文

python

python 爬虫包_Python爬虫包BeautifulSoup实例（三）

發布時間：2025/3/15 python 27 豆豆

生活随笔收集整理的這篇文章主要介紹了 python 爬虫包_Python爬虫包BeautifulSoup实例（三），小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

一步一步構建一個爬蟲實例，抓取糗事百科的段子

先不用beautifulsoup包來進行解析

第一步，訪問網址并抓取源碼

# -*- coding: utf-8 -*-

# @Author: HaonanWu

# @Date: 2016-12-22 16:16:08

# @Last Modified by: HaonanWu

# @Last Modified time: 2016-12-22 20:17:13

import urllib

import urllib2

import re

import os

if __name__ == '__main__':
    # Step 1: request the listing page and dump its HTML source.
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    # Send a browser-like User-Agent header along with the request.
    headers = {'User-Agent': user_agent}

    try:
        request = urllib2.Request(url=url, headers=headers)
        content = urllib2.urlopen(request).read()
    except urllib2.URLError as err:
        # urllib2.HTTPError is a subclass of URLError, so this single handler
        # covers both failure kinds: report the error and stop.
        print(err)
        exit()

    # The body is read as raw bytes; decode it as UTF-8 before printing.
    print(content.decode('utf-8'))

第二步，利用正則表達式提取信息

首先先觀察源碼中，你需要的內容的位置以及如何識別

然后用正則表達式去識別讀取

注意正則表達式中的 . 默認不能匹配 \n，所以需要設置匹配模式（re.S），讓 . 也能匹配換行符。

# -*- coding: utf-8 -*-

# @Author: HaonanWu

# @Date: 2016-12-22 16:16:08

# @Last Modified by: HaonanWu

# @Last Modified time: 2016-12-22 20:17:13

import urllib

import urllib2

import re

import os

if __name__ == '__main__':
    # Step 2: fetch one listing page and extract the text of every post
    # with a regular expression.
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print(e)
        exit()
    except urllib2.URLError as e:
        print(e)
        exit()

    # Capture the text of the first <span> inside each <div class="content">.
    # re.S makes "." match newlines too, since a post spans several lines.
    # NOTE(review): the HTML tags of this pattern were reconstructed (the
    # listing had lost them); confirm against the live page markup.
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = regex.findall(content)

    # Print each captured post body.
    for item in items:
        print(item)

第三步，修正數據并保存到文件中

# -*- coding: utf-8 -*-

# @Author: HaonanWu

# @Date: 2016-12-22 16:16:08

# @Last Modified by: HaonanWu

# @Last Modified time: 2016-12-22 21:41:32

import urllib

import urllib2

import re

import os

if __name__ == '__main__':
    # Step 3: fetch one listing page, extract every post with a regular
    # expression, tidy the text and save each post to ./qiubai/<n>.txt.
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print(e)
        exit()
    except urllib2.URLError as e:
        print(e)
        exit()

    # Capture the text of the first <span> inside each <div class="content">.
    # re.S makes "." match newlines too, since a post spans several lines.
    # NOTE(review): the HTML tags of this pattern were reconstructed (the
    # listing had lost them); confirm against the live page markup.
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = regex.findall(content)

    # Create the output directory on first use.
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)

    # Files are numbered from 1 in the order the posts appear on the page.
    for count, item in enumerate(items, 1):
        # Tidy the text: drop raw newlines, then turn <br/> tags into newlines.
        # NOTE(review): the '<br/>' literal was reconstructed; confirm the
        # exact tag spelling used by the page.
        item = item.replace('\n', '').replace('<br/>', '\n')
        filepath = path + '/' + str(count) + '.txt'
        # "with" guarantees the file is closed even if the write fails.
        with open(filepath, 'w') as f:
            f.write(item)

第四步，將多個頁面下的內容都抓取下來

# -*- coding: utf-8 -*-

# @Author: HaonanWu

# @Date: 2016-12-22 16:16:08

# @Last Modified by: HaonanWu

# @Last Modified time: 2016-12-22 20:17:13

import urllib

import urllib2

import re

import os

if __name__ == '__main__':
    # Step 4: walk listing pages 1..34, extract every post with a regular
    # expression and save each one to ./qiubai/<n>.txt.
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    # Capture the text of the first <span> inside each <div class="content">.
    # re.S makes "." match newlines too, since a post spans several lines.
    # NOTE(review): the HTML tags of this pattern were reconstructed (the
    # listing had lost them); confirm against the live page markup.
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

    # Running file number; it keeps increasing across pages so that posts
    # from later pages do not overwrite earlier files.
    count = 1
    for cnt in range(1, 35):
        print('第' + str(cnt) + '輪')
        url = 'http://www.qiushibaike.com/textnew/page/' + str(cnt) + '/?s=4941357'

        # Any request failure is printed and ends the whole run.
        try:
            request = urllib2.Request(url=url, headers=headers)
            response = urllib2.urlopen(request)
            content = response.read()
        except urllib2.HTTPError as e:
            print(e)
            exit()
        except urllib2.URLError as e:
            print(e)
            exit()

        # Extract the post bodies from this page.
        items = regex.findall(content)

        # Save each post to its own file.
        for item in items:
            # Tidy the text: drop raw newlines, then turn <br/> tags into
            # newlines.
            # NOTE(review): the '<br/>' literal was reconstructed; confirm
            # the exact tag spelling used by the page.
            item = item.replace('\n', '').replace('<br/>', '\n')
            filepath = path + '/' + str(count) + '.txt'
            # "with" guarantees the file is closed even if the write fails.
            with open(filepath, 'w') as f:
                f.write(item)
            count += 1

    print('完成')

使用BeautifulSoup對源碼進行解析

# -*- coding: utf-8 -*-

# @Author: HaonanWu

# @Date: 2016-12-22 16:16:08

# @Last Modified by: HaonanWu

# @Last Modified time: 2016-12-22 21:34:02

import urllib

import urllib2

import re

import os

from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Same scrape as the regex version, but parsed with BeautifulSoup.
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    # No error handling here: a failed request raises straight out.
    response = urllib2.urlopen(urllib2.Request(url=url, headers=headers))

    # Hand the response object to BeautifulSoup, using the lxml parser.
    soup_packetpage = BeautifulSoup(response, 'lxml')

    # Every <div class="content"> element is one candidate post.
    for item in soup_packetpage.find_all("div", class_="content"):
        try:
            content = item.span.string
        except AttributeError as err:
            # Presumably raised when the div has no <span> child (item.span
            # would then be None) -- print the error and stop.
            print(err)
            exit()

        # Skip entries for which no text was obtained (None or empty).
        if content:
            print(content + "\n")

這是用BeautifulSoup去抓取書本以及其價格的代碼

通過對比，可以看出 bs4 如何讀取標簽以及標簽中的內容

(因為我自己也沒有學到這一部分，目前只能依葫蘆畫瓢地寫)

# -*- coding: utf-8 -*-

# @Author: HaonanWu

# @Date: 2016-12-22 20:37:38

# @Last Modified by: HaonanWu

# @Last Modified time: 2016-12-22 21:27:30

import urllib2

import urllib

import re

from bs4 import BeautifulSoup

# Scrape book titles and their prices from the Packt "all books" page.
url = "https://www.packtpub.com/all"

try:
    html = urllib2.urlopen(url)
except urllib2.HTTPError as err:
    # Only HTTP errors are handled here: print the error and stop.
    print(err)
    exit()

soup_packtpage = BeautifulSoup(html, 'lxml')

# Each <div class="book-block-title"> is looked up as one book title.
all_book_title = soup_packtpage.find_all("div", class_="book-block-title")

# Price text: whitespace, a dollar sign, one whitespace character, then a
# decimal number.
price_regexp = re.compile(u"\s+\$\s\d+\.\d+")

for book_title in all_book_title:
    # AttributeError here means book_title.string had no .strip() --
    # presumably because .string was None; print the error and stop.
    try:
        print("Book's name is " + book_title.string.strip())
    except AttributeError as err:
        print(err)
        exit()

    # Look for the next text node after the title that matches the price
    # pattern.
    book_price = book_title.find_next(text=price_regexp)

    # AttributeError here presumably means no price was found (None result).
    try:
        print("Book's price is " + book_price.strip())
    except AttributeError as err:
        print(err)
        exit()

    # Blank line between books.
    print("")

以上就是本篇文章的全部內容，希望對大家的學習有所幫助，也希望大家多多支持腳本之家。

總結

以上是生活随笔為你收集整理的python 爬虫包_Python爬虫包BeautifulSoup实例（三）的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：通信系统概论_现代通信系统概论第一章
下一篇： websocket python爬虫_p

python

python 爬虫 包_Python爬虫包BeautifulSoup实例（三）

總結

python 爬虫包_Python爬虫包BeautifulSoup实例（三）