Python Course Project
Scraping Douban movie reviews with Python and generating a word cloud
1. Scraping the Page Data
The first step is to fetch the page; in Python this is done with the urllib library. The code is as follows:
from urllib import request
resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
html_data = resp.read().decode('utf-8')
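Note that Douban sometimes rejects requests that do not carry a browser User-Agent header. A minimal sketch of a more robust fetch, assuming an arbitrary desktop User-Agent string:
from urllib import request

url = 'https://movie.douban.com/nowplaying/hangzhou/'
# Sending a User-Agent header makes the request look like an ordinary
# browser visit; the exact string below is an illustrative placeholder.
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
resp = request.urlopen(req)
html_data = resp.read().decode('utf-8')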
The second step is to parse the HTML we obtained and extract the data we need.
In Python we use the BeautifulSoup library to parse the HTML. It is invoked as follows:
BeautifulSoup(html, "html.parser")
The first argument is the HTML to extract data from, and the second specifies the parser; find_all() is then used to read the contents of HTML tags:
from bs4 import BeautifulSoup as bs
soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
Inspecting the page's HTML shows that each list item's data-subject attribute holds the movie's id, while the alt attribute of its img tag holds the movie's title, so we read these two attributes to get the id and name of each film. (Note: the movie id is needed later to open the short-review page, which is why we extract it.) The code is as follows:
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
    nowplaying_list.append(nowplaying_dict)
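A quick sanity check of the result (the ids and titles depend on what is currently showing, so the values in the comments are only illustrative):
# nowplaying_list is a list of dicts, one per movie, e.g.
# [{'id': '1234567', 'name': 'Some Title'}, ...]
print(len(nowplaying_list))
print(nowplaying_list[0])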
2. Data Cleaning
To make the data easier to clean, we join the comments in the list into a single string:
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + (str(eachCommentList[k])).strip()
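The joined string still contains punctuation and other non-Chinese characters. As in the full program below, a regular expression that keeps only runs of Chinese characters (the range \u4e00-\u9fa5) strips everything else:
import re

# Keep only consecutive runs of Chinese characters; punctuation,
# digits, and Latin letters are all discarded.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)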
3. Displaying the Word Cloud
The code is as follows:
import matplotlib.pyplot as plt
%matplotlib inline  # Jupyter-only magic; remove when running as a plain script
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud package
# set the font, background color, and maximum font size
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
# words_stat is the word-frequency table built in the full program below
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
# fit_words() expects a {word: frequency} dict, so the dict built above can be
# passed in directly; no conversion to a list of tuples is needed
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
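When running outside a notebook, the figure also needs an explicit show() call, and hiding the axes gives a cleaner image (both steps appear in the full program below):
plt.axis("off")  # hide the x/y axes around the word cloud
plt.show()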
Appendix: Full Source Code
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings("ignore")
import jieba  # Chinese word segmentation
import numpy  # numerical computing
import codecs  # codecs.open reads files with an explicit encoding, converting to unicode automatically
import re
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from urllib import request
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, ImageColorGenerator  # word cloud package
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
# Parse the "now playing" page and return a list of movies
def getNowPlayingMovie_list():
    resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']  # Douban subject id
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']  # movie title
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
# Fetch one page of short reviews for the given movie id
def getCommentsById(movieId, pageNum):
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20  # Douban paginates reviews 20 per page
    else:
        return False
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    resp = request.urlopen(requrl)
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comment_div_list = soup.find_all('div', class_='comment')
    for item in comment_div_list:
        if item.find_all('p')[0].string is not None:
            eachCommentList.append(item.find_all('p')[0].string)
    return eachCommentList
def main():
    # Fetch the first 10 pages of reviews for the first movie in the list
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)
    # Join the list of comment lists into a single string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()
    # Strip punctuation with a regular expression: keep only Chinese characters
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    # Segment the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    # Remove stop words (quoting=3 disables quote handling entirely)
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                            names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # Count word frequencies (named aggregation replaces the dict form of
    # agg(), which was removed in recent versions of pandas)
    words_stat = words_df.groupby('segment').agg(count=('segment', 'size'))
    words_stat = words_stat.reset_index().sort_values(by=['count'], ascending=False)
    # print(words_stat.head())
    bg_pic = numpy.array(Image.open("alice_mask.png"))
    # Render the word cloud
    wordcloud = WordCloud(
        font_path="simhei.ttf",  # a Chinese font is required to render CJK text
        background_color="white",
        max_font_size=80,
        width=2000,
        height=1800,
        mask=bg_pic,             # shape the cloud with the mask image
        mode="RGBA"
    )
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    # print(word_frequence)
    # fit_words() takes a {word: frequency} dict directly
    wordcloud = wordcloud.fit_words(word_frequence)
    image_colors = ImageColorGenerator(bg_pic)  # build a color function from the mask image
    wordcloud.recolor(color_func=image_colors)  # apply the mask's colors to the cloud
    plt.imshow(wordcloud)  # display the word cloud
    plt.axis("off")
    plt.show()
    wordcloud.to_file('show_Chinese.png')  # save the word cloud image

main()
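To run the full program, three local files are needed in addition to the packages imported above: a stop-word list stopwords.txt (one word per line), the Chinese font file simhei.ttf (required for rendering CJK text), and the mask image alice_mask.png that shapes and colors the cloud. The third-party packages can be installed with, for example:
pip install jieba numpy pandas matplotlib pillow beautifulsoup4 wordcloud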