NLP prerequisites with Python
- Loading corpora
  - Importing the brown corpus
- Tokenization
  - NLTK's word_tokenize
- Stemming and lemmatization
  - Three ways to stem with NLTK
  - Lemmatization with NLTK
- Stop words
- Keyword scoring
- Sentiment analysis
- Text similarity
  - Computing text similarity with frequency counts
  - TF-IDF
Loading corpora
```python
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('brown')
```

Importing the brown corpus
```python
# nltk.corpus holds the bundled corpora; brown is the corpus built at Brown University,
# organized into categories by genre
from nltk.corpus import brown

brown.categories()
len(brown.sents())  # number of sentences
len(brown.words())  # number of words
```
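As a small extension of the snippet above, `words()` and `sents()` also accept a `categories` argument, so the corpus can be sliced by genre (the category name `'news'` below is just one of the values returned by `brown.categories()`):

```python
from nltk.corpus import brown

# restrict the corpus to a single genre
news_words = brown.words(categories='news')
print(news_words[:10])                        # first ten tokens of the 'news' portion
print(len(brown.sents(categories='news')))    # sentence count for that genre
```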
Tokenization

NLTK's word_tokenize
```python
import nltk

sentence = 'hello, world'
tokens = nltk.word_tokenize(sentence)  # tokenize with NLTK's word_tokenize
tokens
# ['hello', ',', 'world']
```
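word_tokenize works on a single string; for raw text containing several sentences you would normally split into sentences first with sent_tokenize. A minimal sketch (the sample text is made up):

```python
import nltk

text = 'NLTK is a toolkit. It ships many corpora and tokenizers.'
sentences = nltk.sent_tokenize(text)                  # split into sentences
tokens = [nltk.word_tokenize(s) for s in sentences]   # then split each sentence into words
print(tokens)
# expected:
# [['NLTK', 'is', 'a', 'toolkit', '.'], ['It', 'ships', 'many', 'corpora', 'and', 'tokenizers', '.']]
```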
Stemming and lemmatization
Three ways to stem with NLTK
```python
# From the output you can see that the Lancaster stemmer is the most aggressive:
# it is fast, but it strips away so much of a word that the stem can be hard to read.

print('Method 1' + '*' * 100)
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')     # 'maximum'
porter_stemmer.stem('presumably')  # 'presum'
porter_stemmer.stem('multiply')    # 'multipli'
porter_stemmer.stem('working')     # 'work'

print('Method 2' + '*' * 100)
from nltk.stem.lancaster import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')     # 'maxim'
lancaster_stemmer.stem('presumably')  # 'presum'
lancaster_stemmer.stem('multiply')    # 'multiply'
lancaster_stemmer.stem('working')     # 'work'

print('Method 3' + '*' * 100)
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')     # 'maximum'
snowball_stemmer.stem('presumably')  # 'presum'
snowball_stemmer.stem('multiply')    # 'multipli'
snowball_stemmer.stem('working')     # 'work'
```
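To compare the three stemmers side by side, a small loop like the following can be handy (the word list is chosen arbitrarily):

```python
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

stemmers = {
    'porter': PorterStemmer(),
    'lancaster': LancasterStemmer(),
    'snowball': SnowballStemmer('english'),
}
words = ['maximum', 'presumably', 'multiply', 'working']

# print one row per word with the stem produced by each algorithm
for word in words:
    stems = {name: stemmer.stem(word) for name, stemmer in stemmers.items()}
    print(word, stems)
```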
Lemmatization with NLTK

```python
# Lemmatization with NLTK (requires the WordNet data: nltk.download('wordnet'))
>>> from nltk.stem import WordNetLemmatizer
>>> wordnet_lemmatizer = WordNetLemmatizer()
>>> wordnet_lemmatizer.lemmatize('dogs')        # 'dog'
>>> wordnet_lemmatizer.lemmatize('churches')    # 'church'
>>> wordnet_lemmatizer.lemmatize('aardwolves')  # 'aardwolf'
>>> wordnet_lemmatizer.lemmatize('abaci')       # 'abacus'
>>> wordnet_lemmatizer.lemmatize('working')     # 'working' -- stripping the -ing suffix is stemming, not lemmatization, so nothing happens here
>>> wordnet_lemmatizer.lemmatize('are')              # 'are'
>>> wordnet_lemmatizer.lemmatize('are', pos='v')     # 'be' -- once the part of speech is given, the verb is reduced to its lemma
```
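As the last two lines show, the lemmatizer only works well when it knows the part of speech. A hedged sketch of feeding it POS tags from nltk.pos_tag; the helper name get_wordnet_pos is my own, and the 'averaged_perceptron_tagger' and 'wordnet' data are assumed to have been downloaded:

```python
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (from nltk.pos_tag) to a WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default: treat everything else as a noun

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize('the dogs are barking loudly')
tagged = nltk.pos_tag(tokens)  # [('the', 'DT'), ('dogs', 'NNS'), ('are', 'VBP'), ...]
lemmas = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged]
print(lemmas)  # expected: ['the', 'dog', 'be', 'bark', 'loudly']
```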
Stop words

```python
import nltk
from nltk.corpus import stopwords

sentence = 'food is my family'
word_list = nltk.word_tokenize(sentence)  # tokenize
# word_list: ['food', 'is', 'my', 'family']

filtered_words = [word for word in word_list if word not in stopwords.words('english')]
filtered_words
# ['food', 'family']
```

Stop-word list websites
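Besides external lists, NLTK itself ships stop-word lists for a number of languages; you can inspect them directly (the exact sizes depend on your NLTK data version):

```python
from nltk.corpus import stopwords

print(stopwords.fileids()[:5])                  # languages with a bundled stop-word list
english_stops = stopwords.words('english')
print(len(english_stops), english_stops[:10])   # size of the English list and a sample
```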
Keyword scoring
dict.get(key, default=None)

- key: the key to look up in the dictionary.
- default: the value to return when the key is not present.
- Return value: the value stored under key, or the default (None) if the key is not in the dictionary.
AFINN-111 is a commonly used keyword list for this kind of scoring: each of its words carries a hand-assigned sentiment score between -5 and +5, so a sentence can be scored by looking every word up with dict.get (with 0 as the default for unknown words) and summing the results.
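A minimal sketch of keyword scoring in this style. The sentiment_dict below is a tiny hand-made stand-in for the real AFINN-111 file (which you would normally load from disk), so the words and scores here are illustrative only:

```python
import nltk

# Tiny stand-in for AFINN-111; the real file maps a few thousand words to scores in [-5, 5].
sentiment_dict = {'good': 3, 'awesome': 4, 'bad': -3, 'terrible': -3}

def sentence_score(sentence):
    """Sum the score of every word; dict.get returns 0 for words not in the list."""
    words = nltk.word_tokenize(sentence.lower())
    return sum(sentiment_dict.get(word, 0) for word in words)

print(sentence_score('this is a good book'))      # 3
print(sentence_score('this is a terrible book'))  # -3
```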
Sentiment analysis
```python
# Sentiment analysis
from nltk.classify import NaiveBayesClassifier  # naive Bayes

# A tiny hand-made training set
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'

def preprocess(s):
    # A neat way to build the feature dict:
    # {'this': True, 'is': True, 'a': True, 'good': True, 'book': True}
    # Later this function could be upgraded, e.g. to word2vec features.
    return {word: True for word in s.lower().split()}

# Put the training set into the standard (features, label) form
training_data = [[preprocess(s1), 'pos'],
                 [preprocess(s2), 'pos'],
                 [preprocess(s3), 'neg'],
                 [preprocess(s4), 'neg']]

# Feed it to the model
model = NaiveBayesClassifier.train(training_data)

# Print the results
print(training_data)
print(model.classify(preprocess('this is a bad book')))  # neg
```

```
[[{'this': True, 'is': True, 'a': True, 'good': True, 'book': True}, 'pos'],
 [{'this': True, 'is': True, 'a': True, 'awesome': True, 'book': True}, 'pos'],
 [{'this': True, 'is': True, 'a': True, 'bad': True, 'book': True}, 'neg'],
 [{'this': True, 'is': True, 'a': True, 'terrible': True, 'book': True}, 'neg']]
```
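Beyond the hard label, the trained classifier can also report per-class probabilities and the features it found most informative. A small follow-up, reusing model and preprocess from the block above:

```python
# Probability distribution over the labels for a new sentence
dist = model.prob_classify(preprocess('this is a terrible book'))
print(dist.max())                  # 'neg'
print(round(dist.prob('neg'), 3))  # probability assigned to 'neg'
print(round(dist.prob('pos'), 3))  # probability assigned to 'pos'

# Which single-word features drive the decision most strongly
model.show_most_informative_features(5)
```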
Text similarity
Computing text similarity with frequency counts
"""功能:用元素頻次表示文本特征,計算文本相似度缺點:用頻次計算,丟失位置特征 """ import nltk from nltk import FreqDist import numpy as np import pandas as pd########### 制作詞庫,返回詞庫中所有單詞的頻次 ################# # 做個詞庫先 corpus = 'this is my sentence ' \'this is my life ' \'this is the day' # corpus # 'this is my sentence this is my life this is the day'# 隨便tokenize一下,這里可以根據需要做任何的preprocessing:stopwords, lemma, stemming, etc. tokens = nltk.word_tokenize(corpus)# NLTK的FreqDist統計一下文字出現的頻率 fdist = FreqDist(tokens) # fdist類似于一個Dict # FreqDist({'this': 3, 'is': 3, 'my': 2, 'sentence': 1, 'life': 1, 'the': 1, 'day': 1})# 帶上某個單詞, 可以看到它在整個文章中出現的次數 # print(fdist['is']) # 3# 好, 此刻, 我們可以把最常用的50個單詞拿出來 standard_freq_vector = fdist.most_common(50) # 返回頻次前50的列表,單詞和頻次呈元祖格式 # [('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)] size = len(standard_freq_vector) # 7, 詞庫有7個def position_lookup(v):""":param v: 列表,里面是元祖格式的單詞和他對應的頻次[('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]:return: loc: v中所有單詞和對應的位置fre: v中所有單詞的頻次"""loc = {}fre = []counter = 0for word in v: # word遍歷v ('this', 3)loc[word[0]] = counterfre.append(word[1])counter += 1return loc, fre# 把標準的單詞位置記錄下來 loc, fre = position_lookup(standard_freq_vector) # loc: {'this': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6} # fre: [3, 3, 2, 1, 1, 1, 1]# 將詞對應的位置和頻次,輸出pd格式 standard_vector = [key for key, value in loc.items()] df = pd.DataFrame({'詞庫': np.array(standard_vector), '詞庫頻次': fre}) print(df)################## 三個sentence,從詞庫中找sentence所有單詞出現的頻次 ######################## # 如果我們有個新句句?子: sentence1 = 'this is my life ' sentence2 = 'this is my sentence ' sentence3 = 'life my is this' sentence = [sentence1, sentence2, sentence3]def vec(sen_tok, loc):# 先新建一個跟我們的標準vector同樣?大?小的向量量freq_vector = [0] * sizefor word in sen_tok:try:# 如果在我們的詞庫?里里出現過,在"標準位置"上+1freq_vector[loc[word]] += 1except KeyError:# 如果是個新詞,就pass掉continue# print(freq_vector)return freq_vectortokens = [nltk.word_tokenize(i) for i in sentence] # 將三個句子分詞 # [['this', 'is', 'my', 'life'], ['this', 'is', 'my', 'sentence'], ['life', 'my', 'is', 'this']]sent_fre = [vec(i, loc) for i in tokens] # 分別計算三個句子中單詞在詞庫中出現的頻次,如果是新詞pass,所以要求詞庫要全面 # [[1, 1, 1, 0, 1, 0, 0], [1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 1, 0, 0]]# pd格式 df['sen1_頻次'] = sent_fre[0] df['sen2_頻次'] = sent_fre[1] df['sen3_頻次'] = sent_fre[2] print(df) ############### 按照頻次,依據余弦定理計算sen1與sen2,sen1與sen3的相似度 #################### # 余弦值越大,證明夾角越小,兩個向量越相似 # 分母計算模時,剛好是2范數, # 引入np.linalg.norm(表達式,ord = 2) sen1_sen2_simi = (np.sum(df['sen1_頻次']*df['sen2_頻次']))\/(np.linalg.norm(df['sen1_頻次'], ord=2) * np.linalg.norm(df['sen2_頻次'], ord=2))sen1_sen3_simi = (np.sum(df['sen1_頻次']*df['sen3_頻次']))\/(np.linalg.norm(df['sen1_頻次'], ord=2) * np.linalg.norm(df['sen3_頻次'], ord=2))print('sen1與sen2的相似度', sen1_sen2_simi) print('sen1與sen3的相似度', sen1_sen3_simi)# 可以看出雖然sen1與sen3風馬牛不相及,但相似度達到最大,只因為是按照頻次計算相似度。TF-IDF
TF-IDF

```python
# TF-IDF with NLTK
# Number of documents: 3
import nltk
from nltk.text import TextCollection

# The three documents
sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
# Tokenize
sents = [nltk.word_tokenize(sent) for sent in sents]
# Put them into a TextCollection
corpus = TextCollection(sents)

# Compute idf and check the formula
corpus.idf('this')   # np.log(3/3) = log(3 documents / 3 documents containing 'this') = 0
corpus.idf('three')  # np.log(3/1) = 1.0986122886681098

# Compute tf and tf-idf
corpus.tf('three', nltk.word_tokenize('one two three, go'))      # 1/5
corpus.tf_idf('three', nltk.word_tokenize('one two three, go'))  # 1/5 * 1.0986122886681098 = 0.21972245773362198

# For a new sentence
new_sentence = 'is three, go'

# Go through every word of new_sentence:
for word in nltk.word_tokenize(new_sentence):
    print(word, ':', 'TF-IDF', corpus.tf_idf(word, nltk.word_tokenize(new_sentence)))
# Because 'is' appears in all three documents, its weight in the new sentence is 0.
```
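In practice TF-IDF features are often built with scikit-learn instead. A hedged sketch; note that TfidfVectorizer uses a smoothed idf, log((1+n)/(1+df)) + 1, and L2-normalizes each row by default, so its numbers will not match the NLTK values above:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['this is sentence one', 'this is sentence two', 'this is sentence three']

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)  # one L2-normalized TF-IDF row per document

# Inspect the weights of the first document, term by term
for term, idx in sorted(vectorizer.vocabulary_.items()):
    print(term, round(tfidf[0, idx], 3))
```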