NLP prerequisites with Python
- Loading corpora
  - Importing the brown corpus
- Tokenization
  - NLTK's word_tokenize
- Stemming and lemmatization
  - Three ways to stem with NLTK
  - Lemmatization with NLTK
- Stop words
- Keyword scoring
- Sentiment analysis
- Text similarity
  - Computing text similarity with frequency counts
  - TF-IDF
Loading corpora
```python
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('brown')
```

Importing the brown corpus
```python
# nltk.corpus holds the bundled corpora; brown is the corpus built at Brown University,
# organized into categories by genre
from nltk.corpus import brown

brown.categories()
len(brown.sents())  # number of sentences
len(brown.words())  # number of words
```
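As a small extension of the snippet above, `words()` and `sents()` also accept a `categories` argument, so the corpus can be sliced by genre (the category name `'news'` below is just one of the values returned by `brown.categories()`):

```python
from nltk.corpus import brown

# restrict the corpus to a single genre
news_words = brown.words(categories='news')
print(news_words[:10])                        # first ten tokens of the 'news' portion
print(len(brown.sents(categories='news')))    # sentence count for that genre
```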
Tokenization

NLTK's word_tokenize
```python
import nltk

sentence = 'hello, world'
tokens = nltk.word_tokenize(sentence)  # tokenize with NLTK's word_tokenize
tokens
# ['hello', ',', 'world']
```
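word_tokenize works on a single string; for raw text containing several sentences you would normally split into sentences first with sent_tokenize. A minimal sketch (the sample text is made up):

```python
import nltk

text = 'NLTK is a toolkit. It ships many corpora and tokenizers.'
sentences = nltk.sent_tokenize(text)                  # split into sentences
tokens = [nltk.word_tokenize(s) for s in sentences]   # then split each sentence into words
print(tokens)
# expected:
# [['NLTK', 'is', 'a', 'toolkit', '.'], ['It', 'ships', 'many', 'corpora', 'and', 'tokenizers', '.']]
```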
Stemming and lemmatization
Three ways to stem with NLTK
```python
# From the output you can see that the Lancaster stemmer is the most aggressive:
# it is fast, but it strips away so much of a word that the stem can be hard to read.

print('Method 1' + '*' * 100)
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')     # 'maximum'
porter_stemmer.stem('presumably')  # 'presum'
porter_stemmer.stem('multiply')    # 'multipli'
porter_stemmer.stem('working')     # 'work'

print('Method 2' + '*' * 100)
from nltk.stem.lancaster import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')     # 'maxim'
lancaster_stemmer.stem('presumably')  # 'presum'
lancaster_stemmer.stem('multiply')    # 'multiply'
lancaster_stemmer.stem('working')     # 'work'

print('Method 3' + '*' * 100)
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')     # 'maximum'
snowball_stemmer.stem('presumably')  # 'presum'
snowball_stemmer.stem('multiply')    # 'multipli'
snowball_stemmer.stem('working')     # 'work'
```
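To compare the three stemmers side by side, a small loop like the following can be handy (the word list is chosen arbitrarily):

```python
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

stemmers = {
    'porter': PorterStemmer(),
    'lancaster': LancasterStemmer(),
    'snowball': SnowballStemmer('english'),
}
words = ['maximum', 'presumably', 'multiply', 'working']

# print one row per word with the stem produced by each algorithm
for word in words:
    stems = {name: stemmer.stem(word) for name, stemmer in stemmers.items()}
    print(word, stems)
```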
Lemmatization with NLTK

```python
# Lemmatization with NLTK (requires the WordNet data: nltk.download('wordnet'))
>>> from nltk.stem import WordNetLemmatizer
>>> wordnet_lemmatizer = WordNetLemmatizer()
>>> wordnet_lemmatizer.lemmatize('dogs')        # 'dog'
>>> wordnet_lemmatizer.lemmatize('churches')    # 'church'
>>> wordnet_lemmatizer.lemmatize('aardwolves')  # 'aardwolf'
>>> wordnet_lemmatizer.lemmatize('abaci')       # 'abacus'
>>> wordnet_lemmatizer.lemmatize('working')     # 'working' -- stripping the -ing suffix is stemming, not lemmatization, so nothing happens here
>>> wordnet_lemmatizer.lemmatize('are')              # 'are'
>>> wordnet_lemmatizer.lemmatize('are', pos='v')     # 'be' -- once the part of speech is given, the verb is reduced to its lemma
```
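As the last two lines show, the lemmatizer only works well when it knows the part of speech. A hedged sketch of feeding it POS tags from nltk.pos_tag; the helper name get_wordnet_pos is my own, and the 'averaged_perceptron_tagger' and 'wordnet' data are assumed to have been downloaded:

```python
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (from nltk.pos_tag) to a WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default: treat everything else as a noun

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize('the dogs are barking loudly')
tagged = nltk.pos_tag(tokens)  # [('the', 'DT'), ('dogs', 'NNS'), ('are', 'VBP'), ...]
lemmas = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged]
print(lemmas)  # expected: ['the', 'dog', 'be', 'bark', 'loudly']
```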
Stop words

```python
import nltk
from nltk.corpus import stopwords

sentence = 'food is my family'
word_list = nltk.word_tokenize(sentence)  # tokenize
# word_list: ['food', 'is', 'my', 'family']

filtered_words = [word for word in word_list if word not in stopwords.words('english')]
filtered_words
# ['food', 'family']
```

Stop-word list websites
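Besides external lists, NLTK itself ships stop-word lists for a number of languages; you can inspect them directly (the exact sizes depend on your NLTK data version):

```python
from nltk.corpus import stopwords

print(stopwords.fileids()[:5])                  # languages with a bundled stop-word list
english_stops = stopwords.words('english')
print(len(english_stops), english_stops[:10])   # size of the English list and a sample
```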
Keyword scoring
dict.get(key, default=None)

- key: the key to look up in the dictionary.
- default: the value to return when the key is not present.
- Return value: the value stored under key, or the default (None) if the key is not in the dictionary.
AFINN-111 is a commonly used keyword list for this kind of scoring: each of its words carries a hand-assigned sentiment score between -5 and +5, so a sentence can be scored by looking every word up with dict.get (with 0 as the default for unknown words) and summing the results.
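A minimal sketch of keyword scoring in this style. The sentiment_dict below is a tiny hand-made stand-in for the real AFINN-111 file (which you would normally load from disk), so the words and scores here are illustrative only:

```python
import nltk

# Tiny stand-in for AFINN-111; the real file maps a few thousand words to scores in [-5, 5].
sentiment_dict = {'good': 3, 'awesome': 4, 'bad': -3, 'terrible': -3}

def sentence_score(sentence):
    """Sum the score of every word; dict.get returns 0 for words not in the list."""
    words = nltk.word_tokenize(sentence.lower())
    return sum(sentiment_dict.get(word, 0) for word in words)

print(sentence_score('this is a good book'))      # 3
print(sentence_score('this is a terrible book'))  # -3
```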
Sentiment analysis
```python
# Sentiment analysis
from nltk.classify import NaiveBayesClassifier  # naive Bayes

# A tiny hand-made training set
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'

def preprocess(s):
    # A neat way to build the feature dict:
    # {'this': True, 'is': True, 'a': True, 'good': True, 'book': True}
    # Later this function could be upgraded, e.g. to word2vec features.
    return {word: True for word in s.lower().split()}

# Put the training set into the standard (features, label) form
training_data = [[preprocess(s1), 'pos'],
                 [preprocess(s2), 'pos'],
                 [preprocess(s3), 'neg'],
                 [preprocess(s4), 'neg']]

# Feed it to the model
model = NaiveBayesClassifier.train(training_data)

# Print the results
print(training_data)
print(model.classify(preprocess('this is a bad book')))  # neg
```

```
[[{'this': True, 'is': True, 'a': True, 'good': True, 'book': True}, 'pos'],
 [{'this': True, 'is': True, 'a': True, 'awesome': True, 'book': True}, 'pos'],
 [{'this': True, 'is': True, 'a': True, 'bad': True, 'book': True}, 'neg'],
 [{'this': True, 'is': True, 'a': True, 'terrible': True, 'book': True}, 'neg']]
```
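Beyond the hard label, the trained classifier can also report per-class probabilities and the features it found most informative. A small follow-up, reusing model and preprocess from the block above:

```python
# Probability distribution over the labels for a new sentence
dist = model.prob_classify(preprocess('this is a terrible book'))
print(dist.max())                  # 'neg'
print(round(dist.prob('neg'), 3))  # probability assigned to 'neg'
print(round(dist.prob('pos'), 3))  # probability assigned to 'pos'

# Which single-word features drive the decision most strongly
model.show_most_informative_features(5)
```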
Text similarity
Computing text similarity with frequency counts
"""功能:用元素頻次表示文本特征,計算文本相似度缺點:用頻次計算,丟失位置特征 """ import nltk from nltk import FreqDist import numpy as np import pandas as pd########### 制作詞庫,返回詞庫中所有單詞的頻次 ################# # 做個詞庫先 corpus = 'this is my sentence ' \'this is my life ' \'this is the day' # corpus # 'this is my sentence this is my life this is the day'# 隨便tokenize一下,這里可以根據需要做任何的preprocessing:stopwords, lemma, stemming, etc. tokens = nltk.word_tokenize(corpus)# NLTK的FreqDist統計一下文字出現的頻率 fdist = FreqDist(tokens) # fdist類似于一個Dict # FreqDist({'this': 3, 'is': 3, 'my': 2, 'sentence': 1, 'life': 1, 'the': 1, 'day': 1})# 帶上某個單詞, 可以看到它在整個文章中出現的次數 # print(fdist['is']) # 3# 好, 此刻, 我們可以把最常用的50個單詞拿出來 standard_freq_vector = fdist.most_common(50) # 返回頻次前50的列表,單詞和頻次呈元祖格式 # [('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)] size = len(standard_freq_vector) # 7, 詞庫有7個def position_lookup(v):""":param v: 列表,里面是元祖格式的單詞和他對應的頻次[('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]:return: loc: v中所有單詞和對應的位置fre: v中所有單詞的頻次"""loc = {}fre = []counter = 0for word in v: # word遍歷v ('this', 3)loc[word[0]] = counterfre.append(word[1])counter += 1return loc, fre# 把標準的單詞位置記錄下來 loc, fre = position_lookup(standard_freq_vector) # loc: {'this': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6} # fre: [3, 3, 2, 1, 1, 1, 1]# 將詞對應的位置和頻次,輸出pd格式 standard_vector = [key for key, value in loc.items()] df = pd.DataFrame({'詞庫': np.array(standard_vector), '詞庫頻次': fre}) print(df)################## 三個sentence,從詞庫中找sentence所有單詞出現的頻次 ######################## # 如果我們有個新句句?子: sentence1 = 'this is my life ' sentence2 = 'this is my sentence ' sentence3 = 'life my is this' sentence = [sentence1, sentence2, sentence3]def vec(sen_tok, loc):# 先新建一個跟我們的標準vector同樣?大?小的向量量freq_vector = [0] * sizefor word in sen_tok:try:# 如果在我們的詞庫?里里出現過,在"標準位置"上+1freq_vector[loc[word]] += 1except KeyError:# 如果是個新詞,就pass掉continue# print(freq_vector)return freq_vectortokens = [nltk.word_tokenize(i) for i in sentence] # 將三個句子分詞 # [['this', 'is', 'my', 'life'], ['this', 'is', 'my', 'sentence'], ['life', 'my', 'is', 'this']]sent_fre = [vec(i, loc) for i in tokens] # 分別計算三個句子中單詞在詞庫中出現的頻次,如果是新詞pass,所以要求詞庫要全面 # [[1, 1, 1, 0, 1, 0, 0], [1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 1, 0, 0]]# pd格式 df['sen1_頻次'] = sent_fre[0] df['sen2_頻次'] = sent_fre[1] df['sen3_頻次'] = sent_fre[2] print(df) ############### 按照頻次,依據余弦定理計算sen1與sen2,sen1與sen3的相似度 #################### # 余弦值越大,證明夾角越小,兩個向量越相似 # 分母計算模時,剛好是2范數, # 引入np.linalg.norm(表達式,ord = 2) sen1_sen2_simi = (np.sum(df['sen1_頻次']*df['sen2_頻次']))\/(np.linalg.norm(df['sen1_頻次'], ord=2) * np.linalg.norm(df['sen2_頻次'], ord=2))sen1_sen3_simi = (np.sum(df['sen1_頻次']*df['sen3_頻次']))\/(np.linalg.norm(df['sen1_頻次'], ord=2) * np.linalg.norm(df['sen3_頻次'], ord=2))print('sen1與sen2的相似度', sen1_sen2_simi) print('sen1與sen3的相似度', sen1_sen3_simi)# 可以看出雖然sen1與sen3風馬牛不相及,但相似度達到最大,只因為是按照頻次計算相似度。TF-IDF
TF-IDF

```python
# TF-IDF with NLTK
# Number of documents: 3
import nltk
from nltk.text import TextCollection

# The three documents
sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
# Tokenize
sents = [nltk.word_tokenize(sent) for sent in sents]
# Put them into a TextCollection
corpus = TextCollection(sents)

# Compute idf and check the formula
corpus.idf('this')   # np.log(3/3) = log(3 documents / 3 documents containing 'this') = 0
corpus.idf('three')  # np.log(3/1) = 1.0986122886681098

# Compute tf and tf-idf
corpus.tf('three', nltk.word_tokenize('one two three, go'))      # 1/5
corpus.tf_idf('three', nltk.word_tokenize('one two three, go'))  # 1/5 * 1.0986122886681098 = 0.21972245773362198

# For a new sentence
new_sentence = 'is three, go'

# Go through every word of new_sentence:
for word in nltk.word_tokenize(new_sentence):
    print(word, ':', 'TF-IDF', corpus.tf_idf(word, nltk.word_tokenize(new_sentence)))
# Because 'is' appears in all three documents, its weight in the new sentence is 0.
```
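In practice TF-IDF features are often built with scikit-learn instead. A hedged sketch; note that TfidfVectorizer uses a smoothed idf, log((1+n)/(1+df)) + 1, and L2-normalizes each row by default, so its numbers will not match the NLTK values above:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['this is sentence one', 'this is sentence two', 'this is sentence three']

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)  # one L2-normalized TF-IDF row per document

# Inspect the weights of the first document, term by term
for term, idx in sorted(vectorizer.vocabulary_.items()):
    print(term, round(tfidf[0, idx], 3))
```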