nlp 停用词处理java_NLP入门:文本预处理(一)停用词
文本預處理——去停用詞
停用詞文本可以從https://pan.baidu.com/s/1q21hIK95QU9qDstptd8V8g 自提,不謝
該停用詞文本轉自 https://blog.csdn.net/FontThrone/article/details/74200026 ,自己還未創建新的停用詞,後續更新。。。。
# -*- coding: utf-8 -*-
import sys
# 獲取停用詞的List
def GetListOfStopWords(filepath):
    """Read a stop-word file and return its content as a list of lines.

    Args:
        filepath: Path of a UTF-8 encoded text file, expected to hold one
            stop word per line.

    Returns:
        list[str]: The file content split on the newline character. A file
        that ends with a newline therefore yields a trailing empty string,
        and an empty file yields a list holding one empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filepath, encoding='utf-8') as f_stop:
        f_stop_text = f_stop.read()
    return f_stop_text.split('\n')
# 保存List
# def SaveFile(list, filename):
# f_stop = open(filename, 'w', encoding='utf-8')
# for item in range(len(list)):
# if item != len(list):
# f_stop.writelines((list[item].encode('utf-8')) + '\n')
# else:
# f_stop.writelines(list[item].encode('utf-8'))
# f_stop.close()
# 求List并集
def GetListUnion(listName):
    """Merge the stop words of several files into one de-duplicated list.

    Args:
        listName: Iterable of file paths; each one is loaded with
            GetListOfStopWords.

    Returns:
        list[str]: Every distinct entry found in the files plus the seed
        entry '!', sorted so the result is the same on every run.
    """
    # '!' is always part of the result, even when no file is given.
    union = {'!'}
    for path in listName:
        union.update(GetListOfStopWords(path))
    # A bare list(set(...)) would change order between interpreter runs.
    return sorted(union)
def GetStopWords(listOfFileName, FileName='CNstopwords.txt', keynumber=1):
    """Build the merged stop-word list from the given files plus default tables.

    Args:
        listOfFileName: Paths of the stop-word files to merge. The list is
            copied, so the caller's list is left unchanged.
        FileName: Not used by this function; kept so existing calls keep
            working (it is only referenced by a disabled SaveFile call).
        keynumber: Selects which default tables under ``stop_dir`` are added.
            It is a sum of the flags 1 (Chinese table), 2 (English table)
            and 4 (mixed Chinese/English table), so valid values are 1..7.
            Any other value prints a warning and is treated as 1.

    Returns:
        The merged list produced by GetListUnion for the combined paths.
    """
    stopwords_pathCN = stop_dir + 'CNstopwords.txt'      # default Chinese table, flag 1
    stopwords_pathEN = stop_dir + 'ENstopwords.txt'      # default English table, flag 2
    stopwords_pathCNEN = stop_dir + 'CNENstopwords.txt'  # default mixed table, flag 4

    if not (isinstance(keynumber, int) and 1 <= keynumber <= 7):
        # Fall back to exactly what the message announces: the Chinese table.
        print('The keynumber is wrong,change keynumber to 1 ')
        keynumber = 1

    # Work on a copy so repeated calls do not keep growing the caller's list.
    paths = list(listOfFileName)
    if keynumber & 1:
        paths.append(stopwords_pathCN)
    if keynumber & 2:
        paths.append(stopwords_pathEN)
    if keynumber & 4:
        paths.append(stopwords_pathCNEN)

    return GetListUnion(paths)
# SaveFile(ListUnion, FileName)
# Directory holding the stop-word files; GetStopWords reads this global too.
stop_dir = "./stopwords/"

# Chinese stop-word tables to merge.
stopwords_path1 = stop_dir + 'stopwords1893.txt'
stopwords_path2 = stop_dir + 'stopwords1229.txt'
stopwords_path3 = stop_dir + 'stopwordshagongdakuozhan.txt'
stopwords_path4 = stop_dir + 'stop_words_zh.txt'

# English stop-word tables to merge.
stopwords_path5 = stop_dir + 'stop_words_eng.txt'
stopwords_path6 = stop_dir + 'ENstopwords891.txt'

# Chinese tables first, then the English ones.
listOfFileName = [
    stopwords_path1,
    stopwords_path2,
    stopwords_path3,
    stopwords_path4,
    stopwords_path5,
    stopwords_path6,
]

# keynumber=2 additionally pulls in the default English table.
res = GetStopWords(
    listOfFileName,
    FileName=stop_dir + 'ENstopwords.txt',
    keynumber=2,
)
總結
以上是生活随笔為你收集整理的nlp 停用词处理java_NLP入门:文本预处理(一)停用词的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【OS学习笔记】三十八 保护模式十:中断
- 下一篇: 【OS学习笔记】八 实模式:编写主引导扇