當前位置：首頁 > 编程语言 > python >内容正文

python

python输入法引擎_Bigram-MLE语言模型和模拟输入法的python实现

發布時間：2023/12/19 python 25 豆豆

生活随笔收集整理的這篇文章主要介紹了 python输入法引擎_Bigram-MLE语言模型和模拟输入法的python实现，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

import re
from collections import Counter

import jsonlines

# Training corpus: metadata.txt
# Generated files: 1. d.jsonl  2. output.jsonl
#                  3. generator_5possible_value.jsonl  4. data.txt

new_ll = []  # Vocabulary: every distinct word, stored once.

# Intermediate table filled by cut() for each processed line. Maps a word to
# the list of words seen right after it (duplicates kept), e.g.
# {the: [boy, boy, girl, apple, ...], boy: [like, like, eat, play, ...], ...}
dict_file = {}

total_list = []  # All words of the corpus, duplicates included.

d = {}  # Word -> frequency, e.g. {read: 13, the: 10, a: 12, book: 15, ...}

frequency_dict = {}  # Scratch dict used while assembling key_value.

key_value = {}  # Dict stored as one line of output.jsonl.

###############################################################################

key_5value = {}  # Scratch dict that wraps value.

word_num = 5  # Number of most likely following words kept per word.

value = {}  # Dict stored as one line of generator_5possible_value.jsonl.

def count_list(list_file):
    """Record the words of ``list_file`` in the vocabulary ``new_ll``.

    Each word that is not yet present in the module-level list ``new_ll`` is
    appended to it, so ``new_ll`` ends up holding every distinct word once.

    Args:
        list_file: Iterable of words (one tokenised sentence).
    """
    # new_ll is only mutated, never rebound, so no ``global`` is required.
    for word in list_file:
        if word not in new_ll:
            new_ll.append(word)

def save_dict(line):
    """Store ``line`` as the only record of the d.jsonl file on drive ``i:``.

    The file is opened in ``'w'`` mode, so any previous content is replaced.

    Args:
        line: The object to write; the caller passes the word-frequency
            dict, e.g. {read: 13, the: 10, a: 12, book: 15, ...}.
    """
    # Raw string: same path as before, without relying on the invalid
    # escape sequence "\d" being passed through.
    with jsonlines.open(r'i:\d.jsonl', mode='w') as writer:
        writer.write(line)

def count_total_list(total_list):
    """Count how often each vocabulary word occurs and persist the result.

    For every word in ``new_ll`` the number of occurrences in ``total_list``
    is stored in the module-level dict ``d`` (e.g. {read: 20, the: 30, ...}),
    and ``d`` is then written out through ``save_dict``.

    Args:
        total_list: All corpus words, duplicates included.
    """
    # One pass over the corpus instead of a full list.count() per word.
    counts = Counter(total_list)
    for word in new_ll:
        d[word] = counts[word]  # Counter yields 0 for absent words, like count().
    # NOTE(review): the original indentation is lost; saving once after the
    # loop gives the same final file because save_dict overwrites it.
    save_dict(d)

def cut(result_list):
    """Split one tokenised sentence into adjacent word pairs.

    For ``the boy like eatting apple`` the pairs are the->boy, boy->like,
    like->eatting, eatting->apple. Each second word is appended to the list
    stored under the first word in the module-level ``dict_file``, which
    therefore accumulates, over all sentences, something like
    {the: [boy, boy, girl, ...], boy: [like, like, eat, ...], ...}.

    Args:
        result_list: List of words of one sentence.
    """
    for current, following in zip(result_list, result_list[1:]):
        dict_file.setdefault(current, []).append(following)

def save_key_value(line):
    """Append ``line`` as one record to the output.jsonl file on drive ``i:``.

    The caller passes the dict built by ``frequency_count``, whose shape is
    {read: {the: 10, a: 12, book: 15}}: the bigrams starting with one word
    together with their frequencies.

    Args:
        line: The object to append as a single JSON line.
    """
    # Raw string: same path as before, without relying on the invalid
    # escape sequence "\o" being passed through.
    with jsonlines.open(r'i:\output.jsonl', mode='a') as writer:
        writer.write(line)

def generator_5possible_value(i, data):
    """Keep the first ``word_num`` followers of word ``i`` and append them to a file.

    The record written has the shape {i: {the: 30, a: 20, book: 15, ...}} and
    is meant for the simulated input-method program.

    NOTE(review): the published listing lost everything between a "<" and a
    ">" in this function. The selection logic and the output path below are
    reconstructed from the surviving tokens (``length``, ``for count in
    range``, ``value``, ``key_5value``, ``else``, ``jsonlines.open``,
    ``mode="a"``, ``writer.write``) and the surrounding comments; confirm
    against the author's original source.

    Args:
        i: The word whose followers are being stored.
        data: Dict follower -> frequency; the caller passes it already sorted
            by descending frequency.
    """
    global value, key_5value
    # Slicing covers both cases: fewer than word_num followers keeps them all.
    for follower in list(data.keys())[:word_num]:
        value[follower] = data[follower]
    key_5value[i] = value
    # NOTE(review): file name taken from the comments; the drive prefix is
    # assumed to match the other output files. TODO confirm.
    with jsonlines.open(r'i:\generator_5possible_value.jsonl', mode="a") as writer:
        writer.write(key_5value)
    # Start from fresh scratch dicts for the next word.
    key_5value = {}
    value = {}

def frequency_count():
    """Build and store the bigram frequency table for every vocabulary word.

    For each word in ``new_ll`` that is a key of ``dict_file``, the followers
    listed in ``dict_file[word]`` are counted and ordered by descending
    frequency. The result is wrapped as {word: {follower: count, ...}}, e.g.
    {read: {the: 10, a: 12, book: 5, ...}}, handed to
    ``generator_5possible_value`` and appended to the output file through
    ``save_key_value``.

    The module-level scratch dicts ``frequency_dict`` and ``key_value`` are
    not needed any more; local values are used instead.
    """
    for word in new_ll:
        # NOTE(review): indentation was lost in the listing; it is assumed
        # that words without followers are skipped entirely. TODO confirm.
        if word not in dict_file:
            continue
        # most_common() sorts by count, descending, and keeps first-seen
        # order among ties, matching a stable sorted(..., reverse=True).
        data = dict(Counter(dict_file[word]).most_common())
        generator_5possible_value(word, data)
        save_key_value({word: data})

def main():
    """Train the bigram model from metadata.txt and write all output files.

    Each input line is reduced to the text after its last "|" separator,
    tokenised into alphanumeric words and wrapped as
    ['BOS', 'the', 'boy', 'is', 'a', 'very', 'great', 'child', 'EOS'].
    The tokens feed ``total_list``, ``count_list`` and ``cut``, and each
    processed sentence is written to data.txt. Afterwards the word
    frequencies and bigram frequencies are computed and stored, and a few
    summary statistics are printed.
    """
    with open(r'i:\metadata.txt', "r", encoding='utf-8') as f, \
            open(r'i:\data.txt', 'w') as out:
        for line in f:
            # Text after the last "|"; a line without "|" is kept whole
            # (the reverse/find/slice version dropped its first character).
            sentence = line.rpartition("|")[2]
            result_list = re.findall('[a-zA-Z0-9]+', sentence)
            result_list.insert(0, "BOS")
            result_list.append("EOS")

            total_list.extend(result_list)  # every word, duplicates included
            count_list(result_list)
            cut(result_list)
            # Separate the tokens with spaces; writelines() on the token
            # list wrote them fused together with nothing in between.
            out.write(" ".join(result_list) + "\n")

    # Both files are closed by the with-statement; no explicit close() needed.
    count_total_list(total_list)  # fills d, e.g. {read: 20, the: 30, ...}
    print("字典"+str(d))
    print(len(d))
    frequency_count()  # writes {read: {the: 10, a: 12, book: 5, ...}} records
    print("字符" + str(new_ll))
    print(len(new_ll))
    print("鍵-值" + str(dict_file))
    print(len(dict_file))
    print(len(total_list))


if __name__ == "__main__":
    main()

創作挑戰賽新人創作獎勵來咯，堅持創作打卡瓜分現金大獎

總結

以上是生活随笔為你收集整理的python输入法引擎_Bigram-MLE语言模型和模拟输入法的python实现的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： dotty编译器语法特性之一交叉类型，联
下一篇： vpython 贞测碰撞_python碰