A word embedding represents each token as a dense vector of floating-point numbers. The embedding dimension is a hyperparameter, usually chosen with the vocabulary size in mind, e.g. 100, 256, or 300. The individual values inside each vector are learnable parameters: they are initialized randomly and then updated during training. In the end all text is converted into vectors, so a sentence is represented as a sequence of vectors. To get there, we first map each token to an integer index, and then map that index to a vector.
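A minimal sketch of this token -> index -> vector pipeline (the toy vocabulary and the 100-dimensional embedding size are illustrative assumptions, not part of the files below):

import torch
import torch.nn as nn

# hypothetical toy vocabulary: token -> integer index
vocab = {"PAD": 0, "UNK": 1, "this": 2, "movie": 3, "rocks": 4}
# randomly initialized dense vectors, learned during training
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=100)

tokens = ["this", "movie", "rocks"]
ids = torch.LongTensor([[vocab.get(t, vocab["UNK"]) for t in tokens]])  # shape [1, 3]
vectors = embedding(ids)  # shape [1, 3, 100]: one dense float vector per token
print(vectors.shape)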
1.dataset.py:
from torch.utils.data import DataLoader,Dataset
from lib import ws,max_len
import torch
import os
import re


# Tokenize with a simple regex and strip unwanted symbols
def tokenlize(content):
    content = re.sub("<.*?>", " ", content)  # remove HTML tags
    fileters = ['\.', '\t', '\n', '\x96', '\x97']
    content = re.sub("|".join(fileters), " ", content)
    tokens = [i.strip() for i in content.split()]
    return tokens


class ImbDataset(Dataset):
    def __init__(self, train=True):
        self.train_data_path = r"..."  # path to the training data
        self.test_data_path = r"..."   # path to the test data
        data_path = self.train_data_path if train else self.test_data_path

        # Collect the two sub-folders, pos and neg
        temp_data_path = [os.path.join(data_path, "pos"), os.path.join(data_path, "neg")]
        self.total_file_path = []  # paths of all review files
        for path in temp_data_path:
            file_name_list = os.listdir(path)  # every file name inside pos or neg
            # keep only files ending in .txt
            file_path_list = [os.path.join(path, i) for i in file_name_list if i.endswith(".txt")]
            self.total_file_path.extend(file_path_list)

    def __getitem__(self, index):
        file_path = self.total_file_path[index]
        # the label comes from the parent folder name
        label_str = file_path.split("\\")[-2]
        label = 0 if label_str == "neg" else 1
        # tokenize the file content
        tokens = tokenlize(open(file_path).read())
        return tokens, label

    def __len__(self):
        return len(self.total_file_path)


def collate_fn(batch):
    """
    :param batch: ([tokens,label],[tokens,label],...)
    :return:
    """
    content, label = list(zip(*batch))
    content = [ws.transform(i, max_len=max_len) for i in content]
    # Convert to LongTensor, otherwise embedding() in model.py fails:
    # nn.Embedding only accepts LongTensor input
    content = torch.LongTensor(content)
    label = torch.LongTensor(label)
    return content, label


def get_dataloader(train=True):
    imdb_dataset = ImbDataset(train)
    data_loader = DataLoader(imdb_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    return data_loader
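Assuming the two data paths above have been filled in and lib.py provides ws and max_len, the loader can be exercised with a short check like this (a sketch, e.g. appended at the bottom of dataset.py, not part of the original file):

if __name__ == '__main__':
    loader = get_dataloader(train=True)
    for content, label in loader:
        print(content.shape)  # torch.Size([2, max_len]): a batch of padded index sequences
        print(label)          # tensor of 0/1 sentiment labels
        break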
2.word2sequence.py:
import numpy as np
import os


class word2sequence():
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        # vocabulary dict, seeded with the two special tokens
        self.dict = {self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD}
        self.fited = False
        self.count = {}  # word-frequency counts

    def fit(self, sentence):
        """
        Record a single sentence into the frequency counts
        :param sentence: [word1,word2,word3...]
        :return:
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1  # count word frequency

    def bulid_vocab(self, min=0, max=None, max_features=None):
        """
        Build the vocabulary
        :param min: minimum frequency for a word to be kept
        :param max: maximum frequency for a word to be kept
        :param max_features: number of words to keep
        :return:
        """
        # drop words whose frequency is below min
        if min is not None:
            self.count = {word: value for word, value in self.count.items() if value >= min}
        # drop words whose frequency is above max
        if max is not None:
            self.count = {word: value for word, value in self.count.items() if value < max}
        # limit the number of words kept
        if max_features is not None:
            # sort by frequency and keep the top max_features words
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)  # sorted() returns a list, convert it back to a dict
        for word in self.count:
            # assign each word the next free index; the two special tokens already occupy
            # 0 and 1, so the first new word gets index 2, and so on
            self.dict[word] = len(self.dict)
        # build the reversed dict (index -> word) by swapping keys and values
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def __len__(self):
        return len(self.dict)  # vocabulary size, used as len(ws) by nn.Embedding in model.py

    def transform(self, sentence, max_len=None):
        """
        Turn a sentence into a sequence of indices
        :param sentence: [word1,word2,word3...]
        :param max_len: int, pad or truncate the sentence to this length
        :return:
        """
        if max_len is not None:
            if max_len > len(sentence):  # pad
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
            if max_len < len(sentence):  # truncate
                sentence = sentence[:max_len]
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        """
        Turn a sequence of indices back into words
        :param indices: [1,2,3...]
        :return:
        """
        return [self.inverse_dict.get(idx) for idx in indices]
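A quick sanity check of the class on a toy corpus (illustrative only; the sentences and the min threshold are made up, and it could be run, for example, at the bottom of word2sequence.py):

if __name__ == '__main__':
    ws = word2sequence()
    ws.fit(["i", "love", "this", "movie"])
    ws.fit(["this", "movie", "is", "bad"])
    ws.bulid_vocab(min=0)
    seq = ws.transform(["this", "movie", "rocks"], max_len=5)
    print(seq)                        # e.g. [4, 5, 0, 1, 1]: unknown word -> UNK, padding -> PAD
    print(ws.inverse_transform(seq))  # maps the indices back to words

The next script then fits the vocabulary over the whole IMDB training set and pickles it to ./model/ws.pkl: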
from main import word2sequence
import pickle
import os
from dataset import tokenlize  # reuse the tokenizer defined in dataset.py above
from tqdm import tqdm  # progress bar over the file list

if __name__ == '__main__':
    ws = word2sequence()
    path = r"..."  # path to the training data
    temp_data_path = [os.path.join(path, "pos"), os.path.join(path, "neg")]
    for data_path in temp_data_path:
        file_paths = [os.path.join(data_path, file_name) for file_name in os.listdir(data_path)]
        for file_path in tqdm(file_paths):
            sentence = tokenlize(open(file_path).read())
            ws.fit(sentence)
    ws.bulid_vocab(min=10)
    pickle.dump(ws, open("./model/ws.pkl", "wb"))  # save the fitted vocabulary
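Both dataset.py and model.py do "from lib import ws, max_len", but lib.py itself is not shown in this section. Presumably it just loads the pickled vocabulary and fixes the padding length, along the lines of the following sketch (an assumption, not the original file):

# lib.py -- assumed contents: load the fitted word2sequence object and fix the padding length
import pickle

ws = pickle.load(open("./model/ws.pkl", "rb"))
max_len = 200  # illustrative value; choose the padding/truncation length that fits the data

With ws and max_len available, model.py (referenced in the dataset comments) defines the classifier: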
import torch
import torch.nn as nn
from lib import ws,max_len
import torch.nn.functional as F


class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # two arguments: the vocabulary size and the dimension of each word vector
        self.embedding = nn.Embedding(len(ws), 100)
        # nn.Linear expects 2-D input, so forward() has to flatten with view() first
        self.fc = nn.Linear(max_len * 100, 2)

    def forward(self, input):
        """
        :param input: [batch_size,max_len]
        :return:
        """
        x = self.embedding(input)  # after the embedding lookup the shape is [batch_size,max_len,100]
        x = x.view([-1, max_len * 100])
        out = self.fc(x)
        return F.log_softmax(out, dim=-1)
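A minimal shape check of the model against one batch from the loader (a sketch; it assumes the vocabulary pickle and data paths from the earlier files are in place):

if __name__ == '__main__':
    from dataset import get_dataloader

    model = MyModel()
    for content, label in get_dataloader(train=True):
        output = model(content)
        print(output.shape)           # torch.Size([2, 2]): log-probabilities for 2 classes per sample
        print(output.argmax(dim=-1))  # predicted class per review (untrained, so essentially random)
        break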