Understanding the HAN Model
I. The HAN model has two important features. The first is its hierarchy: a word-level layer and a sentence-level layer, matching the structure of a document. The second is its use of attention, which assigns dynamic, content-dependent weights when aggregating.
II. The HAN model works as follows:
First, each word's one-hot representation is mapped to a dense embedding (the embedding layer).
The embeddings then pass through the word-level encoder. Many encoders would work here; the paper chooses a bidirectional GRU, which produces an encoding for every word.
Next, an attention layer computes a weight for each word encoding, and the encodings are combined by a weighted sum. The key point is the query Q of this attention layer, the context vector u_w: it is randomly initialized and does not correspond to any input. V is the word encodings (the hidden states output by the GRU layer), and K is the output of passing V through a one-layer feed-forward network. The formulation from the original paper is given below:
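For the t-th word of sentence i, with BiGRU hidden state h_{it}, the paper's word-level attention is:

u_{it} = \tanh(W_w h_{it} + b_w)
\alpha_{it} = \frac{\exp(u_{it}^\top u_w)}{\sum_t \exp(u_{it}^\top u_w)}
s_i = \sum_t \alpha_{it} h_{it}

Here the one-layer feed-forward network (W_w, b_w with tanh) produces K = u_{it}, the randomly initialized context vector u_w plays the role of Q, and the sentence vector s_i is the attention-weighted sum of the hidden states V = h_{it}.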
This completes the encoding of a single sentence into a sentence vector.
The sentence vectors of a document then form the sentence-level input: they pass through a sentence-level encoder (a bidirectional GRU) and a sentence-level attention layer that is essentially identical to the word-level one (with its own context vector u_s), producing a single document vector.
Finally, a linear transformation turns the document vector into class scores, and a softmax layer outputs the classification probabilities, as shown below (v is the document vector):
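p = \mathrm{softmax}(W_c v + b_c)

Training minimizes the negative log-likelihood of the correct label, L = -\sum_d \log p_{d, j_d}, where j_d is the true class of document d.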
III. Model settings and training:
a. First preprocess the text and tokenize it (splitting each document into sentences and each sentence into words).
b. Train a word2vec model to obtain a word2vec matrix, which is used to initialize the embedding layer of the HAN model (see the sketch after this list). The embedding layer output dimension is 200, the encoder output dimension is 100 (50 for each direction), and the context vector dimension is also 100.
c. The batch size is 64, the momentum is 0.9, and the learning rate is obtained by grid search.
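A minimal sketch of steps a and b, assuming gensim (>= 4.0) and NLTK are installed; the helper name build_embedding_matrix and the vocabulary handling are illustrative, not taken from the linked repository.

# Illustrative: tokenize the corpus, train word2vec, and build a matrix that
# can be used to initialize the HAN embedding layer (hypothetical helper).
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize   # requires nltk 'punkt' data
from gensim.models import Word2Vec

def build_embedding_matrix(documents, embedding_size=200):
    # a. split each document into sentences and each sentence into words
    tokenized = [word_tokenize(sent)
                 for doc in documents
                 for sent in sent_tokenize(doc)]

    # b. train word2vec on the tokenized corpus (gensim >= 4.0 API)
    w2v = Word2Vec(tokenized, vector_size=embedding_size,
                   window=5, min_count=5, workers=4)

    # vocabulary with index 0 reserved for padding / unknown words
    vocab = {'UNKNOWN_TOKEN': 0}
    for word in w2v.wv.index_to_key:
        vocab[word] = len(vocab)

    # matrix used to initialize the embedding layer of the HAN model
    embedding_matrix = np.zeros((len(vocab), embedding_size), dtype=np.float32)
    for word, idx in vocab.items():
        if word in w2v.wv:
            embedding_matrix[idx] = w2v.wv[word]
    return vocab, embedding_matrix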
IV. HAN model definition code:
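The model definition itself is not included in this post. The following is a minimal TensorFlow 1.x sketch written to match the interface the training script below expects (a HAN class exposing input_x, input_y, max_sentence_num, max_sentence_length, batch_size and out); it illustrates the two-level BiGRU-plus-attention structure described above and is not necessarily identical to the code in the linked repository.

#coding=utf-8
import tensorflow as tf
from tensorflow.contrib import rnn, layers

class HAN(object):
    """Word-level BiGRU + attention, then sentence-level BiGRU + attention."""

    def __init__(self, vocab_size, num_classes, embedding_size=200, hidden_size=50):
        self.vocab_size = vocab_size
        self.num_classes = num_classes
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        with tf.name_scope('placeholder'):
            self.max_sentence_num = tf.placeholder(tf.int32, name='max_sentence_num')
            self.max_sentence_length = tf.placeholder(tf.int32, name='max_sentence_length')
            self.batch_size = tf.placeholder(tf.int32, name='batch_size')
            # input_x: [batch, sentences per doc, words per sentence]; input_y: one-hot labels
            self.input_x = tf.placeholder(tf.int32, [None, None, None], name='input_x')
            self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')

        word_embedded = self.word2vec()           # [batch, sent, word, embedding]
        sent_vec = self.sent2vec(word_embedded)   # [batch, sent, 2*hidden]
        doc_vec = self.doc2vec(sent_vec)          # [batch, 2*hidden]
        self.out = self.classifier(doc_vec)       # [batch, num_classes] logits

    def word2vec(self):
        # embedding lookup; this matrix could instead be initialized from the
        # pre-trained word2vec matrix of section III.b
        with tf.name_scope('embedding'):
            embedding_mat = tf.Variable(
                tf.truncated_normal((self.vocab_size, self.embedding_size)))
            return tf.nn.embedding_lookup(embedding_mat, self.input_x)

    def sent2vec(self, word_embedded):
        # fold the sentence axis into the batch so every sentence is encoded independently
        with tf.name_scope('sent2vec'):
            word_embedded = tf.reshape(
                word_embedded, [-1, self.max_sentence_length, self.embedding_size])
            word_encoded = self.BidirectionalGRUEncoder(word_embedded, name='word_encoder')
            sent_vec = self.AttentionLayer(word_encoded, name='word_attention')
            return tf.reshape(sent_vec, [-1, self.max_sentence_num, self.hidden_size * 2])

    def doc2vec(self, sent_vec):
        # same structure at the sentence level, yielding one document vector
        with tf.name_scope('doc2vec'):
            doc_encoded = self.BidirectionalGRUEncoder(sent_vec, name='sent_encoder')
            return self.AttentionLayer(doc_encoded, name='sent_attention')

    def classifier(self, doc_vec):
        # linear transformation to class scores; softmax is applied in the loss
        with tf.name_scope('doc_classification'):
            return layers.fully_connected(doc_vec, self.num_classes, activation_fn=None)

    def BidirectionalGRUEncoder(self, inputs, name):
        # [batch, time, dim] -> [batch, time, 2*hidden_size]
        with tf.variable_scope(name):
            fw_cell = rnn.GRUCell(self.hidden_size)
            bw_cell = rnn.GRUCell(self.hidden_size)
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, inputs, dtype=tf.float32)
            return tf.concat((fw_out, bw_out), 2)

    def AttentionLayer(self, inputs, name):
        # u_context is the randomly initialized context vector (u_w / u_s)
        with tf.variable_scope(name):
            u_context = tf.Variable(
                tf.truncated_normal([self.hidden_size * 2]), name='u_context')
            h = layers.fully_connected(inputs, self.hidden_size * 2,
                                       activation_fn=tf.nn.tanh)
            alpha = tf.nn.softmax(
                tf.reduce_sum(tf.multiply(h, u_context), axis=2, keepdims=True), axis=1)
            return tf.reduce_sum(tf.multiply(inputs, alpha), axis=1)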
V. Training code:
#coding=utf-8
import tensorflow as tf
import time
import os
from data_helper import load_dataset
from HAN_model import HAN

# Data loading params
tf.flags.DEFINE_string("yelp_json_path", 'data/yelp_academic_dataset_review.json', "data directory")
tf.flags.DEFINE_integer("vocab_size", 46960, "vocabulary size")
tf.flags.DEFINE_integer("num_classes", 5, "number of classes")
tf.flags.DEFINE_integer("embedding_size", 200, "Dimensionality of word embedding (default: 200)")
tf.flags.DEFINE_integer("hidden_size", 50, "Dimensionality of GRU hidden layer (default: 50)")
tf.flags.DEFINE_integer("batch_size", 32, "Batch Size (default: 32)")
tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
tf.flags.DEFINE_integer("max_sent_in_doc", 30, "Max number of sentences per document (default: 30)")
tf.flags.DEFINE_integer("max_word_in_sent", 30, "Max number of words per sentence (default: 30)")
tf.flags.DEFINE_integer("evaluate_every", 100, "evaluate every this many batches")
tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
tf.flags.DEFINE_float("grad_clip", 5, "grad clip to prevent gradient explode")

FLAGS = tf.flags.FLAGS

train_x, train_y, dev_x, dev_y = load_dataset(FLAGS.yelp_json_path, FLAGS.max_sent_in_doc, FLAGS.max_word_in_sent)
print("data load finished")

with tf.Session() as sess:
    han = HAN(vocab_size=FLAGS.vocab_size,
              num_classes=FLAGS.num_classes,
              embedding_size=FLAGS.embedding_size,
              hidden_size=FLAGS.hidden_size)

    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=han.input_y,
                                                                      logits=han.out,
                                                                      name='loss'))
    with tf.name_scope('accuracy'):
        predict = tf.argmax(han.out, axis=1, name='predict')
        label = tf.argmax(han.input_y, axis=1, name='label')
        acc = tf.reduce_mean(tf.cast(tf.equal(predict, label), tf.float32))

    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    global_step = tf.Variable(0, trainable=False)
    optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
    # gradient clipping, commonly used with RNNs to prevent exploding gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), FLAGS.grad_clip)
    grads_and_vars = tuple(zip(grads, tvars))
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    for g, v in grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
            grad_summaries.append(grad_hist_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)

    loss_summary = tf.summary.scalar('loss', loss)
    acc_summary = tf.summary.scalar('accuracy', acc)

    train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

    sess.run(tf.global_variables_initializer())

    def train_step(x_batch, y_batch):
        feed_dict = {
            han.input_x: x_batch,
            han.input_y: y_batch,
            han.max_sentence_num: 30,
            han.max_sentence_length: 30,
            han.batch_size: 64
        }
        _, step, summaries, cost, accuracy = sess.run([train_op, global_step, train_summary_op, loss, acc], feed_dict)
        time_str = str(int(time.time()))
        print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))
        train_summary_writer.add_summary(summaries, step)
        return step

    def dev_step(x_batch, y_batch, writer=None):
        feed_dict = {
            han.input_x: x_batch,
            han.input_y: y_batch,
            han.max_sentence_num: 30,
            han.max_sentence_length: 30,
            han.batch_size: 64
        }
        step, summaries, cost, accuracy = sess.run([global_step, dev_summary_op, loss, acc], feed_dict)
        time_str = str(int(time.time()))
        print("++++++++++++++++++dev++++++++++++++{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))
        if writer:
            writer.add_summary(summaries, step)

    for epoch in range(FLAGS.num_epochs):
        print('current epoch %s' % (epoch + 1))
        for i in range(0, 200000, FLAGS.batch_size):
            x = train_x[i:i + FLAGS.batch_size]
            y = train_y[i:i + FLAGS.batch_size]
            step = train_step(x, y)
            if step % FLAGS.evaluate_every == 0:
                dev_step(dev_x, dev_y, dev_summary_writer)

Code source: https://github.com/Irvinglove/HAN-text-classification/blob/master
Summary
HAN encodes a document hierarchically: words are encoded and attention-pooled into sentence vectors, sentences are encoded and attention-pooled into a document vector, and a final linear layer with softmax turns that vector into class probabilities. The attention at both levels assigns dynamic, content-dependent weights to the words and sentences.