Here is a simple example to demonstrate the logic of self-attention: judging whether a sentence is positive or negative.

import tensorflow as tf
import numpy as np

tf.reset_default_graph()

# word vector dimension
dim = 2
# hidden layer size
hidden = 5
# number of time steps (words per sentence)
step = 3
# number of classes
N = 2

sentences = ["i love mengjun", "i like peipei", "she likes damao",
             "she hates wangda", "wangda is good", "mengjun is bad"]
labels = [1, 1, 1, 0, 1, 0]

words = list(set(" ".join(sentences).split()))
# dictionary size
V = len(words)
word2idx = {w: i for i, w in enumerate(words)}
idx2word = {i: w for i, w in enumerate(words)}

# build the input batch (word indices) and the one-hot target batch
input_batch = []
for sentence in sentences:
    input_batch.append([word2idx[word] for word in sentence.split()])

target_batch = []
for label in labels:
    target_batch.append(np.eye(N)[label])

embedding = tf.Variable(tf.random_normal([V, dim]))
out = tf.Variable(tf.random_normal([hidden * 2, N]))

X = tf.placeholder(tf.int32, [None, step])
X_embedding = tf.nn.embedding_lookup(embedding, X)
Y = tf.placeholder(tf.float32, [None, N])

# define the forward and backward LSTM cells
lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(hidden)
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(hidden)

# bidirectional LSTM
# output: ([batch_size, step, hidden], [batch_size, step, hidden])
# final_state: (fw: (c: [batch_size, hidden], h: [batch_size, hidden]),
#               bw: (c: [batch_size, hidden], h: [batch_size, hidden]))
output, final_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell,
                                                      X_embedding, dtype=tf.float32)

# concatenate the forward and backward outputs: [batch_size, step, hidden*2]
output = tf.concat([output[0], output[1]], 2)

# concatenate c and h of the backward final state along the hidden dimension: [batch_size, hidden*2]
final_hidden_state = tf.concat([final_state[1][0], final_state[1][1]], 1)
# add a third dimension: [batch_size, hidden*2, 1]
final_hidden_state = tf.expand_dims(final_hidden_state, 2)

# attention scores:
# [batch_size, step, hidden*2] x [batch_size, hidden*2, 1] -> squeeze -> [batch_size, step]
attn_weights = tf.squeeze(tf.matmul(output, final_hidden_state), 2)
soft_attn_weights = tf.nn.softmax(attn_weights, 1)

# the outputs of all time steps weighted by their attention weights give the context vector:
# [batch_size, hidden*2, step] x [batch_size, step, 1] = [batch_size, hidden*2, 1]
context = tf.matmul(tf.transpose(output, [0, 2, 1]), tf.expand_dims(soft_attn_weights, 2))
# squeeze([batch_size, hidden*2, 1]) = [batch_size, hidden*2]
context = tf.squeeze(context, 2)

# output logits: [batch_size, hidden*2] x [hidden*2, N] = [batch_size, N]
model = tf.matmul(context, out)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# prediction
hypothesis = tf.nn.softmax(model)
prediction = tf.argmax(hypothesis, 1)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1) % 1000 == 0:
        print('epoch ', '%06d' % (epoch + 1), ' loss ', '%08f' % loss)

test_text = [[word2idx[word] for word in 'she hates wangda'.split()]]
predict = sess.run([prediction], feed_dict={X: test_text})
print('she hates wangda', '-->', predict[0][0])
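To make the shape algebra of the attention step easier to follow, here is a minimal NumPy sketch of the same dot-product attention. The variable names mirror the TensorFlow code above, but the batch size and the random tensors are made up purely for illustration:

import numpy as np

batch_size, step, hidden = 2, 3, 5  # arbitrary illustrative sizes

# stand-ins for the BiLSTM outputs and the final hidden state
output = np.random.randn(batch_size, step, hidden * 2)           # [batch, step, hidden*2]
final_hidden_state = np.random.randn(batch_size, hidden * 2, 1)  # [batch, hidden*2, 1]

# score each time step against the final hidden state (dot product)
attn_weights = np.squeeze(output @ final_hidden_state, 2)        # [batch, step]

# normalize the scores into attention weights (softmax over the step axis)
exp = np.exp(attn_weights - attn_weights.max(axis=1, keepdims=True))
soft_attn_weights = exp / exp.sum(axis=1, keepdims=True)         # [batch, step]

# weighted sum of the outputs -> context vector
context = np.squeeze(
    np.transpose(output, (0, 2, 1)) @ soft_attn_weights[..., None], 2)  # [batch, hidden*2]

print(attn_weights.shape, soft_attn_weights.shape, context.shape)

Running this prints (2, 3), (2, 3) and (2, 10), which matches the [batch_size, step], [batch_size, step] and [batch_size, hidden*2] shapes annotated in the comments of the TensorFlow code.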

Running the script prints:

epoch  001000  loss  0.001645
epoch  002000  loss  0.000279
epoch  003000  loss  0.000106
epoch  004000  loss  0.000052
epoch  005000  loss  0.000029
she hates wangda --> 0
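If you also want to see how much weight the model puts on each word of the test sentence, a small follow-up sketch (reusing the session and the tensors defined above; idx2word is used only for display) could fetch soft_attn_weights:

# fetch the normalized attention weights for the test sentence
weights = sess.run(soft_attn_weights, feed_dict={X: test_text})
for idx, w in zip(test_text[0], weights[0]):
    print(idx2word[idx], round(float(w), 4))

The exact numbers will vary from run to run because the embeddings and LSTM weights are randomly initialized.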