Please give me TensorFlow 2.0 code that trains and predicts word vectors, and also plots a scatter plot of the word vectors.
Below is a simple example, implemented with TensorFlow 2.0, that trains word vectors, makes predictions with them, and plots a scatter plot of the word vectors:
```python
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Define the toy corpus
corpus_raw = 'He is the king . The king is royal . She is the royal queen '
corpus_raw = corpus_raw.lower()
# Convert the corpus into a list of words
words = []
for word in corpus_raw.split():
    if word != '.':  # drop the full stops
        words.append(word)
# Build the vocabulary and the word <-> index mappings
vocab = set(words)       # deduplicated vocabulary
vocab_size = len(vocab)  # vocabulary size
word2int = {}
int2word = {}
for i, word in enumerate(vocab):
    word2int[word] = i
    int2word[i] = word
# Build (target, context) training pairs with a sliding window over the word sequence
data = []
WINDOW_SIZE = 2  # window size
for i, word in enumerate(words):
    for j in range(max(i - WINDOW_SIZE, 0), min(i + WINDOW_SIZE + 1, len(words))):
        if i != j:
            data.append((word, words[j]))
# Model hyperparameters
EMBEDDING_DIM = 2  # word-vector dimension (2-D so it can be plotted directly)
learning_rate = 0.1
epochs = 10000
# Define the model: one embedding table for target words, one for context words
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, name="w2v_embedding")
        self.context_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, name="w2v_context_embedding")

    def call(self, inputs):
        target, context = inputs
        target_embedding = self.target_embedding(target)      # (batch, dim)
        context_embedding = self.context_embedding(context)   # (batch, dim)
        dots = tf.reduce_sum(target_embedding * context_embedding, axis=1)  # dot product per pair
        return tf.nn.sigmoid(dots)

model = Word2Vec(vocab_size, EMBEDDING_DIM)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate), metrics=['accuracy'])
# Train the model (every (target, context) pair from the corpus is treated as a positive example)
x, y = [], []
for w in data:
    x.append(word2int[w[0]])
    y.append(word2int[w[1]])
x, y = np.array(x), np.array(y)
labels = np.ones(len(x))
for epoch in range(epochs):
    loss, acc = model.train_on_batch([x, y], labels)
    if epoch % 1000 == 0:
        print("Epoch:", epoch, "Loss:", loss, "Accuracy:", acc)
# Visualize the word vectors as a 2-D scatter plot
weights = model.get_layer('w2v_embedding').get_weights()[0]
for i, word in int2word.items():
    vx, vy = weights[i]
    plt.scatter(vx, vy)
    plt.annotate(word, xy=(vx, vy), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
# Query the trained word vectors: find the words most similar to 'queen' by cosine similarity
word = 'queen'
word_index = word2int[word]
target_word_vector = weights[word_index]
similar_words = []
for i, w in int2word.items():
    if w != word:
        vector = weights[i]
        similarity = np.dot(target_word_vector, vector) / (np.linalg.norm(target_word_vector) * np.linalg.norm(vector))
        similar_words.append((w, similarity))
similar_words = sorted(similar_words, key=lambda x: x[1], reverse=True)
print(similar_words[:5])
```
The output of the code above includes the training loss and accuracy printed every 1000 epochs, the scatter plot of the word vectors, and the words most similar to the queried word.
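If you want to reuse the similarity lookup from the prediction step for arbitrary words, the cosine-similarity loop above can be wrapped in a small helper. This is just a sketch; the name `most_similar` and the `top_k` parameter are illustrative additions, and it assumes `weights`, `word2int`, and `int2word` from the script above are already in scope:

```python
def most_similar(query, top_k=5):
    # Return the top_k vocabulary words closest to `query` by cosine similarity.
    qv = weights[word2int[query]]
    scores = []
    for i, w in int2word.items():
        if w == query:
            continue
        v = weights[i]
        sim = np.dot(qv, v) / (np.linalg.norm(qv) * np.linalg.norm(v))
        scores.append((w, sim))
    return sorted(scores, key=lambda s: s[1], reverse=True)[:top_k]

print(most_similar('king'))
```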