tf cbow获得词向量的示例代码
时间: 2023-08-01 08:03:10 浏览: 104
CBOW(Continuous Bag-of-Words,连续词袋模型)是 word2vec 中的一种词向量训练方法,它根据上下文词来预测中心词。使用 TensorFlow 实现 CBOW 模型并生成词向量时,我们可以按照以下示例代码进行操作:
```python
import tensorflow as tf
import numpy as np
# Training corpus: each sentence is already tokenised with single spaces.
corpus = ["我 喜欢 吃 苹果", "我 喜欢 吃 香蕉", "我 喜欢 吃 橙子"]
# Build the vocabulary.
# The words are sorted so that every word gets the same index on every run;
# iterating a bare set of strings may yield a different order per interpreter
# run (string hash randomisation), which makes results non-reproducible.
vocab = sorted(set(" ".join(corpus).split()))
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
# Hyper-parameters
window_size = 2        # context words taken on each side of the target word
embed_size = 5         # width of the hidden layer
learning_rate = 0.001  # step size handed to the optimizer
num_epochs = 100       # number of passes over the training set
# Define the CBOW model
# NOTE: this script is written against the TensorFlow 1.x graph API
# (tf.placeholder, tf.Session, ...); on TensorFlow 2.x those symbols are
# exposed under tf.compat.v1 instead.
def cbow_model(window_size, embed_size, vocab_size, x=None, embeddings=None):
    """Build the CBOW graph and return unnormalised scores for the target word.

    The hidden layer is the average of the embedding rows of the context
    words; a linear output layer maps it to one logit per vocabulary word.

    Args:
        window_size: Number of context words on each side of the target.
        embed_size: Length of each word vector.
        vocab_size: Number of distinct words.
        x: Optional float tensor of one-hot context words with shape
            [batch, window_size * 2, vocab_size]; all-zero rows stand for
            missing context positions. A new placeholder is created when
            omitted.
        embeddings: Optional [vocab_size, embed_size] variable holding one
            word vector per row. A randomly initialised variable is created
            when omitted.

    Returns:
        A [batch, vocab_size] tensor of logits.
    """
    if x is None:
        x = tf.placeholder(tf.float32, shape=[None, window_size * 2, vocab_size])
    if embeddings is None:
        embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
    # Output layer parameters
    output_weights = tf.Variable(tf.random_normal([embed_size, vocab_size]))
    output_bias = tf.Variable(tf.random_normal([vocab_size]))
    # Collapse the context positions into per-sample word counts.
    bag_of_words = tf.reduce_sum(x, axis=1)
    # Number of real (non-padded) context words; clamped to 1 so that a sample
    # without any context does not divide by zero.
    context_count = tf.maximum(tf.reduce_sum(bag_of_words, axis=1), 1.0)
    context_count = tf.reshape(context_count, [-1, 1])
    # Hidden layer: mean of the context word vectors.
    hidden_layer = tf.matmul(bag_of_words, embeddings) / context_count
    output_layer = tf.add(tf.matmul(hidden_layer, output_weights), output_bias)
    return output_layer


# Build (context, target) training pairs
train_data = []
for sentence in corpus:
    words = sentence.split()
    for idx, word in enumerate(words):
        # Up to window_size words on each side of position idx, target excluded.
        left = words[max(0, idx - window_size):idx]
        right = words[idx + 1:idx + window_size + 1]
        train_data.append((left + right, word))

# Build one-hot inputs and labels
vocab_size = len(vocab)
train_x = np.zeros((len(train_data), window_size * 2, vocab_size), dtype=np.float32)
train_y = np.zeros((len(train_data), vocab_size), dtype=np.float32)
for i, (context, target) in enumerate(train_data):
    # Contexts shorter than window_size * 2 leave the remaining rows all-zero.
    for j, word in enumerate(context):
        train_x[i, j, word2idx[word]] = 1.0
    train_y[i, word2idx[target]] = 1.0

# Define the model and the loss
# The placeholders and the embedding matrix are created here, at module level,
# so the training loop can feed them and read the word vectors back.
x = tf.placeholder(tf.float32, shape=[None, window_size * 2, vocab_size])
y = tf.placeholder(tf.float32, shape=[None, vocab_size])
embedding_matrix = tf.Variable(tf.random_normal([vocab_size, embed_size]))
model = cbow_model(window_size, embed_size, vocab_size,
                   x=x, embeddings=embedding_matrix)
prediction = tf.nn.softmax(model)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Train the model
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(num_epochs):
        _, c = sess.run([optimizer, loss], feed_dict={x: train_x, y: train_y})
        print("Epoch:", epoch+1, "Loss:", c)
    # Fetch the word vectors: row i of the embedding matrix belongs to the
    # word with index i.
    embeddings = sess.run(embedding_matrix)

# Print the word vectors
for i, embedding in enumerate(embeddings):
    word = idx2word[i]
    print("Word:", word, "Embedding:", embedding)
```
在以上示例代码中,我们首先定义了训练数据并构建了词汇表;然后定义CBOW模型的网络结构、超参数配置和损失函数等,并以此训练模型、生成词向量;最后,在训练完成后,可以输出每个词对应的词向量。
阅读全文