写一篇word2vec的实现代码
时间: 2023-07-07 18:03:05 浏览: 28
Word2Vec是一种常用的词向量表示方法,它通过神经网络模型将单词映射到向量空间中,从而可以直接使用向量进行文本处理和分析。下面是一个简单的Word2Vec实现代码,使用Python语言和TensorFlow框架(注意:代码基于TensorFlow 1.x 的图模式API,如 `tf.placeholder`、`tf.Session`,在TensorFlow 2.x 下需通过 `tf.compat.v1` 兼容层运行)。
首先需要准备一个文本数据集,这里使用了一个小型的英文新闻文本数据集作为示例。代码需要先对数据进行预处理,将文本中的单词转换成数值表示。
```python
import tensorflow as tf
import numpy as np
import collections
import os
# 读取数据
def read_data(filename):
    """Read a whitespace-tokenized corpus file and return its words as a list of strings."""
    with open(filename, 'r') as corpus_file:
        return corpus_file.read().split()
# 构建词汇表
def build_vocab(data, vocab_size):
    """Build a top-`vocab_size` vocabulary over `data`.

    Returns:
        data_vocab: the corpus re-encoded as integer ids (0 == UNK).
        vocab_dict: word -> id mapping, with 'UNK' at id 0.
        word_count: [word, count] pairs; entry 0 is ['UNK', n_unknown].
    """
    # Reserve slot 0 for the unknown token; fill the rest with the
    # (vocab_size - 1) most frequent words.
    word_count = [['UNK', -1]]
    word_count.extend(collections.Counter(data).most_common(vocab_size - 1))
    vocab_dict = {word: idx for idx, (word, _) in enumerate(word_count)}
    # Re-encode the corpus; out-of-vocabulary words map to id 0.
    data_vocab = []
    unk_count = 0
    for word in data:
        if word in vocab_dict:
            data_vocab.append(vocab_dict[word])
        else:
            data_vocab.append(0)
            unk_count += 1
    word_count[0][1] = unk_count
    return data_vocab, vocab_dict, word_count
# 生成训练数据
def generate_train_data(data, window_size):
    """Build skip-gram [center, context] id pairs within `window_size` of each position."""
    pairs = []
    n = len(data)
    for center, word in enumerate(data):
        for offset in range(1, window_size + 1):
            # Emit the left-context pair first, then the right-context pair,
            # skipping positions that fall off either end of the corpus.
            left, right = center - offset, center + offset
            if left >= 0:
                pairs.append([word, data[left]])
            if right < n:
                pairs.append([word, data[right]])
    return pairs
# Load the raw corpus and convert it to integer word ids.
data = read_data('news.txt')  # NOTE(review): assumes 'news.txt' exists in cwd — confirm
vocab_size = 5000  # keep the 5000 most frequent words; everything else maps to UNK (id 0)
data, vocab_dict, word_count = build_vocab(data, vocab_size)
train_data = generate_train_data(data, window_size=2)  # skip-gram [center, context] pairs
```
接下来就是Word2Vec模型的构建,这里使用了Skip-gram模型。模型的输入是一个单词的数值表示,输出是它周围的单词的数值表示,即使用一个单词预测它的上下文。模型的核心是一个嵌入层,将每个单词映射到一个向量空间中,然后使用点积计算相似度。
```python
# 定义Word2Vec模型
class Word2Vec:
    """Skip-gram word2vec model built on the TensorFlow 1.x graph-mode API."""

    def __init__(self, vocab_size, embed_size):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        # Center-word ids and their context-word labels (shape [batch, 1]).
        self.inputs = tf.placeholder(tf.int32, [None])
        self.labels = tf.placeholder(tf.int32, [None, 1])
        # Embedding layer: one trainable vector per vocabulary word.
        with tf.variable_scope('embed'):
            self.embeddings = tf.Variable(
                tf.random_uniform([vocab_size, embed_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(self.embeddings, self.inputs)
        # Output (softmax) layer parameters.
        with tf.variable_scope('output'):
            self.weights = tf.Variable(
                tf.truncated_normal([vocab_size, embed_size],
                                    stddev=1.0 / np.sqrt(embed_size)))
            self.biases = tf.Variable(tf.zeros([vocab_size]))
            # Full-vocabulary logits, kept for inference; training below
            # uses the cheaper sampled softmax instead.
            self.logits = tf.matmul(embed, tf.transpose(self.weights)) + self.biases
        # Sampled softmax avoids evaluating all vocab_size output classes per step.
        self.loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(self.weights, self.biases, self.labels,
                                       embed, num_sampled=1000,
                                       num_classes=vocab_size))
        self.optimizer = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(self.loss)
# 定义训练函数
def train_word2vec(train_data, vocab_size, embed_size, num_epochs, batch_size, save_path):
    """Train the skip-gram model on [center, context] pairs and checkpoint it.

    The checkpoint is written to `save_path/word2vec.ckpt`; the directory is
    created if it does not exist.
    """
    tf.reset_default_graph()
    model = Word2Vec(vocab_size, embed_size)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        total_loss = 0.0
        for epoch in range(num_epochs):
            # Shuffle in place each epoch so batches differ between epochs.
            np.random.shuffle(train_data)
            for start in range(0, len(train_data), batch_size):
                batch = train_data[start:start + batch_size]
                feed = {model.inputs: [pair[0] for pair in batch],
                        model.labels: [[pair[1]] for pair in batch]}
                loss, _ = sess.run([model.loss, model.optimizer], feed_dict=feed)
                total_loss += loss
            if epoch % 10 == 0:
                print('Epoch %d, average loss: %.4f' % (epoch, total_loss / len(train_data)))
                total_loss = 0.0
        # Persist the trained variables.
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_path, 'word2vec.ckpt'))
# Train the Word2Vec model with fixed hyperparameters.
embed_size = 100   # dimensionality of each word vector
num_epochs = 100
batch_size = 512
save_path = 'model'  # checkpoint directory (created by train_word2vec if missing)
train_word2vec(train_data, vocab_size, embed_size, num_epochs, batch_size, save_path)
```
训练完成后,就可以使用训练好的模型进行单词向量的表示和相似度计算了。代码如下:
```python
# 加载模型
def load_word2vec(vocab_dict, embed_size, save_path):
    """Restore trained embeddings from the checkpoint and return {word: vector}."""
    tf.reset_default_graph()
    model = Word2Vec(len(vocab_dict), embed_size)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, os.path.join(save_path, 'word2vec.ckpt'))
        # Materialize the embedding matrix as a numpy array.
        embeddings = sess.run(model.embeddings)
    return {word: embeddings[index] for word, index in vocab_dict.items()}
# 计算单词相似度
def calc_similarity(word1, word2, word_vectors):
    """Return the cosine similarity between the vectors of `word1` and `word2`.

    Raises KeyError if either word is missing from `word_vectors`.
    """
    v1, v2 = word_vectors[word1], word_vectors[word2]
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
# Load the trained embeddings and query the similarity of two words.
word_vectors = load_word2vec(vocab_dict, embed_size, save_path)
print(calc_similarity('man', 'woman', word_vectors))  # NOTE(review): raises KeyError if either word fell outside the top-5000 vocabulary
```
以上就是一个简单的Word2Vec实现代码,可以用于生成单词向量并计算相似度。实际应用中,可以根据需求对模型进行优化和改进。
相关推荐


















