word2vec代码实现
时间: 2023-07-07 10:23:39 浏览: 104
word2vec:word2vec 实现
以下是一个简单的 Word2Vec 实现的代码示例:
```python
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder
class Word2Vec:
    """Minimal Skip-Gram Word2Vec trained with full-softmax SGD.

    W1 has shape (embedding_size, vocab_size) and holds the input (center-word)
    embeddings as columns; W2 has shape (vocab_size, embedding_size) and holds
    the output weights. Sentences are whitespace-tokenized strings.
    """

    def __init__(self, window_size=2, embedding_size=10, learning_rate=0.1, num_epochs=50):
        self.window_size = window_size      # context words taken on each side of the center word
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.word2id = {}                   # token -> integer id
        self.id2word = {}                   # integer id -> token
        self.vocab_size = 0
        self.W1 = None                      # set by init_weights()
        self.W2 = None

    def fit(self, corpus):
        """Train the model on `corpus` (iterable of sentence strings)."""
        self.build_vocab(corpus)
        self.init_weights()
        for epoch in range(self.num_epochs):
            for center_word, context_words in self.generate_training_data(corpus):
                center_word_one_hot = self.get_one_hot(center_word)
                h = self.W1.dot(center_word_one_hot)   # hidden layer = embedding of center word
                u = self.W2.dot(h)                     # raw scores over the vocabulary
                y_pred = self.softmax(u)
                y_true = self.get_one_hot(context_words)
                e = y_pred - y_true                    # gradient of softmax + cross-entropy w.r.t. u
                # BUG FIX: the original used `e.dot(h.T)`, but both `e` and `h`
                # are 1-D arrays, so `.dot` computes an inner product and raises
                # ValueError whenever embedding_size != vocab_size. The gradient
                # of W2 is the outer product e ⊗ h, shape (vocab_size, embedding_size).
                dW2 = np.outer(e, h)
                dW1 = np.outer(self.W2.T.dot(e), center_word_one_hot)
                self.update_weights(dW1, dW2)

    def build_vocab(self, corpus):
        """Assign a stable integer id to every distinct token (sorted order)."""
        word_freq = defaultdict(int)
        for sentence in corpus:
            for word in sentence.split():
                word_freq[word] += 1
        self.word2id = {word: i for i, word in enumerate(sorted(word_freq.keys()))}
        self.id2word = {i: word for word, i in self.word2id.items()}
        self.vocab_size = len(self.word2id)

    def init_weights(self):
        """Randomly initialize both weight matrices (uniform in [0, 1))."""
        self.W1 = np.random.rand(self.embedding_size, self.vocab_size)
        self.W2 = np.random.rand(self.vocab_size, self.embedding_size)

    def generate_training_data(self, corpus):
        """Yield (center_word, context_word) pairs within the sliding window."""
        for sentence in corpus:
            sentence_words = sentence.split()
            for i, center_word in enumerate(sentence_words):
                # context words to the left of the center word
                for j in range(max(0, i - self.window_size), i):
                    yield center_word, sentence_words[j]
                # context words to the right of the center word
                for j in range(i + 1, min(i + self.window_size + 1, len(sentence_words))):
                    yield center_word, sentence_words[j]

    def get_one_hot(self, word):
        """Return the one-hot row vector (length vocab_size) for `word`."""
        one_hot = np.zeros(self.vocab_size)
        one_hot[self.word2id[word]] = 1
        return one_hot

    def softmax(self, x):
        """Numerically stable softmax (shifts by max before exponentiating)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def update_weights(self, dW1, dW2):
        """One SGD step on both weight matrices."""
        self.W1 -= self.learning_rate * dW1
        self.W2 -= self.learning_rate * dW2

    def transform(self, word):
        """Return the learned embedding vector (length embedding_size) for `word`."""
        return self.W1[:, self.word2id[word]]
```
上述代码实现了一个简单的 Skip-Gram Word2Vec 模型。在 `fit` 方法中,我们首先构建了词汇表,并随机初始化了模型的参数矩阵。然后,我们对语料库进行遍历,生成训练数据。对于每个中心词和上下文词对,我们使用独热编码将它们转换为向量形式,并通过前向传播计算模型的预测值。然后,我们使用交叉熵损失函数计算预测值和实际值之间的误差,使用反向传播算法计算梯度,并更新模型参数。最后,我们通过 `transform` 方法将每个词转换为其对应的词向量。
需要注意的是,这仅是一个简单的实现示例,实际应用中可能需要对代码进行优化和调整。
阅读全文