Code for building word vectors with SVD decomposition and with SGNS, and for evaluating them
1. Building word vectors with SVD decomposition
```python
import numpy as np
from sklearn.decomposition import TruncatedSVD


class SVDWordEmbedding:
    def __init__(self, corpus, dim=100, window_size=5):
        # corpus: an iterable of tokenized sentences (lists of words)
        self.corpus = corpus
        self.dim = dim
        self.window_size = window_size
        self.vocab = {}
        self.build_vocab()
        self.build_vectors()

    def build_vocab(self):
        # Assign each distinct word an integer id in order of first appearance.
        for sentence in self.corpus:
            for word in sentence:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
        self.id2word = list(self.vocab.keys())

    def build_vectors(self):
        # Count co-occurrences within a symmetric window around each word.
        co_matrix = np.zeros((len(self.vocab), len(self.vocab)))
        for sentence in self.corpus:
            for i, word in enumerate(sentence):
                lo = max(0, i - self.window_size)
                hi = min(len(sentence), i + self.window_size + 1)
                for j in range(lo, hi):
                    if i != j:
                        co_matrix[self.vocab[word]][self.vocab[sentence[j]]] += 1
        # Reduce the co-occurrence matrix to `dim` dimensions; TruncatedSVD
        # requires n_components < n_features, i.e. dim < vocabulary size.
        svd = TruncatedSVD(n_components=self.dim)
        self.vectors = svd.fit_transform(co_matrix)

    def get_vector(self, word):
        if word in self.vocab:
            return self.vectors[self.vocab[word]]
        return np.zeros(self.dim)

    def most_similar(self, word, topn=10):
        if word not in self.vocab:
            return []
        # Rank all words by cosine similarity, excluding the query word itself.
        vec = self.get_vector(word)
        norms = np.linalg.norm(self.vectors, axis=1) * np.linalg.norm(vec) + 1e-8
        sims = self.vectors.dot(vec) / norms
        sims[self.vocab[word]] = -np.inf
        most_similar_idx = np.argsort(sims)[::-1][:topn]
        return [(self.id2word[i], sims[i]) for i in most_similar_idx]
```
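A quick sanity check on a toy corpus (the sentences below are made up for illustration; note that `dim` must stay below the vocabulary size, since `TruncatedSVD` requires `n_components < n_features`):
```python
# Hypothetical toy corpus; any list of tokenized sentences works.
corpus = [
    ['I', 'like', 'deep', 'learning'],
    ['I', 'like', 'NLP'],
    ['I', 'enjoy', 'flying'],
]

# 7 distinct words here, so dim must be at most 6.
svd_model = SVDWordEmbedding(corpus, dim=5, window_size=2)
print(svd_model.most_similar('like', topn=3))
```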
2. Building word vectors with SGNS (skip-gram with negative sampling)
```python
import numpy as np
import tensorflow as tf

# Written against the TF1 graph API; under TensorFlow 2.x it runs through
# the tf.compat.v1 compatibility layer.
tf.compat.v1.disable_eager_execution()


class SGNSWordEmbedding:
    def __init__(self, corpus, dim=100, window_size=5, neg_samples=5,
                 batch_size=128, learning_rate=0.01, epochs=10):
        self.corpus = corpus
        self.dim = dim
        self.window_size = window_size
        self.neg_samples = neg_samples
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.vocab = {}
        self.build_vocab()
        self.build_model()

    def build_vocab(self):
        for sentence in self.corpus:
            for word in sentence:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
        self.id2word = list(self.vocab.keys())

    def build_model(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_word = tf.compat.v1.placeholder(tf.int32, [None])   # center words
            self.output_word = tf.compat.v1.placeholder(tf.int32, [None])  # context words
            self.neg_word = tf.compat.v1.placeholder(tf.int32, [None, self.neg_samples])
            # Separate center ("input") and context ("output") embedding tables, as in word2vec.
            self.embedding = tf.Variable(
                tf.random.uniform([len(self.vocab), self.dim], -1.0, 1.0))
            self.context_embedding = tf.Variable(
                tf.random.uniform([len(self.vocab), self.dim], -1.0, 1.0))
            input_embed = tf.nn.embedding_lookup(self.embedding, self.input_word)            # [B, d]
            output_embed = tf.nn.embedding_lookup(self.context_embedding, self.output_word)  # [B, d]
            neg_embed = tf.nn.embedding_lookup(self.context_embedding, self.neg_word)        # [B, k, d]
            # SGNS objective: maximize log sigma(u_o . v_c) + sum_k log sigma(-u_k . v_c).
            pos_logits = tf.reduce_sum(input_embed * output_embed, axis=1)                   # [B]
            neg_logits = tf.squeeze(
                tf.matmul(neg_embed, tf.expand_dims(input_embed, 2)), axis=2)                # [B, k]
            pos_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(pos_logits), logits=pos_logits)
            neg_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.zeros_like(neg_logits), logits=neg_logits), axis=1)
            self.loss = tf.reduce_mean(pos_loss + neg_loss)
            self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(
                self.learning_rate).minimize(self.loss)
            self.init = tf.compat.v1.global_variables_initializer()

    def train(self):
        with tf.compat.v1.Session(graph=self.graph) as sess:
            sess.run(self.init)
            for epoch in range(self.epochs):
                total_loss = 0.0
                for sentence in self.corpus:
                    word_pairs = self.build_word_pairs(sentence)
                    for j in range(0, len(word_pairs), self.batch_size):
                        batch_pairs = word_pairs[j:j + self.batch_size]
                        inputs, outputs = zip(*batch_pairs)
                        neg_words = self.generate_neg_samples(inputs)
                        _, loss = sess.run([self.optimizer, self.loss], feed_dict={
                            self.input_word: inputs,
                            self.output_word: outputs,
                            self.neg_word: neg_words,
                        })
                        total_loss += loss
                print('Epoch %d, Loss: %f' % (epoch + 1, total_loss))
            # Use the center-word table as the final word vectors.
            self.vectors = sess.run(self.embedding)

    def build_word_pairs(self, sentence):
        # (center id, context id) pairs within a symmetric window.
        word_pairs = []
        for i, word in enumerate(sentence):
            lo = max(0, i - self.window_size)
            hi = min(len(sentence), i + self.window_size + 1)
            for j in range(lo, hi):
                if i != j:
                    word_pairs.append((self.vocab[word], self.vocab[sentence[j]]))
        return word_pairs

    def generate_neg_samples(self, inputs):
        # Uniformly sample `neg_samples` ids per pair, avoiding the batch's center words.
        neg_words = []
        for _ in range(len(inputs)):
            neg_samples = []
            while len(neg_samples) < self.neg_samples:
                wid = np.random.randint(len(self.vocab))
                if wid not in inputs and wid not in neg_samples:
                    neg_samples.append(wid)
            neg_words.append(neg_samples)
        return np.array(neg_words)

    def get_vector(self, word):
        if word in self.vocab:
            return self.vectors[self.vocab[word]]
        return np.zeros(self.dim)

    def most_similar(self, word, topn=10):
        if word not in self.vocab:
            return []
        # Rank all words by cosine similarity, excluding the query word itself.
        vec = self.get_vector(word)
        norms = np.linalg.norm(self.vectors, axis=1) * np.linalg.norm(vec) + 1e-8
        sims = self.vectors.dot(vec) / norms
        sims[self.vocab[word]] = -np.inf
        most_similar_idx = np.argsort(sims)[::-1][:topn]
        return [(self.id2word[i], sims[i]) for i in most_similar_idx]
```
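The title also promises evaluation. A standard intrinsic test is word-similarity correlation: compute cosine similarities between word pairs and compare them against human judgments with Spearman's rank correlation. Below is a minimal sketch that works for either class, since both expose `vocab` and `get_vector`; the `gold_pairs` triples are placeholders invented for this example, and in practice one would load a benchmark dataset such as WordSim-353 (it also assumes `scipy` is installed):
```python
import numpy as np
from scipy.stats import spearmanr

def evaluate_similarity(model, gold_pairs):
    # Spearman correlation between model cosine similarities and human scores;
    # pairs containing out-of-vocabulary words are skipped.
    model_scores, human_scores = [], []
    for w1, w2, human_score in gold_pairs:
        if w1 in model.vocab and w2 in model.vocab:
            v1, v2 = model.get_vector(w1), model.get_vector(w2)
            cos = v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)
            model_scores.append(cos)
            human_scores.append(human_score)
    rho, _ = spearmanr(model_scores, human_scores)
    return rho

# Placeholder judgments for the toy corpus above; real evaluations use
# benchmark datasets with hundreds of scored pairs.
gold_pairs = [('like', 'enjoy', 8.5), ('deep', 'NLP', 3.0), ('I', 'flying', 1.0)]

sgns_model = SGNSWordEmbedding(corpus, dim=16, window_size=2, neg_samples=2, epochs=5)
sgns_model.train()
print('SGNS Spearman rho: %.3f' % evaluate_similarity(sgns_model, gold_pairs))
```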