Code for computing the perplexity of a BTM model
Code to compute the perplexity of a BTM model:
```python
import math

def perplexity(testset, model, K):
    """Held-out perplexity: exp(-(1/N) * sum of log p(w) over all words)."""
    phi = model.phi()  # topic-word distributions, shape (K, V)
    log_per = 0.0
    N = 0
    for doc in testset:
        theta = model.inference(doc)  # topic distribution inferred for this document
        for w in doc:
            # Marginal word probability: p(w) = sum_k p(w | k) * p(k | doc)
            p_w = sum(phi[k][w] * theta[k] for k in range(K))
            log_per += math.log(p_w)
        N += len(doc)
    return math.exp(-log_per / N)
```
Here `testset` is the held-out corpus (each document a list of word IDs), `model` is a trained BTM model exposing `phi()` (the topic-word distributions) and `inference(doc)` (a per-document topic distribution), and `K` is the number of topics. The function returns the perplexity of the test set, i.e. the exponential of the negative average per-word log-likelihood; lower values indicate a better fit. A minimal usage sketch follows.
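As a hypothetical usage sketch (the `DummyBTM` class below, with its random fixed distributions, is invented purely for illustration; a real BTM implementation would supply `phi()` and `inference()` itself):
```python
import numpy as np

class DummyBTM:
    """Stand-in model with made-up distributions (illustration only)."""
    def __init__(self, K, V):
        rng = np.random.default_rng(0)
        self._phi = rng.dirichlet(np.ones(V), size=K)  # (K, V) topic-word dists

    def phi(self):
        return self._phi

    def inference(self, doc):
        # Uniform topic distribution as a placeholder for real inference
        K = self._phi.shape[0]
        return np.full(K, 1.0 / K)

testset = [[0, 2, 3], [1, 1, 4]]  # documents as lists of word IDs
model = DummyBTM(K=3, V=5)
print(perplexity(testset, model, K=3))
```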
Related questions
Code for computing BTM topic coherence
Code to compute topic coherence for a BTM model:
```python
from collections import Counter
import numpy as np

def get_cosine_sim(mat):
    """
    Compute the cosine-similarity matrix between the row vectors of mat.
    :param mat: matrix whose rows are vectors (e.g. topic-word distributions)
    :return: cosine-similarity matrix
    """
    cos_mat = mat.dot(mat.T)
    norms = np.linalg.norm(mat, axis=1)
    norms[norms == 0] = 1e-8  # guard against division by zero
    norm_mat = np.outer(norms, norms)
    return cos_mat / norm_mat

def get_topic_coherence(beta, docs, vocab, top_n=10, window_size=3):
    """
    Compute a UMass-style topic coherence from sliding-window co-occurrences.
    :param beta: topic-word distribution matrix, shape (K, V)
    :param docs: corpus, each document a sequence of word indices
    :param vocab: vocabulary list mapping word index -> word
    :param top_n: number of top words taken from each topic
    :param window_size: half-width of the co-occurrence window
    :return: mean coherence over all topics
    """
    # Corpus-wide word frequencies
    word_counts = Counter(vocab[idx] for doc in docs for idx in doc)
    topic_coherence = []
    for k in range(beta.shape[0]):
        # Indices of the top_n most probable words for topic k
        top_idx = set(beta[k].argsort()[::-1][:top_n])
        top_words = [vocab[i] for i in sorted(top_idx)]
        word_pairs = [(top_words[i], top_words[j])
                      for i in range(len(top_words))
                      for j in range(i + 1, len(top_words))]
        # Count co-occurrences of top words within the sliding window
        co_occur = Counter()
        for doc in docs:
            doc_len = len(doc)
            for i in range(doc_len):
                if doc[i] not in top_idx:
                    continue
                word_i = vocab[doc[i]]
                for j in range(max(0, i - window_size),
                               min(doc_len, i + window_size + 1)):
                    if j != i and doc[j] in top_idx:
                        co_occur[(word_i, vocab[doc[j]])] += 1
        coherence = []
        for w1, w2 in word_pairs:
            co_count = co_occur.get((w1, w2), 0)
            # Smoothed log conditional probabilities in both directions
            coherence.append(np.log((co_count + 1) / word_counts[w1]))
            coherence.append(np.log((co_count + 1) / word_counts[w2]))
        topic_coherence.append(np.mean(coherence))
    return np.mean(topic_coherence)

# Example
beta = np.array([[0.1, 0.2, 0.3, 0.4],
                 [0.4, 0.3, 0.2, 0.1],
                 [0.2, 0.1, 0.4, 0.3],
                 [0.3, 0.4, 0.1, 0.2]])
docs = np.array([[0, 1, 2, 3],
                 [1, 2, 3, 0],
                 [2, 3, 0, 1],
                 [3, 0, 1, 2]])
vocab = ['apple', 'banana', 'cat', 'dog']
cos_sim = get_cosine_sim(beta)
print('Topic cosine-similarity matrix:')
print(cos_sim)
topic_coherence = get_topic_coherence(beta, docs, vocab)
print('Topic coherence:')
print(topic_coherence)
```
This example uses a toy corpus of four topics and four words to compute the topic cosine-similarity matrix and the topic coherence. In practice, replace the toy `beta`, `docs`, and `vocab` with the topic-word distributions and corpus from your own BTM model; a sketch of how that plugs together follows.
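For instance, using the `BTM` class from the next answer (assuming, as that code suggests, that its `phi` attribute is a `(K, V)` topic-word matrix usable as `beta` here), the wiring might look like this:
```python
# Hypothetical wiring of the BTM class below into the coherence function
docs = [[0, 1, 2], [2, 3, 0], [1, 3, 3, 0]]  # toy corpus of word-index lists
vocab = ['apple', 'banana', 'cat', 'dog']

model = BTM(num_topics=2, V=len(vocab))
model.fit(docs, num_iterations=50)

print(get_topic_coherence(model.phi, docs, vocab, top_n=3))
```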
Python code for the BTM model
Below is a Python implementation of the BTM model:
```python
import numpy as np
import scipy.sparse as sp

class BTM:
    def __init__(self, num_topics, V):
        self.num_topics = num_topics
        self.V = V                      # vocabulary size
        self.alpha = None               # Dirichlet prior on topics
        self.beta = None                # Dirichlet prior on words
        self.theta = None               # document-topic distributions, (M, K)
        self.phi = None                 # topic-word distributions, (K, V)
        self.z = None                   # per-word topic assignments
        self.word_topic_counts = None   # word-topic counts, (V, K)
        self.topic_counts = None        # topic counts, (K,)
        self.num_iterations = None

    def fit(self, docs, num_iterations=100, alpha=0.1, beta=0.01):
        self.alpha = alpha
        self.beta = beta
        self.num_iterations = num_iterations
        # Initialize variables
        M = len(docs)
        self.theta = np.zeros((M, self.num_topics))
        self.z = []
        self.word_topic_counts = sp.lil_matrix((self.V, self.num_topics))
        self.topic_counts = np.zeros(self.num_topics)
        # Randomly assign topics to words
        for m in range(M):
            z = []
            for w in docs[m]:
                topic = np.random.randint(self.num_topics)
                z.append(topic)
                self.word_topic_counts[w, topic] += 1
                self.topic_counts[topic] += 1
            self.z.append(np.array(z))
        # Collapsed Gibbs sampling
        for _ in range(self.num_iterations):
            for m in range(M):
                doc = docs[m]
                z = self.z[m]
                for n, w in enumerate(doc):
                    # Remove the current assignment from the counts
                    topic = z[n]
                    self.word_topic_counts[w, topic] -= 1
                    self.topic_counts[topic] -= 1
                    # Posterior over topics for this word:
                    # p(z=k) ∝ (n_{w,k} + beta) / (n_k + V*beta) * (n_k + alpha)
                    n_wk = self.word_topic_counts[w, :].toarray().ravel()
                    p_z = (n_wk + self.beta) / \
                          (self.topic_counts + self.V * self.beta) * \
                          (self.topic_counts + self.alpha)
                    p_z /= p_z.sum()
                    # Sample a new topic assignment
                    new_topic = np.random.choice(self.num_topics, p=p_z)
                    z[n] = new_topic
                    self.word_topic_counts[w, new_topic] += 1
                    self.topic_counts[new_topic] += 1
        # Estimate phi and theta from the final counts and assignments
        counts = self.word_topic_counts.toarray()  # (V, K)
        self.phi = ((counts + self.beta) /
                    (counts.sum(axis=0) + self.beta * self.V)).T  # (K, V)
        for m in range(M):
            assign_counts = np.bincount(self.z[m], minlength=self.num_topics)
            self.theta[m, :] = (assign_counts + self.alpha) / \
                               (len(docs[m]) + self.alpha * self.num_topics)

    def transform(self, docs):
        """Infer topic distributions for new documents from the learned phi."""
        M = len(docs)
        theta = np.zeros((M, self.num_topics))
        for m in range(M):
            for w in docs[m]:
                theta[m, :] += self.phi[:, w]
            theta[m, :] /= theta[m, :].sum()  # normalize to a distribution
        return theta
```
The code uses collapsed Gibbs sampling: each word in each document is repeatedly reassigned to a topic drawn from its posterior distribution, and the topic-word (`phi`) and document-topic (`theta`) distributions are estimated from the final counts. Once fitting is done, `transform` maps new documents to topic distributions; a usage example follows.
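A small, self-contained usage sketch (the toy corpus below is invented for illustration):
```python
import numpy as np

np.random.seed(42)  # reproducible sampling

# Toy corpus: each document is a list of word indices into a vocabulary of size 5
docs = [[0, 1, 2], [1, 2, 3], [3, 4, 4, 0], [2, 2, 1]]

model = BTM(num_topics=2, V=5)
model.fit(docs, num_iterations=50, alpha=0.1, beta=0.01)

print("theta (document-topic):")
print(model.theta)
print("phi (topic-word):")
print(model.phi)

# Infer topic distributions for unseen documents
new_docs = [[0, 4], [1, 2, 2]]
print(model.transform(new_docs))
```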