import pandas as pd from openpyxl import Workbook # 获取主题下词语的概率分布 def get_topic_word_distribution(lda, tf_feature_names): arr = lda.transform(tf_vectorizer.transform([' '.join(tf_feature_names)])) return arr[0] # 打印主题下词语的概率分布 def print_topic_word_distribution(lda, tf_feature_names, n_top_words): dist = get_topic_word_distribution(lda, tf_feature_names) for i in range(lda.n_components): print("Topic {}: {}".format(i, ', '.join("{:.4f}".format(x) for x in dist[i]))) # 输出每个主题下词语的概率分布至Excel表格 def output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words,filename): # 创建Excel工作簿和工作表 wb = Workbook() ws = wb.active ws.title = "Topic Word Distribution" # 添加表头 ws.cell(row=1, column=1).value = "Topic" for j in range(n_top_words): ws.cell(row=1, column=j+2).value = tf_feature_names[j] # 添加每个主题下词语的概率分布 dist = get_topic_word_distribution(lda, tf_feature_names) for i in range(lda.n_components): ws.cell(row=i+2, column=1).value = i for j in range(n_top_words): ws.cell(row=i+2, column=j+2).value = dist[i][j] # 保存Excel文件 wb.save(filename) n_top_words = 30 tf_feature_names = tf_vectorizer.get_feature_names() topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words)报错Traceback (most recent call last): File "D:\python\lda3\data_1.py", line 157, in <module> topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words) File "D:\python\lda3\data_1.py", line 130, in print_topic_word_distribution print("Topic {}: {}".format(i, ', '.join("{:.4f}".format(x) for x in dist[i]))) TypeError: 'numpy.float64' object is not iterable
时间: 2024-02-06 17:11:52 浏览: 25
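这个错误提示是因为 `dist[i]` 是一个 `numpy.float64` 标量,不是一个可迭代的对象,所以 `for x in dist[i]` 这样的迭代会失败(`{:.4f}` 本身可以格式化单个浮点数,出错的是迭代而不是格式化)。根本原因在于 `lda.transform()` 返回的是文档-主题分布,形状为 (文档数, 主题数),`arr[0]` 只是一个长度等于主题数的一维数组;每个主题下词语的权重保存在 `lda.components_` 中,按行归一化后才是主题下词语的概率分布。你可以尝试修改 print_topic_word_distribution 函数中的代码,不再对单个浮点数 `dist[i]` 进行迭代,例如: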
```
def print_topic_word_distribution(lda, tf_feature_names, n_top_words):
    """Print the most probable words of every topic with their probabilities.

    Args:
        lda: Fitted topic model exposing ``components_``, a
            (topics x words) weight matrix such as the one provided by
            scikit-learn's ``LatentDirichletAllocation``.
        tf_feature_names: Vocabulary, indexed like the columns of
            ``lda.components_``.
        n_top_words: Number of highest-probability words printed per topic.
    """
    # lda.transform() yields a document-topic distribution, so each element
    # of its first row is a single float (hence "not iterable").  The word
    # distribution of a topic is the row-normalised components_ matrix.
    weights = lda.components_
    dist = weights / weights.sum(axis=1)[:, None]
    for i, row in enumerate(dist):
        # Stable sort on the negated row: descending order, ties keep
        # vocabulary order so the output is deterministic.
        top = (-row).argsort(kind="stable")[:n_top_words]
        dist_str = ', '.join(
            "{}:{:.4f}".format(tf_feature_names[j], row[j]) for j in top
        )
        print("Topic {}: {}".format(i, dist_str))
```
另外,`print_topic_word_distribution` 函数只负责打印结果,没有 `return` 语句,因此 `topic_word` 得到的始终是 `None`;这个赋值没有意义,可以直接将调用语句改为 `print_topic_word_distribution(lda, tf_feature_names, n_top_words)` 即可。
相关问题
import pandas as pd from openpyxl import Workbook # 获取主题下词语的概率分布 def get_topic_word_distribution(lda, tf_feature_names): arr = lda.transform(tf_vectorizer.transform([' '.join(tf_feature_names)])) return arr[0] # 打印主题下词语的概率分布 def print_topic_word_distribution(lda, tf_feature_names, n_top_words,n_topics): dist = get_topic_word_distribution(lda, tf_feature_names,n_topics) for i in range(n_topics): print("Topic {}: {}".format(i, ', '.join("{:.4f}".format(x) for x in dist[i]))) # 输出每个主题下词语的概率分布至Excel表格 def output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, n_topics,filename): # 创建Excel工作簿和工作表 wb = Workbook() ws = wb.active ws.title = "Topic Word Distribution" # 添加表头 ws.cell(row=1, column=1).value = "Topic" for j in range(n_top_words): ws.cell(row=1, column=j+2).value = tf_feature_names[j] # 添加每个主题下词语的概率分布 dist = get_topic_word_distribution(lda, tf_feature_names, n_topics) for i in range(n_topics): ws.cell(row=i+2, column=1).value = i for j in range(n_top_words): ws.cell(row=i+2, column=j+2).value = dist[i][j] # 保存Excel文件 wb.save(filename) n_top_words = 30 tf_feature_names = tf_vectorizer.get_feature_names() topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words, n_topics) #print_topic_word_distribution(lda, tf_feature_names, n_top_words) output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, n_topics, "topic_word_distribution.xlsx")报错Traceback (most recent call last): File "D:\python\lda3\data_1.py", line 157, in <module> topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words, n_topics) File "D:\python\lda3\data_1.py", line 128, in print_topic_word_distribution dist = get_topic_word_distribution(lda, tf_feature_names,n_topics) TypeError: get_topic_word_distribution() takes 2 positional arguments but 3 were given
错误提示表明 get_topic_word_distribution() 函数只需要两个位置参数,但是在 print_topic_word_distribution() 函数中却传递了三个位置参数。检查一下代码,发现 print_topic_word_distribution() 函数中还传递了一个 n_topics 参数,它应该被移除。修改代码如下:
```
def print_topic_word_distribution(lda, tf_feature_names, n_top_words):
    """Print the most probable words of every topic with their probabilities.

    Args:
        lda: Fitted topic model exposing ``components_`` (a topics x words
            weight matrix) and ``n_components`` (the number of topics).
        tf_feature_names: Vocabulary, indexed like the columns of
            ``lda.components_``.
        n_top_words: Number of highest-probability words printed per topic.
    """
    # Row-normalising components_ gives the word distribution of each topic.
    # (lda.transform() would give a document-topic distribution instead,
    # whose elements are plain floats and cannot be iterated.)
    dist = lda.components_ / lda.components_.sum(axis=1)[:, None]
    for i in range(lda.n_components):
        row = dist[i]
        # Descending by probability; stable so ties keep vocabulary order.
        top = (-row).argsort(kind="stable")[:n_top_words]
        print("Topic {}: {}".format(
            i,
            ', '.join("{}:{:.4f}".format(tf_feature_names[j], row[j]) for j in top),
        ))
def output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, filename):
    """Write the normalised topic-word probabilities to an Excel workbook.

    The sheet has one header row ("Topic" followed by words) and one row per
    topic.  The columns are the first ``n_top_words`` entries of
    ``tf_feature_names`` (vocabulary order, not sorted by probability).

    Args:
        lda: Fitted topic model exposing ``components_`` (a topics x words
            weight matrix) and ``n_components`` (the number of topics).
        tf_feature_names: Vocabulary, indexed like the columns of
            ``lda.components_``.
        n_top_words: Maximum number of word columns to write.
        filename: Path of the ``.xlsx`` file to create.
    """
    # Row-normalise the topic-word weights so every topic row sums to 1.
    dist = lda.components_ / lda.components_.sum(axis=1)[:, None]
    # Never address more columns than the vocabulary actually has.
    n_cols = max(0, min(n_top_words, dist.shape[1], len(tf_feature_names)))

    # Create the workbook and its single worksheet.
    wb = Workbook()
    ws = wb.active
    ws.title = "Topic Word Distribution"

    # Header row.
    ws.cell(row=1, column=1).value = "Topic"
    for j in range(n_cols):
        ws.cell(row=1, column=j + 2).value = tf_feature_names[j]

    # One row per topic: topic index followed by the word probabilities.
    for i in range(lda.n_components):
        ws.cell(row=i + 2, column=1).value = i
        for j in range(n_cols):
            # Plain float keeps the cell type independent of numpy.
            ws.cell(row=i + 2, column=j + 2).value = float(dist[i][j])

    # Save the Excel file.
    wb.save(filename)
```
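注意,这里需要使用 lda.n_components 属性来获取主题数(旧版本中的 n_topics 已被移除)。还需要说明的是:lda.transform() 返回的是文档-主题分布,形状为 (文档数, 主题数),它的第一维是文档数而不是主题数;每个主题下词语的权重保存在 lda.components_ 中,形状为 (主题数, 词语数),按行归一化后才是每个主题下词语的概率分布。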
###########每个主题对应词语 n_top_words = 30 tf_feature_names = tf_vectorizer.get_feature_names() topic_word = print_top_words(lda, tf_feature_names, n_top_words)添加输出每个主题下词语的概率分布(归一化处理)至新的Excel表代码
这是一个比较复杂的任务,需要使用Python的pandas库和openpyxl库来实现。下面是一个示例代码,假设已经有了主题模型lda和tf_vectorizer。
```python
import pandas as pd
from openpyxl import Workbook
# Get the word probability distribution of every topic
def get_topic_word_distribution(lda, tf_feature_names):
    """Return the row-normalised topic-word matrix of a fitted model.

    Args:
        lda: Fitted topic model exposing ``components_``, a
            (topics x words) weight matrix.
        tf_feature_names: Vocabulary; accepted for backward compatibility
            with existing callers and not used in the computation.

    Returns:
        A 2-D array of shape (topics, words) in which every row sums to 1.
    """
    # lda.transform() gives a document-topic distribution, not what callers
    # need; the word distribution of each topic is components_ normalised
    # per row.
    weights = lda.components_
    return weights / weights.sum(axis=1)[:, None]


# Print the word probability distribution of every topic
def print_topic_word_distribution(lda, tf_feature_names, n_top_words):
    """Print the ``n_top_words`` most probable words of every topic.

    Args:
        lda: Fitted topic model exposing ``components_``.
        tf_feature_names: Vocabulary, indexed like the columns of
            ``lda.components_``.
        n_top_words: Number of highest-probability words printed per topic.
    """
    dist = get_topic_word_distribution(lda, tf_feature_names)
    for i, row in enumerate(dist):
        # Descending by probability; stable so ties keep vocabulary order.
        top = (-row).argsort(kind="stable")[:n_top_words]
        print("Topic {}: {}".format(
            i,
            ', '.join("{}:{:.4f}".format(tf_feature_names[j], row[j]) for j in top),
        ))


# Write the word probability distribution of every topic to an Excel sheet
def output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, filename):
    """Write the normalised topic-word probabilities to an Excel workbook.

    The columns are the first ``n_top_words`` entries of
    ``tf_feature_names`` (vocabulary order, not sorted by probability).

    Args:
        lda: Fitted topic model exposing ``components_``.
        tf_feature_names: Vocabulary, indexed like the columns of
            ``lda.components_``.
        n_top_words: Maximum number of word columns to write.
        filename: Path of the ``.xlsx`` file to create.
    """
    dist = get_topic_word_distribution(lda, tf_feature_names)
    # Never address more columns than the vocabulary actually has.
    n_cols = max(0, min(n_top_words, dist.shape[1], len(tf_feature_names)))

    # Create the workbook and its single worksheet.
    wb = Workbook()
    ws = wb.active
    ws.title = "Topic Word Distribution"

    # Header row.
    ws.cell(row=1, column=1).value = "Topic"
    for j in range(n_cols):
        ws.cell(row=1, column=j + 2).value = tf_feature_names[j]

    # One row per topic: topic index followed by the word probabilities.
    for i, row in enumerate(dist):
        ws.cell(row=i + 2, column=1).value = i
        for j in range(n_cols):
            # Plain float keeps the cell type independent of numpy.
            ws.cell(row=i + 2, column=j + 2).value = float(row[j])

    # Save the Excel file.
    wb.save(filename)
# Example usage
n_top_words = 30
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on old releases that lack it.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_topic_word_distribution(lda, tf_feature_names, n_top_words)
output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, "topic_word_distribution.xlsx")
```
在示例代码中,我们首先定义了一个函数get_topic_word_distribution来获取主题下词语的概率分布。需要注意的是,lda.transform方法返回的是文档-主题分布矩阵(每一行对应一篇文档在各主题上的概率),并不是主题下词语的概率分布;后者应由lda.components_按行归一化得到,即 lda.components_ / lda.components_.sum(axis=1)[:, None]。接着定义了一个函数print_topic_word_distribution来打印所有主题下词语的概率分布。最后定义了一个函数output_topic_word_distribution_to_excel来输出主题下词语的概率分布至Excel表格。
在output_topic_word_distribution_to_excel函数中,我们首先创建了一个Excel工作簿和工作表,并添加了表头。然后使用get_topic_word_distribution函数获取主题下词语的概率分布,并循环添加到Excel表格中。最后使用wb.save方法保存Excel文件。
请注意,由于Excel文件的写入速度比较慢,如果处理的数据量比较大,可能需要等待一段时间才能完成操作。