Python LDA Topic Model: Tokenization and Text Processing

This resource walks through text analysis in Python with an LDA (Latent Dirichlet Allocation) topic model. The document first tokenizes the input text with the `jieba` library and removes stop words, which makes the downstream analysis more efficient. The key tokenization steps are:

1. Define a stop-word list, typically containing common function words such as "的" and "是".
2. Cut the text with `jieba.cut()` and keep only the tokens that do not appear in the stop-word list, collecting them in a `filtered_words` list.
3. Save the cleaned tokens to a new text file.

Next, the required packages are imported. The core tool is the `gensim` library, which implements the LDA model; its `corpora` module handles the document data, and `matplotlib` is used to visualize the results. `warnings.filterwarnings('ignore')` suppresses warnings that would otherwise clutter the output.

In the preprocessing stage, the raw text is loaded and converted into a format the LDA model can consume:

1. Read the text file `123.txt` and split it on `\n` into a list of paragraphs.
2. Treat each paragraph as one document, giving a bag-of-words representation in which every document is a collection of word tokens.

Finally, `CoherenceModel` and `LdaModel` are imported; they evaluate topic coherence and perform the LDA topic modeling, respectively. An LDA model is then built with suitable parameters (number of topics, number of passes, and so on) and the coherence score of the resulting topics is computed. The full workflow typically includes:

- Initialize the LDA model with the number of topics and other parameters.
- Train the model on the document corpus (gensim's `LdaModel` is trained when it is constructed, rather than through a separate `fit()` call).
- Compute topic coherence to assess model quality.
- Optionally visualize the results, for example topic distributions or keyword clouds, to make the discovered topics easier to interpret.

The goal of the whole process is to extract the latent topic structure of the text, revealing how the content is organized, which is useful for news aggregation, document summarization, sentiment analysis, and similar applications.
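For concreteness, here is a minimal sketch of the pipeline just described, assuming the input file `123.txt` from the text, a small illustrative stop-word list, and arbitrary parameter values (10 topics, 10 passes). Treat it as an outline under those assumptions, not the resource's exact code:

```python
# A minimal sketch of the described pipeline. The stop-word list and all
# parameter values are illustrative assumptions.
import warnings
warnings.filterwarnings('ignore')

import jieba
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# 1. Tokenize each paragraph with jieba and drop stop words.
stopwords = {'的', '是', '了', '在'}  # assumed stop-word list
with open('123.txt', encoding='utf-8') as f:
    paragraphs = f.read().split('\n')

texts = []
for para in paragraphs:
    filtered_words = [w for w in jieba.cut(para)
                      if w.strip() and w not in stopwords]
    if filtered_words:
        texts.append(filtered_words)

# 2. Build the dictionary and bag-of-words corpus for gensim.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# 3. Train the LDA model (gensim trains at construction time).
lda = LdaModel(corpus=corpus, id2word=dictionary,
               num_topics=10, passes=10, random_state=0)

# 4. Evaluate topic coherence (the 'c_v' measure needs the tokenized texts).
coherence = CoherenceModel(model=lda, texts=texts,
                           dictionary=dictionary, coherence='c_v')
print('Coherence score:', coherence.get_coherence())
```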

A related question: the following scikit-learn `LatentDirichletAllocation` code fails with an `AttributeError`:

```python
n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                # doc_topic_prior=0.1,
                                # topic_word_prior=0.01,
                                random_state=0)
lda.fit(tf)

########### Words for each topic
import pandas as pd
from openpyxl import Workbook

# Get the probability distribution of words under each topic
def get_topic_word_distribution(lda, tf_feature_names):
    arr = lda.transform(tf_vectorizer.transform([' '.join(tf_feature_names)]))
    return arr[0]

# Print the probability distribution of words under each topic
def print_topic_word_distribution(lda, tf_feature_names, n_top_words):
    dist = get_topic_word_distribution(lda, tf_feature_names)
    for i in range(lda.n_topics):
        print("Topic {}: {}".format(i, ', '.join("{:.4f}".format(x) for x in dist[i])))

# Write the per-topic word distribution to an Excel sheet
def output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, filename):
    # Create the Excel workbook and worksheet
    wb = Workbook()
    ws = wb.active
    ws.title = "Topic Word Distribution"
    # Header row
    ws.cell(row=1, column=1).value = "Topic"
    for j in range(n_top_words):
        ws.cell(row=1, column=j+2).value = tf_feature_names[j]
    # Per-topic word distributions
    dist = get_topic_word_distribution(lda, tf_feature_names)
    for i in range(lda.n_topics):
        ws.cell(row=i+2, column=1).value = i
        for j in range(n_top_words):
            ws.cell(row=i+2, column=j+2).value = dist[i][j]
    # Save the Excel file
    wb.save(filename)

n_top_words = 30
tf_feature_names = tf_vectorizer.get_feature_names()
topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words)
#print_topic_word_distribution(lda, tf_feature_names, n_top_words)
output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, "topic_word_distribution.xlsx")
```

This raises:

```
Traceback (most recent call last):
  File "D:\python\lda3\data_1.py", line 157, in <module>
    topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words)
  File "D:\python\lda3\data_1.py", line 129, in print_topic_word_distribution
    for i in range(lda.n_topics):
AttributeError: 'LatentDirichletAllocation' object has no attribute 'n_topics'
```
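The traceback names the actual problem: scikit-learn's `LatentDirichletAllocation` has no `n_topics` attribute. The topic count is the `n_components` constructor parameter (stored on the estimator as `lda.n_components`), and the learned topic-word weights live in `lda.components_`. There is also a logic issue: `lda.transform(...)` returns the document-topic distribution of the single document passed in, not a per-topic word distribution. A hedged fix sketch follows; the row normalization and top-word selection are illustrative additions, not from the original post, and note that newer scikit-learn versions replace the vectorizer's `get_feature_names()` with `get_feature_names_out()`:

```python
# Sketch of a fix: sklearn's LatentDirichletAllocation has no `n_topics`
# attribute; use the `n_components` parameter, and read the topic-word
# weights from `lda.components_` (one row per topic).
import numpy as np

def get_topic_word_distribution(lda):
    # Normalize each row of components_ so it sums to 1, giving a proper
    # per-topic word probability distribution.
    return lda.components_ / lda.components_.sum(axis=1, keepdims=True)

def print_topic_word_distribution(lda, tf_feature_names, n_top_words):
    dist = get_topic_word_distribution(lda)
    for i in range(lda.n_components):  # the parameter, not lda.n_topics
        top = np.argsort(dist[i])[::-1][:n_top_words]
        words = ', '.join('{} ({:.4f})'.format(tf_feature_names[j], dist[i][j])
                          for j in top)
        print('Topic {}: {}'.format(i, words))
```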


A follow-up revision threads `n_topics` through the printing and Excel-export helpers, but now fails with a `TypeError`:

```python
import pandas as pd
from openpyxl import Workbook

# Get the probability distribution of words under each topic
def get_topic_word_distribution(lda, tf_feature_names):
    arr = lda.transform(tf_vectorizer.transform([' '.join(tf_feature_names)]))
    return arr[0]

# Print the probability distribution of words under each topic
def print_topic_word_distribution(lda, tf_feature_names, n_top_words, n_topics):
    dist = get_topic_word_distribution(lda, tf_feature_names, n_topics)
    for i in range(n_topics):
        print("Topic {}: {}".format(i, ', '.join("{:.4f}".format(x) for x in dist[i])))

# Write the per-topic word distribution to an Excel sheet
def output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, n_topics, filename):
    # Create the Excel workbook and worksheet
    wb = Workbook()
    ws = wb.active
    ws.title = "Topic Word Distribution"
    # Header row
    ws.cell(row=1, column=1).value = "Topic"
    for j in range(n_top_words):
        ws.cell(row=1, column=j+2).value = tf_feature_names[j]
    # Per-topic word distributions
    dist = get_topic_word_distribution(lda, tf_feature_names, n_topics)
    for i in range(n_topics):
        ws.cell(row=i+2, column=1).value = i
        for j in range(n_top_words):
            ws.cell(row=i+2, column=j+2).value = dist[i][j]
    # Save the Excel file
    wb.save(filename)

n_top_words = 30
tf_feature_names = tf_vectorizer.get_feature_names()
topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words, n_topics)
#print_topic_word_distribution(lda, tf_feature_names, n_top_words)
output_topic_word_distribution_to_excel(lda, tf_feature_names, n_top_words, n_topics, "topic_word_distribution.xlsx")
```

This now raises:

```
Traceback (most recent call last):
  File "D:\python\lda3\data_1.py", line 157, in <module>
    topic_word = print_topic_word_distribution(lda, tf_feature_names, n_top_words, n_topics)
  File "D:\python\lda3\data_1.py", line 128, in print_topic_word_distribution
    dist = get_topic_word_distribution(lda, tf_feature_names,n_topics)
TypeError: get_topic_word_distribution() takes 2 positional arguments but 3 were given
```
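This second traceback is a plain signature mismatch: both call sites now pass `n_topics`, but `get_topic_word_distribution` was left with two parameters. Since the helper never uses `n_topics`, the simplest fix is to keep the two-argument definition and drop the extra argument at the call sites. A minimal sketch of that option:

```python
# The TypeError comes from calling the helper with three arguments while
# its definition accepts two. n_topics is unused inside, so align the
# call sites with the existing two-parameter signature:
def get_topic_word_distribution(lda, tf_feature_names):
    arr = lda.transform(tf_vectorizer.transform([' '.join(tf_feature_names)]))
    return arr[0]

dist = get_topic_word_distribution(lda, tf_feature_names)  # two args, as defined
```

Even with the signature fixed, note that `arr[0]` is a single document-topic vector, so `dist[i]` is a scalar and iterating over it will raise yet another `TypeError`; the per-topic word distribution should be read from `lda.components_`, as in the sketch after the first snippet.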


A related question: replace the PCA in this code with LDA:

```python
LR_grid = LogisticRegression(max_iter=1000, random_state=42)
LR_grid_search = GridSearchCV(LR_grid, param_grid=param_grid, cv=cvx,
                              scoring=scoring, n_jobs=10, verbose=0)
LR_grid_search.fit(pca_X_train, train_y)

estimators = [
    ('lr', LR_grid_search.best_estimator_),
    ('svc', svc_grid_search.best_estimator_),
]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LinearSVC(C=5, random_state=42),
                         n_jobs=10, verbose=1)
clf.fit(pca_X_train, train_y)

estimators = [
    ('lr', LR_grid_search.best_estimator_),
    ('svc', svc_grid_search.best_estimator_),
]
param_grid = {'final_estimator': [LogisticRegression(C=0.00001), LogisticRegression(C=0.0001),
                                  LogisticRegression(C=0.001), LogisticRegression(C=0.01),
                                  LogisticRegression(C=0.1), LogisticRegression(C=1),
                                  LogisticRegression(C=10), LogisticRegression(C=100),
                                  LogisticRegression(C=1000)]}
Stacking_grid = StackingClassifier(estimators=estimators,)
Stacking_grid_search = GridSearchCV(Stacking_grid, param_grid=param_grid, cv=cvx,
                                    scoring=scoring, n_jobs=10, verbose=0)
Stacking_grid_search.fit(pca_X_train, train_y)
Stacking_grid_search.best_estimator_

train_pre_y = cross_val_predict(Stacking_grid_search.best_estimator_,
                                pca_X_train, train_y, cv=cvx)
train_res1 = get_measures_gridloo(train_y, train_pre_y)
test_pre_y = Stacking_grid_search.predict(pca_X_test)
test_res1 = get_measures_gridloo(test_y, test_pre_y)

best_pca_train_aucs.append(train_res1.loc[:, "AUC"])
best_pca_test_aucs.append(test_res1.loc[:, "AUC"])
best_pca_train_scores.append(train_res1)
best_pca_test_scores.append(test_res1)

train_aucs.append(np.max(best_pca_train_aucs))
test_aucs.append(best_pca_test_aucs[np.argmax(best_pca_train_aucs)].item())
train_scores.append(best_pca_train_scores[np.argmax(best_pca_train_aucs)])
test_scores.append(best_pca_test_scores[np.argmax(best_pca_train_aucs)])
pca_comp.append(n_components[np.argmax(best_pca_train_aucs)])
print("n_components:")
print(n_components[np.argmax(best_pca_train_aucs)])
```
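Assuming "LDA" here means scikit-learn's supervised `LinearDiscriminantAnalysis` (the usual drop-in replacement for PCA in classification pipelines) rather than the `LatentDirichletAllocation` topic model, the swap only touches the dimensionality-reduction step that produced `pca_X_train`/`pca_X_test`. That step is not shown in the snippet, so the raw feature matrices `train_X`/`test_X` below are assumed names:

```python
# A hedged sketch of the requested swap; train_X/test_X stand in for the
# pre-reduction feature matrices that the original (unshown) PCA step used.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LinearDiscriminantAnalysis is supervised (it uses the labels) and can
# produce at most n_classes - 1 components, so the loop over a PCA-style
# n_components grid no longer applies in the same way.
lda_reducer = LinearDiscriminantAnalysis()  # n_components defaults to min(n_classes - 1, n_features)
lda_X_train = lda_reducer.fit_transform(train_X, train_y)
lda_X_test = lda_reducer.transform(test_X)

# Downstream, substitute lda_X_* wherever pca_X_* appeared, e.g.:
LR_grid_search.fit(lda_X_train, train_y)
Stacking_grid_search.fit(lda_X_train, train_y)
test_pre_y = Stacking_grid_search.predict(lda_X_test)
```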
