import pandas as pd import numpy as np import re import jieba from collections import Counter from wordcloud import WordCloud import matplotlib.pyplot as plt# 读取数据 # df = df[['text', 'label']] # df = df.dropna()# 分组 # groups = df.groupby('label')# 加载停用词 stop_words = set() with open('C:/Users/Administrator/Desktop/停用词/stopwords.txt', 'r', encoding='utf-8') as f: for line in f.readlines(): stop_words.add(line.strip())# 定义函数,用于去除停用词和提取关键词 def extract_keywords(text): words = [word for word in jieba.cut(text) if word not in stop_words] return Counter(words).most_common(10)# 提取关键词 keywords = {} for name, group in groups: keywords[name] = [] for text in group['text']: keywords[name].extend(extract_keywords(text)) keywords[name] = dict(keywords[name])# 绘制词云 for name, words in keywords.items(): wordcloud = WordCloud(width=800, height=400, font_path='msyh.ttc').generate_from_frequencies(words) plt.figure(figsize=(12, 8)) plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off") plt.title(name) plt.show() 代码对吗
时间: 2024-02-14 09:10:56 浏览: 147
这段代码有一些问题,需要进行修改:
1. 导入模块时应该每条 `import` 语句单独占一行,不要把多条语句挤在同一行。
2. 加载停用词的代码已经使用了 `with open(...) as f:` 的语法,这一点没有问题;可以改进的是直接用 `for line in f:` 逐行迭代文件对象,而不必先调用 `f.readlines()` 把所有行读入列表。
3. 定义函数 `extract_keywords` 时,应该将停用词集合作为参数传入。
4. 提取关键词时,应该过滤掉单个字和数字。
5. 绘制词云时,应该设置词云的背景颜色和最大字体大小。
修改后的代码如下所示(同时补上并启用了读取数据和分组的代码,原代码中这部分被注释掉,导致 `groups` 未定义;其中 `data.csv` 为示例文件名,请替换为实际的数据文件):
```python
import pandas as pd
import numpy as np
import re
import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Load the data, keep only the text and label columns, and drop rows
# where either value is missing.
df = pd.read_csv('data.csv')
df = df[['text', 'label']]
df = df.dropna()

# Group the rows by label; iterating `groups` yields (label, sub-frame) pairs.
groups = df.groupby('label')

# Load the stop words, one per line.
# 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading BOM if the
# file has one, so the first stop word is not silently corrupted.
with open('C:/Users/Administrator/Desktop/停用词/stopwords.txt', 'r', encoding='utf-8-sig') as f:
    # Iterate the file object directly (no readlines()) and skip blank lines.
    stop_words = {line.strip() for line in f if line.strip()}
# Remove stop words from one text and extract its most frequent keywords.
def extract_keywords(text, stop_words, top_n=10):
    """Return the most frequent keywords in *text* with their counts.

    The text is segmented with ``jieba.cut``. Tokens that are a single
    character long, consist only of digits, or appear in *stop_words*
    are discarded before counting.

    Args:
        text: The string to segment.
        stop_words: Collection of tokens to ignore.
        top_n: Maximum number of keywords to return (defaults to 10).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as
        produced by ``Counter.most_common``.
    """
    words = [
        word
        for word in jieba.cut(text)
        if len(word) > 1 and not word.isdigit() and word not in stop_words
    ]
    return Counter(words).most_common(top_n)
# Extract keywords: aggregate the per-text keyword counts for each label.
keywords = {}
for name, group in groups:
    # Sum the counts in a Counter. Building a dict from the concatenated
    # (word, count) pairs would keep only the last count seen for a word
    # that appears in several texts.
    totals = Counter()
    for text in group['text']:
        totals.update(dict(extract_keywords(text, stop_words)))
    keywords[name] = dict(totals)

# Draw one word cloud per label.
for name, words in keywords.items():
    if not words:
        # generate_from_frequencies raises ValueError when given no words.
        continue
    wordcloud = WordCloud(
        width=800,
        height=400,
        font_path='msyh.ttc',
        background_color='white',
        max_font_size=100,
    ).generate_from_frequencies(words)
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(name)
    plt.show()
```
请注意,这段代码仅供参考,具体的修改还需要根据实际情况进行调整。
阅读全文