Fix the following code:
```python
import jieba
import jieba.posseg as pseg

path = r'D:/明朝collection.txt'
with open(path, 'r', encoding='utf-8') as f:
    text = f.read()

jieba.enable_paddle()
words = pseg.cut(text, use_paddle=True)

counts = {}
for word in words:
    if len(word) == 1:
        continue
    else:
        counts[word] = counts.get(word, 0) + 1

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

for i in range(500):
    word, count = items[i]
    print("{0:<5}{1:>5}".format(word, count))
```
`pseg.cut` yields `(word, flag)` pairs, not plain strings, so unpack both names in the loop; `import jieba` is also needed, since `import jieba.posseg as pseg` only binds the name `pseg`, and `jieba.enable_paddle()` would otherwise raise a `NameError`:
```python
import jieba
import jieba.posseg as pseg

path = r'D:/明朝collection.txt'
with open(path, 'r', encoding='utf-8') as f:
    text = f.read()

# Enable paddle mode for more accurate segmentation and POS tagging
jieba.enable_paddle()
words = pseg.cut(text, use_paddle=True)

# Count word frequencies, skipping single-character tokens
counts = {}
for word, flag in words:
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# Sort by frequency, descending, and print the top 500
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for i in range(500):
    word, count = items[i]
    print("{0:<5}{1:>5}".format(word, count))
```
Related questions
```python
import requests
from bs4 import BeautifulSoup
import jieba.analyse
import jieba.posseg as pseg
from snownlp import SnowNLP
import matplotlib.pyplot as plt

# Request headers to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Fetch page content
def get_html(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    html = resp.text
    return html

# Fetch the news list
def get_news_list(url):
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    news_list = soup.find_all('a', class_="news_title")
    return news_list

# Sentiment analysis on a text
def sentiment_analysis(text):
    s = SnowNLP(text)
    return s.sentiments

# Keyword extraction from a text
def keyword_extraction(text):
    keywords = jieba.analyse.extract_tags(text, topK=10, withWeight=True, allowPOS=('n', 'vn', 'v'))
    return keywords

# Analyze the news
def analyze_news(url):
    news_list = get_news_list(url)
    senti_scores = []   # sentiment score list
    keyword_dict = {}   # keyword weight dictionary
    for news in news_list:
        title = news.get_text().strip()
        link = news['href']
        content = get_html(link)
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.find('div', class_='article').get_text().strip()
        # Compute the sentiment score
        senti_score = sentiment_analysis(text)
        senti_scores.append(senti_score)
        # Extract keywords
        keywords = keyword_extraction(text)
        for keyword in keywords:
            if keyword[0] in keyword_dict:
                keyword_dict[keyword[0]] += keyword[1]
            else:
                keyword_dict[keyword[0]] = keyword[1]
    # Plot a histogram of the sentiment scores
    plt.hist(senti_scores, bins=10, color='skyblue')
    plt.xlabel('Sentiment Score')
    plt.ylabel('Number of News')
    plt.title('Sentiment Analysis')
    plt.show()
    # Print the keyword ranking
    keyword_list = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)
    print('Top 10 keywords:')
    for i in range(10):
        print('{}. {} - {:.2f}'.format(i+1, keyword_list[i][0], keyword_list[i][1]))

if __name__ == '__main__':
    url = 'https://www.sina.com.cn/'
    analyze_news(url)
```
This is a Python script that performs sentiment analysis and keyword extraction on news articles. It uses the requests library to fetch page content, BeautifulSoup to parse the HTML, jieba for Chinese word segmentation and keyword extraction, SnowNLP for sentiment analysis, and matplotlib to plot a histogram of the sentiment scores. In the main function it calls get_news_list() to fetch the news list, then runs sentiment analysis and keyword extraction on each article, collecting the sentiment scores in a list and accumulating the keyword weights in a dictionary. Finally it plots the sentiment-score histogram and prints the keyword ranking.
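For intuition, here is a minimal self-contained sketch of the two core calls the script relies on (the sample sentence is made up for illustration):
```python
from snownlp import SnowNLP
import jieba.analyse

text = '这家餐厅的菜品非常好吃,服务也很周到。'  # hypothetical sample sentence

# SnowNLP returns a sentiment score in [0, 1]; closer to 1 means more positive
print(SnowNLP(text).sentiments)

# jieba.analyse.extract_tags returns (keyword, TF-IDF weight) pairs
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, round(weight, 3))
```
Because `extract_tags` ranks words by TF-IDF weight rather than raw counts, the main script sums weights in `keyword_dict`, so the final ranking reflects accumulated importance rather than plain frequency.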
Add visualization for the prediction results of the following code, so that the accuracy of each prediction's result can be seen:
```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import jieba
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

good_comments = []
bad_comments = []
with open('D:\PyCharmProjects\爬虫测试\好评.txt', 'r', encoding='gbk') as f:
    for line in f.readlines():
        good_comments.append(line.strip('\n'))
with open('D:\PyCharmProjects\爬虫测试\差评.txt', 'r', encoding='gbk') as f:
    for line in f.readlines():
        bad_comments.append(line.strip('\n'))
with open('StopWords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

good_words = []
for line in good_comments:
    words = jieba.cut(line, cut_all=False)
    words = [w for w in words if w not in stopwords]
    good_words.append(' '.join(words))
bad_words = []
for line in bad_comments:
    words = jieba.cut(line, cut_all=False)
    words = [w for w in words if w not in stopwords]
    bad_words.append(' '.join(words))

# Convert the text to vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(good_words + bad_words)
y = [1] * len(good_words) + [0] * len(bad_words)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the model and compute accuracy
pred = clf.predict(X_test)
accuracy = sum(pred == y_test) / len(y_test)
print('准确率:{:.2%}'.format(accuracy))

# Predict classes for new data
with open('测试评论.txt', 'r', encoding='utf-8') as f:
    count = 0
    for line in f.readlines():
        count += 1
        test_text = line.strip('\n')
        test_words = ' '.join(jieba.cut(test_text, cut_all=False))
        test_vec = vectorizer.transform([test_words])
        pred = clf.predict(test_vec)
        if pred[0] == 1:
            print(count, '好评')
        else:
            print(count, '差评')
```
Below is the code with a visualization step added: it plots the predicted labels against the true labels on the test set, so you can see sample by sample where the model is right or wrong:
```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import jieba
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Read the data (raw strings avoid backslash-escape surprises in Windows paths)
good_comments = []
bad_comments = []
with open(r'D:\PyCharmProjects\爬虫测试\好评.txt', 'r', encoding='gbk') as f:
    for line in f.readlines():
        good_comments.append(line.strip('\n'))
with open(r'D:\PyCharmProjects\爬虫测试\差评.txt', 'r', encoding='gbk') as f:
    for line in f.readlines():
        bad_comments.append(line.strip('\n'))

# Load the stopwords
with open('StopWords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

# Segment the comments and drop stopwords
good_words = []
for line in good_comments:
    words = jieba.cut(line, cut_all=False)
    words = [w for w in words if w not in stopwords]
    good_words.append(' '.join(words))
bad_words = []
for line in bad_comments:
    words = jieba.cut(line, cut_all=False)
    words = [w for w in words if w not in stopwords]
    bad_words.append(' '.join(words))

# Convert the text to count vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(good_words + bad_words)
y = [1] * len(good_words) + [0] * len(bad_words)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the model and compute accuracy
pred = clf.predict(X_test)
accuracy = sum(pred == y_test) / len(y_test)
print('准确率:{:.2%}'.format(accuracy))

# Predict classes for new data; use a separate name (new_pred) so the
# test-set predictions in `pred` are not overwritten before plotting
with open('测试评论.txt', 'r', encoding='utf-8') as f:
    count = 0
    results = []
    for line in f.readlines():
        count += 1
        test_text = line.strip('\n')
        test_words = ' '.join(jieba.cut(test_text, cut_all=False))
        test_vec = vectorizer.transform([test_words])
        new_pred = clf.predict(test_vec)
        if new_pred[0] == 1:
            results.append(('好评', 1))
        else:
            results.append(('差评', 0))
for i, (label, _) in enumerate(results, start=1):
    print(i, label)

# Visualize the test-set predictions against the true labels
y_test = np.array(y_test)
pred = np.array(pred)
fig, ax = plt.subplots()
ax.scatter(np.arange(len(y_test)), y_test, color='blue', label='True Label')
ax.scatter(np.arange(len(pred)), pred, color='green', marker='x', label='Predicted Label')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Label')
ax.legend()
plt.show()
```
Running this displays a chart in which the x-axis is the sample index and the y-axis the label (1 = positive, 0 = negative). Blue dots mark the true labels of the test set and green crosses the predicted ones; any cross that does not sit on a dot is a misclassified sample, so the plot shows at a glance where the model is right and where it is wrong.
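The question also asks to see the accuracy of each individual prediction on the new data, which the scatter plot above does not cover. A minimal sketch for that, assuming the trained `clf`, the fitted `vectorizer`, and the file `测试评论.txt` from the script above, uses scikit-learn's `predict_proba` to chart the model's confidence in each predicted label:
```python
# Sketch: per-prediction confidence via predict_proba,
# reusing `clf` and `vectorizer` from the script above
import jieba
import matplotlib.pyplot as plt

with open('测试评论.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

vecs = vectorizer.transform(
    [' '.join(jieba.cut(line, cut_all=False)) for line in lines])
proba = clf.predict_proba(vecs)   # shape: (n_samples, 2)
confidence = proba.max(axis=1)    # probability of the predicted class
labels = proba.argmax(axis=1)     # index into clf.classes_; here [0, 1]

plt.bar(range(1, len(confidence) + 1), confidence,
        color=['green' if l == 1 else 'red' for l in labels])
plt.xlabel('Comment Index')
plt.ylabel('Predicted-Class Probability')
plt.title('Per-Prediction Confidence')
plt.show()
```
Bars close to 1.0 are confident predictions, while bars near 0.5 mark comments the model is unsure about.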