words_list = list("".join(words)) ^^^^^^^^^^^^^^^^^^^^ TypeError: 'Series' object is not callable
时间: 2023-11-13 13:44:25 浏览: 122
# NOTE(review): "'Series' object is not callable" is raised when the object
# being *called* is a pandas Series. In the failing expression the only name
# that is called is `list`, so a variable named `list` has probably been bound
# to a Series earlier in the program -- TODO confirm against the full script.
# If that is the case, converting `words` with .tolist() does not remove the
# error; the variable shadowing the builtin has to be renamed instead.
words_list = list("".join(words.tolist()))
如何修改代码,使得输出的每一个词的对应词频和密度分成两列显示 import pandas as pd import re from collections import Counter 读取词典txt文件 with open('词典.txt', 'r', encoding='utf-8') as f: list_dict = [line.strip() for line in f] 读取F列数据 df = pd.read_excel('实验体.xlsx') list_f = df['Answer'].tolist() 统计每行文本中的词频 dict_count_list = [] total_density_list = [] for text in list_f: # 匹配文本中的词列表 text = str(text) words = re.findall('|'.join(list_dict), text) # 统计每个词在该行文本中的出现次数和密度 dict_count = Counter(words) dict_count_list.append(dict_count) dict_count = {} for word in words: count = text.count(word) density = count / len(text) dict_count[word] = {'count': count, 'density': density} dict_count_list.append(dict_count) # 计算每行总词数的密度 total_density = sum([v['density'] for v in dict_count.values()]) total_density_list.append(total_density) 将每行文本的词频统计结果合并为一个DataFrame对象 df_count = pd.DataFrame(dict_count_list) df_count.index = df.index 输出为Excel文件 writer = pd.ExcelWriter('数实验体10.xlsx') df_count.to_excel(writer, sheet_name='Sheet1') writer._save()
import pandas as pd
import re
from collections import Counter
# Read the dictionary file: one term per line.
with open('词典.txt', 'r', encoding='utf-8') as f:
    # Blank lines are dropped: an empty alternative in the regex built below
    # would match the empty string at every position of every text.
    list_dict = [line.strip() for line in f if line.strip()]

# Read the texts to analyse from the 'Answer' column.
df = pd.read_excel('实验体.xlsx')
list_f = df['Answer'].tolist()

# Build the matcher once, outside the loop. Terms are escaped so that regex
# metacharacters in the dictionary are matched literally. With no terms at all
# there is nothing to search for.
if list_dict:
    pattern = re.compile('|'.join(re.escape(term) for term in list_dict))
else:
    pattern = None

# Count the frequency and density of every dictionary term, row by row.
dict_count_list = []
total_density_list = []
for text in list_f:
    # An empty cell is read as NaN; treat it as empty text instead of
    # analysing the literal string "nan".
    text = '' if pd.isna(text) else str(text)
    words = pattern.findall(text) if pattern is not None else []

    # Occurrences of each matched term in this row.
    dict_count = Counter(words)

    # One flat record per row: '<term>_count' and '<term>_density' are
    # separate keys, so they end up as two separate columns in the output.
    row_stats = {}
    for word, count in dict_count.items():
        row_stats[word + '_count'] = count
        # A match implies the text is non-empty, so len(text) is never 0 here.
        row_stats[word + '_density'] = count / len(text)
    dict_count_list.append(row_stats)

    # Summed density of all matched terms in this row (0 when nothing matched).
    total_density = sum(count / len(text) for count in dict_count.values())
    total_density_list.append(total_density)

# Combine the per-row records into one table aligned with the input rows.
# Terms that do not occur in a row are left empty (NaN) for that row.
df_count = pd.DataFrame(dict_count_list, index=df.index)
df_count['total_density'] = total_density_list

# Write the result; the count/density columns are already split, so a single
# write is enough and no post-processing of the columns is needed.
with pd.ExcelWriter('数实验体10.xlsx') as writer:
    df_count.to_excel(writer, sheet_name='Sheet1')
优化这段代码:import requests from bs4 import BeautifulSoup import jieba url = "http://xc.hfut.edu.cn/1955/list{}.htm" headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"} news_list = [] for i in range(1, 6): # 爬取前5页的新闻标题 res = requests.get(url.format(i), headers=headers) soup = BeautifulSoup(res.text, "html.parser") news = soup.find_all("span", {"class": "news_title"}) for n in news: news_list.append(n.a.string) # 对新闻标题进行分词 words_list = [] for news in news_list: words = jieba.cut(news) for word in words: words_list.append(word) from wordcloud import WordCloud import matplotlib.pyplot as plt from PIL import Image import numpy as np # 读入背景图片 image = Image.open("C:\\xhktSoft\huahua.jpg") graph = np.array(image) # 设置停用词 stop_words = ["的", "是", "在", "了", "和", "与", "也", "还", "有", "就", "等", "中", "及", "对", "是"] # 生成词云图 wc = WordCloud(font_path="msyh.ttc", background_color='white', max_words=200, mask=graph, stopwords=stop_words, max_font_size=200, random_state=42) wc.generate_from_text(" ".join(words_list)) # 绘制词云图 plt.imshow(wc, interpolation='bilinear') plt.axis("off") plt.show()
import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
# Fetch the news titles from the paginated listing.
def get_news_titles(url):
    """Collect news titles from pages 1-5 of a paginated listing.

    Args:
        url: URL template containing one ``{}`` placeholder that is filled
            with the page number via ``str.format``.

    Returns:
        A list with the text of the ``<a>`` element found inside every
        ``<span class="news_title">`` on the fetched pages, in page order.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    news_list = []
    for i in range(1, 6):
        res = requests.get(url.format(i), headers=headers)
        soup = BeautifulSoup(res.text, "html.parser")
        news = soup.find_all("span", {"class": "news_title"})
        for n in news:
            # Skip spans that have no link or whose link has no single text
            # node; a None entry would break the later word segmentation.
            if n.a is not None and n.a.string is not None:
                news_list.append(n.a.string)
    return news_list
# Segment the news titles into words.
def cut_words(news_list):
    """Split every title into words with ``jieba.cut``.

    Args:
        news_list: Iterable of title strings.

    Returns:
        A single flat list holding the words of all titles, in input order.
        An empty input yields an empty list.
    """
    words_list = []
    for news in news_list:
        # jieba.cut yields the words of one title; collect them all.
        words_list.extend(jieba.cut(news))
    return words_list
# Build and display the word cloud.
def generate_wordcloud(words_list, graph):
    """Render a word cloud from the given words and show it.

    Args:
        words_list: List of words; they are joined with spaces and passed to
            ``WordCloud.generate_from_text``.
        graph: Array used as the ``mask`` argument of ``WordCloud``.
    """
    # A set removes the duplicated entry and gives constant-time lookups.
    stop_words = {"的", "是", "在", "了", "和", "与", "也", "还", "有", "就", "等", "中", "及", "对"}
    wc = WordCloud(font_path="msyh.ttc", background_color='white', max_words=200, mask=graph, stopwords=stop_words, max_font_size=200, random_state=42)
    wc.generate_from_text(" ".join(words_list))
    plt.imshow(wc, interpolation='bilinear')
    # Hide the axes and actually display the figure; without show() the image
    # is drawn into the figure but never presented.
    plt.axis("off")
    plt.show()
# Script entry point: fetch titles, segment them and draw the word cloud.
if __name__ == '__main__':
    url = "http://xc.hfut.edu.cn/1955/list{}.htm"
    news_list = get_news_titles(url)
    words_list = cut_words(news_list)
    # Raw string: same path as before, but without the invalid "\h" escape
    # sequence that newer Python versions warn about.
    graph = np.array(Image.open(r"C:\xhktSoft\huahua.jpg"))
    generate_wordcloud(words_list, graph)