对下列代码增加功能,使其可以计算每一行每一个词的密度以及总词数的密度 import pandas as pd import re from collections import Counter # 读取词典txt文件 with open('词典.txt', 'r', encoding='utf-8') as f: list_dict = [line.strip() for line in f] # 读取F列数据 df = pd.read_excel('实验体.xlsx') list_f = df['Answer'].tolist() # 统计每行文本中的词频 dict_count_list = [] for text in list_f: # 匹配文本中的词列表 text = str(text) words = re.findall('|'.join(list_dict), text) # 统计每个词在该行文本中的出现次数 dict_count = Counter(words) dict_count_list.append(dict_count) # 将每行文本的词频统计结果合并为一个DataFrame对象 df_count = pd.DataFrame(dict_count_list) df_count.index = df.index # 输出为Excel文件 writer = pd.ExcelWriter('数实验体100.xlsx') df_count.to_excel(writer, sheet_name='Sheet1') writer._save()
时间: 2024-03-03 14:52:17 浏览: 56
以下是增加功能后的代码:
import pandas as pd
import re
from collections import Counter
def load_dictionary(path):
    """Read the dictionary file and return its words in file order.

    Each line holds one word. Surrounding whitespace is stripped; blank
    lines and repeated words are dropped, because a blank entry would become
    an empty regex alternative that matches at every position of a text.
    """
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys removes duplicates while preserving file order.
        return list(dict.fromkeys(word for word in stripped if word))


def count_words(texts, words):
    """Count how often each dictionary word occurs in each text.

    Args:
        texts: iterable of cell values; missing values (None/NaN) are
            treated as empty text instead of the literal string 'nan'.
        words: iterable of dictionary words, matched literally.

    Returns:
        DataFrame with one row per text and one column per word that occurs
        at least once; cells are integer counts, 0 where a word is absent.
    """
    # Longest first: regex alternation takes the first alternative that
    # matches, so a short word listed early would shadow longer words that
    # start with it.
    ordered = sorted(dict.fromkeys(w for w in words if w), key=len, reverse=True)
    cleaned = ['' if pd.isna(text) else str(text) for text in texts]

    if ordered:
        # Escape every word so regex metacharacters are matched literally,
        # and compile once outside the per-row loop.
        pattern = re.compile('|'.join(map(re.escape, ordered)))
        counters = [Counter(pattern.findall(text)) for text in cleaned]
    else:
        counters = [Counter() for _ in cleaned]

    return pd.DataFrame(counters).fillna(0).astype('int64')


def compute_densities(df_count):
    """Append density columns to a per-row word-count table.

    Added columns:
        'Total Word Density': the row's total matches divided by the total
            matches over all rows.
        'Numeric Word Density': the row's matches in numeric count columns
            divided by the row's total matches.
        '<word> Density': the word's count divided by the row's total
            matches; the suffix keeps these columns distinct from the count
            columns of the same word.

    Rows without any match get a density of 0 rather than a division by
    zero.

    Returns:
        A new DataFrame: the counts, the two summary columns, then the
        per-word density columns. ``df_count`` itself is not modified.
    """
    row_totals = df_count.sum(axis=1)
    grand_total = row_totals.sum()
    # NaN in place of 0 so the divisions below never divide by zero.
    safe_totals = row_totals.where(row_totals != 0)

    word_density = (
        df_count.div(safe_totals, axis=0).fillna(0.0).add_suffix(' Density')
    )
    if grand_total:
        total_density = row_totals / grand_total
    else:
        total_density = row_totals.astype(float)

    numeric_totals = df_count.select_dtypes(include='number').sum(axis=1)
    numeric_density = (numeric_totals / safe_totals).fillna(0.0)

    result = df_count.copy()
    result['Total Word Density'] = total_density
    result['Numeric Word Density'] = numeric_density
    return pd.concat([result, word_density], axis=1)


def main(dict_path='词典.txt', input_path='实验体.xlsx',
         output_path='数实验体100.xlsx'):
    """Count dictionary words in the 'Answer' column and export the result.

    Reads the dictionary from ``dict_path`` and the texts from the 'Answer'
    column of ``input_path``, then writes the counts and densities to sheet
    'Sheet1' of ``output_path``.
    """
    words = load_dictionary(dict_path)
    df = pd.read_excel(input_path)

    df_count = count_words(df['Answer'].tolist(), words)
    df_count.index = df.index
    result = compute_densities(df_count)

    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(output_path) as writer:
        result.to_excel(writer, sheet_name='Sheet1')


if __name__ == '__main__':
    main()
至此,代码会计算每一行中每个词的密度、每一行的总词数密度以及数字化词的密度,将它们与词频统计结果合并到同一个 DataFrame 对象中,并最终写入 Excel 文件。
阅读全文