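首先确保已经安装了`jieba`和`nltk`库,可以分别通过`pip install jieba`和`pip install nltk`进行安装;代码还依赖`pandas`和`openpyxl`(`pip install pandas openpyxl`)。然后,需要下载NLTK的停用词列表,即在Python中执行一次`import nltk; nltk.download('stopwords')`。完整代码如下: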
import re

import jieba
import nltk
import pandas as pd
# NOTE(review): stock jieba.analyse does not export `ChineseSegmenter` or
# `stop_words`, so this import raises ImportError; it is guarded so that the
# rest of the script can still be loaded.
try:
    from jieba.analyse import ChineseSegmenter, stop_words
except ImportError:
    ChineseSegmenter = stop_words = None
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
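# The NLTK stopword corpus must be available locally; fetch it with nltk.download('stopwords') if it is missing.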
# 1. 读取情感词汇本体库
def read_excel_vocab(file_path, column='Negative Words'):
    """Load the vocabulary words stored in one column of an Excel workbook.

    Args:
        file_path: Path of the .xlsx workbook (read with the openpyxl engine).
        column: Header of the column holding the words. Defaults to
            'Negative Words'.
            NOTE(review): verify this header against the actual workbook.

    Returns:
        A set of non-empty, whitespace-stripped word strings.

    Raises:
        KeyError: If ``column`` is not a header in the workbook.
    """
    vocab_df = pd.read_excel(file_path, engine='openpyxl')
    if column not in vocab_df.columns:
        raise KeyError(
            f"column {column!r} not found in {file_path!r}; "
            f"available columns: {list(vocab_df.columns)}"
        )

    # Empty cells are read as NaN; drop them so the set holds only real words,
    # and normalise every cell to a stripped string so it can match tokens.
    words = vocab_df[column].dropna().astype(str).str.strip()
    return {word for word in words if word}
# Load the stopword list and build the segmenter.
try:
    stop_words_list = list(stopwords.words('chinese'))
except LookupError:
    # The NLTK stopword corpus is not installed yet: fetch it once, then retry.
    nltk.download('stopwords')
    stop_words_list = list(stopwords.words('chinese'))


class _JiebaSegmenter:
    """Thin jieba-backed segmenter.

    ``cut(text)`` yields one list of tokens per line of ``text``, so callers
    can iterate segments first and the words of each segment second.
    """

    def cut(self, text):
        """Yield the jieba tokens of each line of ``text`` as a list."""
        return (jieba.lcut(line) for line in text.splitlines())


# jieba provides no `ChineseSegmenter` class; use the adapter above instead.
cs = _JiebaSegmenter()
# 2. 读取txt文件内容并进行分词和停用词去除
def extract_negative_text(input_file, output_file, vocab_set, stop_words):
    """Extract vocabulary words from a text file and write them to a new file.

    The input text is segmented with jieba. A token is kept when it is in
    ``vocab_set`` and not in ``stop_words``; kept tokens preserve their order
    of appearance (duplicates included) and are written to ``output_file``
    separated by single spaces.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to (over)write.
        vocab_set: Collection of words to keep.
        stop_words: Iterable of words to discard.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Callers may pass a list; a set makes each membership test O(1).
    excluded = set(stop_words)

    # jieba.cut yields whole words, so each token is compared directly against
    # the vocabulary rather than being iterated character by character.
    kept_words = [
        word
        for word in jieba.cut(text)
        if word in vocab_set and word not in excluded
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(' '.join(kept_words))
# 3. Run the pipeline: choose the files, load the vocabulary, then filter.
input_txt_file = 'your_input.txt'
output_negatives_txt_file = 'negative_texts.txt'
negative_words = read_excel_vocab('大连理工大学情感词汇本体库.xlsx')
extract_negative_text(
    input_file=input_txt_file,
    output_file=output_negatives_txt_file,
    vocab_set=negative_words,
    stop_words=stop_words_list,
)