时间: 2023-10-20 12:07:40 浏览: 108
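1. 准备数据集：在 ./data 目录下准备 train.txt、test.txt 和 stopwords.txt 三个 UTF-8 文件；train.txt 与 test.txt 每行的格式为“整数标签<TAB>文本”，stopwords.txt 每行一个停用词。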
2. 数据预处理
import jieba
import codecs
import os
# Load the stop-word list
def load_stopwords(stopwords_path):
    """Read a UTF-8 stop-word file and return its entries as a list.

    Every line of the file is treated as one stop word. Surrounding
    whitespace (including the trailing newline) is stripped and lines that
    are empty after stripping are skipped.

    Args:
        stopwords_path: Path to a UTF-8 text file with one stop word per line.

    Returns:
        A list of stop-word strings in file order.
    """
    stopwords = []
    with codecs.open(stopwords_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            # An empty entry can never match a token, so do not store it.
            if word:
                stopwords.append(word)
    return stopwords
# Tokenise the text and remove stop words
def cut_words(text_path, stopwords_path):
    """Segment each line of a UTF-8 text file with jieba and drop stop words.

    Args:
        text_path: Path to a UTF-8 text file; each line is processed as one
            document.
        stopwords_path: Path handed to ``load_stopwords`` to obtain the words
            to filter out.

    Returns:
        A list with one string per input line: the tokens that survived
        stop-word filtering, joined by single spaces.
    """
    # Build the set once so each per-token membership test is O(1).
    stopwords = set(load_stopwords(stopwords_path))
    cut_text = []
    with codecs.open(text_path, 'r', encoding='utf-8') as f:
        for line in f:
            words = [
                word
                for word in jieba.cut(line.strip())
                if word not in stopwords
            ]
            cut_text.append(' '.join(words))
    return cut_text
3. 特征提取
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF feature extraction
def tfidf_transformer(cut_text):
    """Fit a new ``TfidfVectorizer`` on ``cut_text`` and return dense features.

    A fresh vectorizer is fitted on every call, so the columns of the result
    are derived from this call's input only. Matrices produced by separate
    calls therefore do not share a column layout.

    Args:
        cut_text: Iterable of whitespace-separated token strings, one per
            document.

    Returns:
        The TF-IDF matrix converted to a dense array via ``toarray()``.
    """
    tfidf_vectorizer = TfidfVectorizer()
    return tfidf_vectorizer.fit_transform(cut_text).toarray()
4. 模型训练和预测
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Train the naive Bayes classifier
def train(X_train, y_train):
    """Fit a ``MultinomialNB`` classifier with default settings.

    Args:
        X_train: Training feature matrix passed to ``MultinomialNB.fit``.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    return clf
# Predict
def predict(clf, X_test):
    """Return the predictions of ``clf`` for ``X_test``.

    Args:
        clf: Any object exposing a ``predict(X)`` method.
        X_test: Feature matrix handed unchanged to ``clf.predict``.

    Returns:
        Whatever ``clf.predict(X_test)`` returns.
    """
    return clf.predict(X_test)
# Compute accuracy
def evaluate(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        The value computed by ``sklearn.metrics.accuracy_score``.
    """
    return accuracy_score(y_true, y_pred)
import jieba
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Load the stop-word list
def load_stopwords(stopwords_path):
    """Read a UTF-8 stop-word file and return its entries as a list.

    Every line of the file is treated as one stop word. Surrounding
    whitespace (including the trailing newline) is stripped and lines that
    are empty after stripping are skipped.

    Args:
        stopwords_path: Path to a UTF-8 text file with one stop word per line.

    Returns:
        A list of stop-word strings in file order.
    """
    stopwords = []
    with codecs.open(stopwords_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            # An empty entry can never match a token, so do not store it.
            if word:
                stopwords.append(word)
    return stopwords
# Tokenise the text and remove stop words
def cut_words(text_path, stopwords_path):
    """Segment each line of a UTF-8 text file with jieba and drop stop words.

    Args:
        text_path: Path to a UTF-8 text file; each line is processed as one
            document.
        stopwords_path: Path handed to ``load_stopwords`` to obtain the words
            to filter out.

    Returns:
        A list with one string per input line: the tokens that survived
        stop-word filtering, joined by single spaces.
    """
    # Build the set once so each per-token membership test is O(1).
    stopwords = set(load_stopwords(stopwords_path))
    cut_text = []
    with codecs.open(text_path, 'r', encoding='utf-8') as f:
        for line in f:
            words = [
                word
                for word in jieba.cut(line.strip())
                if word not in stopwords
            ]
            cut_text.append(' '.join(words))
    return cut_text
# TF-IDF feature extraction
def tfidf_transformer(cut_text):
    """Fit a new ``TfidfVectorizer`` on ``cut_text`` and return dense features.

    A fresh vectorizer is fitted on every call, so the columns of the result
    are derived from this call's input only. Matrices produced by separate
    calls therefore do not share a column layout.

    Args:
        cut_text: Iterable of whitespace-separated token strings, one per
            document.

    Returns:
        The TF-IDF matrix converted to a dense array via ``toarray()``.
    """
    tfidf_vectorizer = TfidfVectorizer()
    return tfidf_vectorizer.fit_transform(cut_text).toarray()
# Train the naive Bayes classifier
def train(X_train, y_train):
    """Fit a ``MultinomialNB`` classifier with default settings.

    Args:
        X_train: Training feature matrix passed to ``MultinomialNB.fit``.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    return clf
# Predict
def predict(clf, X_test):
    """Return the predictions of ``clf`` for ``X_test``.

    Args:
        clf: Any object exposing a ``predict(X)`` method.
        X_test: Feature matrix handed unchanged to ``clf.predict``.

    Returns:
        Whatever ``clf.predict(X_test)`` returns.
    """
    return clf.predict(X_test)
# Compute accuracy
def evaluate(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        The value computed by ``sklearn.metrics.accuracy_score``.
    """
    return accuracy_score(y_true, y_pred)
if __name__ == '__main__':
    # Script entry point: tokenise the train/test files, build TF-IDF
    # features, fit a naive Bayes classifier and print its test accuracy.

    # Dataset locations
    data_dir = './data'
    train_text_path = os.path.join(data_dir, 'train.txt')
    test_text_path = os.path.join(data_dir, 'test.txt')
    stopwords_path = os.path.join(data_dir, 'stopwords.txt')

    # Preprocessing
    # NOTE(review): cut_words tokenises whole lines, so the leading label
    # column read below is also part of the text it returns. Confirm this is
    # intended, or strip the label before tokenising.
    X_train_cut = cut_words(train_text_path, stopwords_path)
    X_test_cut = cut_words(test_text_path, stopwords_path)

    # Feature extraction: the vocabulary is learned from the training set
    # only and then reused for the test set, so both matrices share the same
    # columns. Fitting a second vectorizer on the test set would give the
    # classifier features it was not trained on.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_cut)
    X_test = vectorizer.transform(X_test_cut)

    # Train the model. The label is the integer before the first tab.
    with codecs.open(train_text_path, 'r', encoding='utf-8') as f:
        y_train = [int(line.split('\t')[0]) for line in f]
    clf = train(X_train, y_train)

    # Predict and evaluate
    with codecs.open(test_text_path, 'r', encoding='utf-8') as f:
        y_true = [int(line.split('\t')[0]) for line in f]
    y_pred = predict(clf, X_test)
    acc = evaluate(y_true, y_pred)
    print('accuracy: {:.2f}%'.format(acc * 100))