写一个自然语言处理大作业的代码100行代码
时间: 2023-10-18 07:04:21 浏览: 87
好的,这里提供一个简单的例子:实现一个基于TF-IDF的文本分类器,根据输入的文本将其分类为多个类别中的一种。
```python
import os
import re
import math
import random
import argparse
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fix the RNG seed so shuffling and splitting are reproducible across runs.
random.seed(0)

# Command-line configuration.
parser = argparse.ArgumentParser(description='TF-IDF based text classifier')
parser.add_argument('--data_dir', default='./data', help='Directory of training data')
parser.add_argument('--test_dir', default='./test', help='Directory of testing data')
parser.add_argument('--stopwords_file', default='./stopwords.txt', help='File of stopwords')
parser.add_argument('--n_class', type=int, default=5, help='Number of classes')
parser.add_argument('--n_top_words', type=int, default=1000, help='Number of top words')
parser.add_argument('--n_train', type=int, default=1000, help='Number of training samples')
parser.add_argument('--n_test', type=int, default=500, help='Number of testing samples')
args = parser.parse_args()

# Load the stopword list, one word per line.  A set gives O(1) membership
# tests; the original list made every stopword check O(len(stopwords))
# inside the tokenization loops.  Only `in` is used on it elsewhere, so
# the change is transparent to the rest of the script.
with open(args.stopwords_file, 'r', encoding='utf-8') as f:
    stopwords_list = {line.strip() for line in f}
# Load the corpus: one sub-directory per class (class1 .. classN), one
# document per file.  Each sample is a (text, label_index) pair.
train_data = []
for i in range(args.n_class):
    class_name = 'class{}'.format(i + 1)
    class_dir = os.path.join(args.data_dir, class_name)
    for file_name in os.listdir(class_dir):
        file_path = os.path.join(class_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        train_data.append((content, i))

# Shuffle once, then carve out the splits.  BUG FIX: the original
# truncated train_data to the first n_train samples FIRST and then took
# the last n_test samples of the truncated list, so the test set was a
# subset of the training set and the reported accuracy was inflated.
# Slicing the test set before reassigning train_data keeps the two
# splits disjoint (provided n_train + n_test <= total sample count).
random.shuffle(train_data)
test_data = train_data[-args.n_test:]
train_data = train_data[:args.n_train]
# Per-class word frequencies and per-class document counts over the
# training split.  Tokens are lowercased, non-alphabetic tokens and
# stopwords are dropped.
word_count = defaultdict(lambda: defaultdict(int))
class_count = defaultdict(int)
for content, label in train_data:
    words = [word.lower() for word in word_tokenize(content) if word.isalpha() and word.lower() not in stopwords_list]
    class_count[label] += 1
    for word in words:
        word_count[label][word] += 1

# TF-IDF weight of every word within every class.  The class's total
# word count is hoisted out of the per-word loop: the original summed
# word_count[label].values() once per word, making this step quadratic
# in vocabulary size.
word_tf_idf = defaultdict(lambda: defaultdict(float))
for label in range(args.n_class):
    total_words = sum(word_count[label].values())
    for word in word_count[label]:
        tf = word_count[label][word] / total_words
        # "Document" frequency is counted over classes here; it is >= 1
        # because the word occurs in this class, so the log argument is
        # always positive.
        df = sum(1 for i in range(args.n_class) if word in word_count[i])
        word_tf_idf[label][word] = tf * math.log(args.n_train / df)

# The feature vocabulary is the union of each class's top n_top_words
# words by TF-IDF.  Slicing (instead of indexing 0..n_top_words-1 as the
# original did) avoids an IndexError for classes with fewer than
# n_top_words distinct words.
features = set()
for label in range(args.n_class):
    ranked = sorted(word_tf_idf[label].items(), key=lambda x: x[1], reverse=True)
    for word, _ in ranked[:args.n_top_words]:
        features.add(word)
# Convert a raw text into its feature representation: a dict mapping
# each selected feature word to a TF-IDF weight.
def text_to_feature(text):
    """Map *text* to ``{feature_word: tf-idf weight}``.

    BUG FIX: the original body read ``label`` — the loop variable leaked
    from the feature-selection loop above — so every text, whatever its
    content, was weighted with the LAST class's TF-IDF table.  The
    weight here is the maximum TF-IDF of the word across all classes,
    which is label-independent and well defined for every feature word.
    """
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stopwords_list]
    feature = {}
    for word in words:
        if word in features:
            feature[word] = max(word_tf_idf[lbl][word] for lbl in range(args.n_class))
    return feature
# Project every document of both splits onto the selected feature
# vocabulary, keeping its class label alongside.
train_features = [(text_to_feature(doc), cls) for doc, cls in train_data]
test_features = [(text_to_feature(doc), cls) for doc, cls in test_data]
# Multinomial Naive Bayes with Laplace (add-one) smoothing, trained on
# {word: weight} feature dicts produced by text_to_feature().
class NaiveBayesClassifier:
    """Naive Bayes over weighted bag-of-words feature dicts.

    ``train()`` estimates class priors and smoothed per-class word
    probabilities; ``predict()`` returns the label with the highest
    log-posterior score.
    """

    def __init__(self):
        self.class_prior = {}    # label -> P(label)
        self.feature_count = defaultdict(lambda: defaultdict(float))  # label -> word -> summed weight
        self.class_count = defaultdict(float)   # label -> number of training docs
        self.feature_prob = defaultdict(lambda: defaultdict(float))   # label -> word -> smoothed P(word|label)
        # Vocabulary seen in training and per-label smoothing
        # denominators.  Tracking these here makes the class
        # self-contained: the original read the module-level global
        # `features`, which in this pipeline equals the set of words
        # appearing in the training feature dicts anyway (text_to_feature
        # only emits feature words).
        self._vocab = set()
        self._denom = {}

    def train(self, data):
        """Fit priors and word probabilities from (feature_dict, label) pairs."""
        for feature, label in data:
            self.class_count[label] += 1
            for word, weight in feature.items():
                self.feature_count[label][word] += weight
                self._vocab.add(word)
        n_samples = sum(self.class_count.values())
        for label in self.class_count:
            self.class_prior[label] = self.class_count[label] / n_samples
            # Hoist the per-label total out of the word loop: the
            # original recomputed sum(...) for every word.
            total = sum(self.feature_count[label].values())
            self._denom[label] = total + len(self._vocab)
            for word in self.feature_count[label]:
                self.feature_prob[label][word] = (self.feature_count[label][word] + 1) / self._denom[label]

    def predict(self, feature):
        """Return the most probable label for a feature dict.

        BUG FIX: the original looked probabilities up in a
        defaultdict(float), so a word unseen for some label yielded 0.0
        and ``math.log(0.0)`` raised ValueError.  Unseen words now fall
        back to the Laplace floor ``1 / denominator``.
        """
        scores = {label: math.log(prior) for label, prior in self.class_prior.items()}
        for word, weight in feature.items():
            if word not in self._vocab:
                continue  # never seen in training: no evidence either way
            for label in self.class_prior:
                prob = self.feature_prob[label][word] or 1.0 / self._denom[label]
                scores[label] += math.log(prob) * weight
        return max(scores, key=scores.get)
# Fit the classifier on the training split and report accuracy on the
# held-out test split.
classifier = NaiveBayesClassifier()
classifier.train(train_features)
correct = sum(
    1 for feats, gold in test_features if classifier.predict(feats) == gold
)
accuracy = correct / len(test_features)
print("Accuracy:", accuracy)
```
这个例子实现了一个基于TF-IDF的文本分类器,包括读取数据、预处理、特征提取、模型训练和测试等步骤。其中用到了NLTK库中的分词、停用词过滤等工具,以及朴素贝叶斯分类器的训练和预测方法。需要注意的是,这只是一个简单的例子,实际文本分类需要考虑更多的因素,比如特征选择、模型调参等。
阅读全文