基于神经网络的THUCNews数据集文本分类的python代码
时间: 2024-05-11 18:18:13 浏览: 138
NLP实战之基于tensorflow+字符级CNN进行THUCNews文本分类python
以下是基于神经网络的THUCNews数据集文本分类的Python代码示例:
首先,我们需要加载数据集。可以从THUCNews官网下载数据集。下面的代码假设数据目录下每个类别对应一个子目录,子目录中文件名以train开头的文件作为训练集,其余文件作为测试集。
```python
import os
import numpy as np
# 加载数据集
def load_data(data_path):
    """Read a category-per-directory text corpus and split it by file name.

    Every sub-directory of ``data_path`` is treated as one category. Category
    names are sorted before being numbered so that the integer label assigned
    to each category is reproducible across runs and machines. Entries of
    ``data_path`` that are not directories are skipped.

    Args:
        data_path: Root directory containing one sub-directory per category.

    Returns:
        A 4-tuple ``(train_texts, train_labels, test_texts, test_labels)``.
        Texts are lists of file contents; labels are integer NumPy arrays
        holding the index of the category each text was read from. Files
        whose name starts with ``'train'`` go to the training split, all
        other files go to the test split.
    """
    # os.listdir() returns entries in arbitrary order; sort for stable labels
    # and ignore stray files (e.g. a README) sitting next to the categories.
    categories = sorted(
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
    )
    train_texts, train_labels = [], []
    test_texts, test_labels = [], []
    for label, category in enumerate(categories):
        category_path = os.path.join(data_path, category)
        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            if file_name.startswith('train'):
                train_texts.append(content)
                train_labels.append(label)
            else:
                test_texts.append(content)
                test_labels.append(label)
    # dtype=int keeps the label arrays integer-typed even when a split is empty.
    return (train_texts, np.array(train_labels, dtype=int),
            test_texts, np.array(test_labels, dtype=int))
```
接下来,我们需要对文本进行预处理。这里我们使用jieba分词器进行分词,并使用TF-IDF(TfidfVectorizer)将文本转换为数字向量。
```python
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# 对文本进行预处理
def preprocess(train_texts, train_labels, test_texts, test_labels):
    """Tokenize the texts with jieba and turn them into TF-IDF features.

    Args:
        train_texts: Iterable of raw training documents (strings).
        train_labels: Labels of the training documents.
        test_texts: Iterable of raw test documents (strings).
        test_labels: Labels of the test documents.

    Returns:
        A 4-tuple ``(train_features, train_labels, test_features,
        test_labels)``. Features are the matrices produced by
        ``TfidfVectorizer`` (at most 5000 columns); labels are the integer
        codes produced by ``LabelEncoder``. Both transformers are fitted on
        the training split only and then applied to the test split.
    """
    # Word segmentation: join jieba tokens with spaces so the vectorizer can
    # split them again.
    train_texts = [' '.join(jieba.cut(text)) for text in train_texts]
    test_texts = [' '.join(jieba.cut(text)) for text in test_texts]
    # Convert text to numeric vectors. The default token_pattern only keeps
    # tokens of two or more characters, which would silently discard every
    # single-character Chinese word; accept one-character tokens as well.
    vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r'(?u)\b\w+\b')
    train_features = vectorizer.fit_transform(train_texts)
    test_features = vectorizer.transform(test_texts)
    # Encode labels as consecutive integers.
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_labels)
    test_labels = label_encoder.transform(test_labels)
    return train_features, train_labels, test_features, test_labels
```
现在,我们可以定义一个神经网络模型,并使用训练集进行训练。
```python
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
# 定义神经网络模型
def create_model(input_dim, output_dim):
    """Build and compile a feed-forward classifier.

    The network is Dense(512, relu) -> Dropout(0.5) -> Dense(256, relu) ->
    Dropout(0.5) -> Dense(output_dim, softmax), compiled with the
    ``categorical_crossentropy`` loss, the ``adam`` optimizer and the
    ``accuracy`` metric.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(512, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(output_dim, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
# 训练神经网络模型
def _to_one_hot(labels, num_classes):
    """Return 1-D integer class ids as a one-hot matrix; pass 2-D input through."""
    labels = np.asarray(labels)
    if labels.ndim != 1:
        return labels
    return np.eye(num_classes, dtype='float32')[labels.astype(int)]


def train_model(train_features, train_labels, test_features, test_labels, epochs=20, batch_size=128):
    """Create the classifier and fit it with early stopping.

    The model built by ``create_model`` is compiled with
    ``categorical_crossentropy``, which expects one-hot targets. Labels given
    as 1-D integer class ids are therefore one-hot encoded here; labels that
    are already 2-D are used unchanged.

    Args:
        train_features: Training feature matrix; ``shape[1]`` is the input size.
        train_labels: Training labels, 1-D class ids or a 2-D one-hot matrix.
        test_features: Feature matrix used as validation data during fitting.
        test_labels: Labels for ``test_features``, same formats as above.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        The fitted model.
    """
    train_labels = np.asarray(train_labels)
    input_dim = train_features.shape[1]
    if train_labels.ndim == 1:
        # Cast to a plain int: np.max returns a NumPy scalar.
        output_dim = int(np.max(train_labels)) + 1
    else:
        output_dim = int(train_labels.shape[1])
    train_targets = _to_one_hot(train_labels, output_dim)
    test_targets = _to_one_hot(test_labels, output_dim)
    model = create_model(input_dim, output_dim)
    # Stop once the validation loss has not improved for 3 consecutive epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
    model.fit(train_features, train_targets, epochs=epochs, batch_size=batch_size,
              validation_data=(test_features, test_targets), callbacks=[early_stopping])
    return model
```
最后,我们可以使用测试集对模型进行评估。
```python
from sklearn.metrics import accuracy_score
# 使用测试集对模型进行评估
def evaluate_model(model, test_features, test_labels):
    """Compute the accuracy of ``model`` on the given test data.

    Args:
        model: Object whose ``predict`` method returns per-class scores.
        test_features: Features passed to ``model.predict``.
        test_labels: True class indices.

    Returns:
        The value of ``accuracy_score`` between ``test_labels`` and the
        arg-max class of each prediction row.
    """
    class_scores = model.predict(test_features)
    # Pick the highest-scoring class for every sample.
    predicted_classes = np.argmax(class_scores, axis=1)
    return accuracy_score(test_labels, predicted_classes)
```
完整代码如下:
```python
import os
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
# 加载数据集
def load_data(data_path):
    """Read a category-per-directory text corpus and split it by file name.

    Every sub-directory of ``data_path`` is treated as one category. Category
    names are sorted before being numbered so that the integer label assigned
    to each category is reproducible across runs and machines. Entries of
    ``data_path`` that are not directories are skipped.

    Args:
        data_path: Root directory containing one sub-directory per category.

    Returns:
        A 4-tuple ``(train_texts, train_labels, test_texts, test_labels)``.
        Texts are lists of file contents; labels are integer NumPy arrays
        holding the index of the category each text was read from. Files
        whose name starts with ``'train'`` go to the training split, all
        other files go to the test split.
    """
    # os.listdir() returns entries in arbitrary order; sort for stable labels
    # and ignore stray files (e.g. a README) sitting next to the categories.
    categories = sorted(
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
    )
    train_texts, train_labels = [], []
    test_texts, test_labels = [], []
    for label, category in enumerate(categories):
        category_path = os.path.join(data_path, category)
        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            if file_name.startswith('train'):
                train_texts.append(content)
                train_labels.append(label)
            else:
                test_texts.append(content)
                test_labels.append(label)
    # dtype=int keeps the label arrays integer-typed even when a split is empty.
    return (train_texts, np.array(train_labels, dtype=int),
            test_texts, np.array(test_labels, dtype=int))
# 对文本进行预处理
def preprocess(train_texts, train_labels, test_texts, test_labels):
    """Tokenize the texts with jieba and turn them into TF-IDF features.

    Args:
        train_texts: Iterable of raw training documents (strings).
        train_labels: Labels of the training documents.
        test_texts: Iterable of raw test documents (strings).
        test_labels: Labels of the test documents.

    Returns:
        A 4-tuple ``(train_features, train_labels, test_features,
        test_labels)``. Features are the matrices produced by
        ``TfidfVectorizer`` (at most 5000 columns); labels are the integer
        codes produced by ``LabelEncoder``. Both transformers are fitted on
        the training split only and then applied to the test split.
    """
    # Word segmentation: join jieba tokens with spaces so the vectorizer can
    # split them again.
    train_texts = [' '.join(jieba.cut(text)) for text in train_texts]
    test_texts = [' '.join(jieba.cut(text)) for text in test_texts]
    # Convert text to numeric vectors. The default token_pattern only keeps
    # tokens of two or more characters, which would silently discard every
    # single-character Chinese word; accept one-character tokens as well.
    vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r'(?u)\b\w+\b')
    train_features = vectorizer.fit_transform(train_texts)
    test_features = vectorizer.transform(test_texts)
    # Encode labels as consecutive integers.
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_labels)
    test_labels = label_encoder.transform(test_labels)
    return train_features, train_labels, test_features, test_labels
# 定义神经网络模型
def create_model(input_dim, output_dim):
    """Build and compile a feed-forward classifier.

    The network is Dense(512, relu) -> Dropout(0.5) -> Dense(256, relu) ->
    Dropout(0.5) -> Dense(output_dim, softmax), compiled with the
    ``categorical_crossentropy`` loss, the ``adam`` optimizer and the
    ``accuracy`` metric.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(512, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(output_dim, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
# 训练神经网络模型
def _to_one_hot(labels, num_classes):
    """Return 1-D integer class ids as a one-hot matrix; pass 2-D input through."""
    labels = np.asarray(labels)
    if labels.ndim != 1:
        return labels
    return np.eye(num_classes, dtype='float32')[labels.astype(int)]


def train_model(train_features, train_labels, test_features, test_labels, epochs=20, batch_size=128):
    """Create the classifier and fit it with early stopping.

    The model built by ``create_model`` is compiled with
    ``categorical_crossentropy``, which expects one-hot targets. Labels given
    as 1-D integer class ids are therefore one-hot encoded here; labels that
    are already 2-D are used unchanged.

    Args:
        train_features: Training feature matrix; ``shape[1]`` is the input size.
        train_labels: Training labels, 1-D class ids or a 2-D one-hot matrix.
        test_features: Feature matrix used as validation data during fitting.
        test_labels: Labels for ``test_features``, same formats as above.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        The fitted model.
    """
    train_labels = np.asarray(train_labels)
    input_dim = train_features.shape[1]
    if train_labels.ndim == 1:
        # Cast to a plain int: np.max returns a NumPy scalar.
        output_dim = int(np.max(train_labels)) + 1
    else:
        output_dim = int(train_labels.shape[1])
    train_targets = _to_one_hot(train_labels, output_dim)
    test_targets = _to_one_hot(test_labels, output_dim)
    model = create_model(input_dim, output_dim)
    # Stop once the validation loss has not improved for 3 consecutive epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
    model.fit(train_features, train_targets, epochs=epochs, batch_size=batch_size,
              validation_data=(test_features, test_targets), callbacks=[early_stopping])
    return model
# 使用测试集对模型进行评估
def evaluate_model(model, test_features, test_labels):
    """Compute the accuracy of ``model`` on the given test data.

    Args:
        model: Object whose ``predict`` method returns per-class scores.
        test_features: Features passed to ``model.predict``.
        test_labels: True class indices.

    Returns:
        The value of ``accuracy_score`` between ``test_labels`` and the
        arg-max class of each prediction row.
    """
    class_scores = model.predict(test_features)
    # Pick the highest-scoring class for every sample.
    predicted_classes = np.argmax(class_scores, axis=1)
    return accuracy_score(test_labels, predicted_classes)
if __name__ == '__main__':
    # End-to-end run: load -> preprocess -> train -> evaluate -> print accuracy.
    data_path = 'THUCNews'  # dataset root directory
    raw_splits = load_data(data_path)
    train_features, train_labels, test_features, test_labels = preprocess(*raw_splits)
    model = train_model(train_features, train_labels, test_features, test_labels)
    print(evaluate_model(model, test_features, test_labels))
```
这里的神经网络模型仅作为示例。你可以根据实际情况来设计更加复杂的模型。
阅读全文