python代码
时间: 2023-07-02 12:23:24 浏览: 88
以下是基于CNN模型的THUCNews数据集文本分类的Python代码示例(假设 THUCNews.csv 包含 content(正文)和 label(类别)两列):
```python
import numpy as np
import pandas as pd
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.models import Sequential
# Load the THUCNews dataset. The CSV is expected to provide a 'content' column
# (raw article text) and a 'label' column (category of each article).
df = pd.read_csv('THUCNews.csv', encoding='utf-8')
# Rows without text or label can be neither tokenised nor encoded; drop them.
df = df.dropna(subset=['content', 'label'])
df = df.sample(frac=1).reset_index(drop=True)  # shuffle the rows once

# Word segmentation and vectorisation
max_words = 50000  # vocabulary size kept by the tokenizer
max_len = 600  # every document is padded/truncated to this many tokens
# Keras' Tokenizer splits on whitespace, but Chinese text has no spaces
# between words. Segment with jieba first and re-join with spaces so that
# each word becomes its own token.
texts = [' '.join(jieba.cut(str(doc))) for doc in df['content']]
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=max_len)

# Encode the labels as contiguous integer ids 0..num_classes-1. This works for
# both string category names and arbitrary integer codes; sort=True keeps the
# id <-> class mapping stable across runs despite the shuffle above.
labels, class_names = pd.factorize(df['label'], sort=True)
num_classes = len(class_names)

# Split into training and test sets (last 20% of the shuffled data is test).
# An explicit split index is used because data[:-0] would be empty when the
# dataset is too small to yield a single held-out sample.
num_validation_samples = int(0.2 * data.shape[0])
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_test = data[split_at:]
y_test = labels[split_at:]

# Build the model: embedding -> two Conv1D/MaxPooling1D stages -> dense head
embedding_dim = 100  # dimensionality of the learned word vectors
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One output unit per class actually present in the data.
model.add(Dense(num_classes, activation='softmax'))
# The targets are integer class ids (not one-hot vectors), so the sparse
# variant of categorical cross-entropy is required.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()  # summary() prints by itself; wrapping it in print() adds "None"

# Train the model
num_epochs = 10
batch_size = 128
# NOTE: the test set doubles as the validation set here, so the per-epoch
# validation metrics and the final test metrics describe the same samples.
model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(x_test, y_test))

# Evaluate the model
scores = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
```
注意:上述代码仅供参考,具体实现方式可能会因数据集、模型和实验设计的不同而有所差异。
阅读全文