基于word2vec的文本分类 完整代码
时间: 2024-01-11 13:21:52 浏览: 88
基于word2vec的文本分类的完整代码如下所示:
```python
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
# 1. Prepare the data: tokenized sentences and one label per sentence.
sentences = [
    ['I', 'love', 'this', 'movie'],
    ['This', 'is', 'an', 'awesome', 'movie'],
    ['Great', 'movie', 'ever'],
    ['I', 'like', 'it'],
    ['This', 'is', 'terrible'],
]
labels = ['positive', 'positive', 'positive', 'positive', 'negative']

# 2. Train the Word2Vec model. min_count=1 keeps words that occur only once,
#    which is required for a corpus this small.
model = Word2Vec(sentences, min_count=1)

# 3. Get the trained word vectors.
word_vectors = model.wv

# 4. Convert every sentence to a fixed-length vector by summing the vectors
#    of its words. The dimensionality is read from the model rather than
#    hard-coded, so the script keeps working if vector_size is changed.
X = []
for sentence in sentences:
    vector = np.zeros(word_vectors.vector_size)
    for word in sentence:
        vector += word_vectors[word]
    X.append(vector)
X = np.array(X)

# 5. Encode the string labels as integers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# 6. Split into training and test sets. The original sample indices are
#    split together with X and y because train_test_split shuffles the
#    samples: position i in the test set is NOT sample i of `sentences`.
indices = np.arange(len(sentences))
X_train, X_test, y_train, y_test, _idx_train, idx_test = train_test_split(
    X, y, indices, test_size=0.2, random_state=42
)

# 7. Train the classifier.
# NOTE: SVC needs at least two classes in the training split; with a data
# set this small that depends on which samples the split selects.
classifier = SVC()
classifier.fit(X_train, y_train)

# 8. Predict on the test set.
y_pred = classifier.predict(X_test)

# 9. Print the results, looking up each test sample's text and true label
#    through its original index.
for idx, pred in zip(idx_test, y_pred):
    print("Text:", sentences[idx])
    print("True Label:", labels[idx])
    print("Predicted Label:", label_encoder.inverse_transform([pred]))
    print()
```
阅读全文