Chinese sentiment analysis with deep learning: code for fetching the dataset, preprocessing, building the model, training, and testing
Fetching the dataset:
```
import pandas as pd

# Read the CSV file (expected columns: 'text' and 'label')
df = pd.read_csv('data.csv', encoding='utf-8')

# Keep only the columns we need
df = df[['text', 'label']]

# Map the string labels to integers
df['label'] = df['label'].map({'positive': 1, 'negative': 0})
```
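Before moving on, it is worth verifying that the data loaded as expected. A minimal sanity-check sketch, assuming the column layout above:
```
# Quick checks on the loaded data
print(df.head())                    # inspect a few rows
print(df['label'].value_counts())   # check class balance

# Labels outside {'positive', 'negative'} become NaN after .map();
# drop such rows along with any missing texts
df = df.dropna(subset=['text', 'label'])
```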
Preprocessing:
```
import jieba

# Segment Chinese text into space-separated words
def cut_text(text):
    return ' '.join(jieba.cut(text))

# Apply word segmentation to every row
df['text'] = df['text'].apply(cut_text)
```
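Note that jieba segmentation is optional for this pipeline: the bert-base-chinese tokenizer splits Chinese text largely character by character, so the inserted spaces are harmless but not required. A quick way to see this, once the tokenizer from the next section is loaded:
```
# bert-base-chinese tokenizes Chinese per character regardless of spaces,
# so jieba's word boundaries mostly disappear after tokenization
print(tokenizer.tokenize('这部 电影 真的 很 好看'))
# roughly: ['这', '部', '电', '影', '真', '的', '很', '好', '看']
```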
Building the model:
```
import torch
from torch import nn
from transformers import BertTokenizer, BertModel

class SentimentModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.fc = nn.Linear(768, 2)  # BERT hidden size -> 2 classes

    def forward(self, input_ids, attention_mask):
        # Recent versions of transformers return a model-output object,
        # not a tuple, so take the pooled [CLS] representation explicitly
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so adding a Softmax layer here would distort the loss
        return self.fc(pooled)

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = SentimentModel()
```
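A quick shape check helps confirm the model is wired correctly before training. A minimal sketch using a single hardcoded sentence:
```
# Encode one sentence and run a forward pass to verify the output shape
sample = tokenizer('测试一下模型', return_tensors='pt',
                   padding='max_length', truncation=True, max_length=32)
with torch.no_grad():
    logits = model(input_ids=sample['input_ids'],
                   attention_mask=sample['attention_mask'])
print(logits.shape)  # expected: torch.Size([1, 2])
```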
Training the model:
```
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

# Hold out part of the data for evaluation
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['label']
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # encode_plus with return_tensors='pt' adds a batch dimension;
        # squeeze it out so DataLoader can stack samples correctly
        return (inputs['input_ids'].squeeze(0),
                inputs['attention_mask'].squeeze(0),
                torch.tensor(label))

def train(model, train_loader, epochs=10):
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(epochs):
        for i, (input_ids, attention_mask, label) in enumerate(train_loader):
            optimizer.zero_grad()
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(logits, label)
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print('Epoch: {}/{} | Batch: {}/{} | Loss: {:.4f}'.format(
                    epoch + 1, epochs, i + 1, len(train_loader), loss.item()))

train_dataset = SentimentDataset(df_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
train(model, train_loader)
```
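The loop above runs on CPU; moving the model and each batch to a GPU with `.to(device)` is the usual speedup. After training, it also helps to measure accuracy on the held-out data. A minimal evaluation sketch, assuming the df_test split created above:
```
def evaluate(model, loader):
    # Count correct predictions over the whole loader
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, label in loader:
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = logits.argmax(dim=1)
            correct += (preds == label).sum().item()
            total += label.size(0)
    return correct / total

test_dataset = SentimentDataset(df_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32)
print('Test accuracy: {:.4f}'.format(evaluate(model, test_loader)))
```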
Testing the model:
```
def predict(model, text):
    # For consistency with training, cut_text(text) could be applied here,
    # though bert-base-chinese is largely insensitive to the extra spaces
    model.eval()
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(input_ids=inputs['input_ids'],
                       attention_mask=inputs['attention_mask'])
    # The model outputs raw logits, so apply softmax exactly once here
    probs = nn.Softmax(dim=1)(logits)
    return probs.numpy()[0]

text = '这部电影真的很好看,值得一看!'
probs = predict(model, text)
print('Positive probability:', probs[1])
```
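Finally, the fine-tuned weights can be persisted so the model does not have to be retrained each run. A minimal sketch using PyTorch's state_dict (the filename is arbitrary):
```
# Save only the learned parameters, not the full module
torch.save(model.state_dict(), 'sentiment_model.pt')

# Later: rebuild the architecture and load the weights back
model = SentimentModel()
model.load_state_dict(torch.load('sentiment_model.pt'))
model.eval()
```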