TensorFlow implementation of Chinese sentiment analysis: code for getting the dataset, preprocessing, building the model, training the model, and testing the model
Getting the dataset:
```
import pandas as pd

# Read the CSV file; it is expected to contain 'text' and 'label' columns
df = pd.read_csv('data.csv', encoding='utf-8')
# Keep only the columns we need
df = df[['text', 'label']]
# Map string labels to integers: positive -> 1, negative -> 0
df['label'] = df['label'].map({'positive': 1, 'negative': 0})
```
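The training step below expects a `df_train` split that is never defined. A minimal sketch of producing it, assuming scikit-learn is available (the 80/20 ratio and `random_state` are illustrative choices, not from the original):
```
from sklearn.model_selection import train_test_split

# Hypothetical 80/20 train/test split; adjust test_size and random_state as needed
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
```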
Preprocessing:
```
import jieba

# Segment a sentence into space-separated words with jieba
def cut_text(text):
    return ' '.join(jieba.cut(text))

# Apply segmentation to every review
df['text'] = df['text'].apply(cut_text)
```
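Note that `bert-base-chinese` tokenizes Chinese at the character level with its own WordPiece vocabulary, so this segmentation step mainly matters for non-BERT baselines; it is harmless here. A quick sanity check of the segmenter:
```
# The exact segmentation depends on jieba's dictionary; this is a typical result
print(cut_text('这部电影真的很好看'))  # e.g. '这部 电影 真的 很 好看'
```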
Building the model:
```
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer

class SentimentModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-chinese')
        self.dropout = tf.keras.layers.Dropout(0.3)
        # Two classes: negative (0) and positive (1)
        self.dense = tf.keras.layers.Dense(2, activation='softmax')

    def call(self, inputs, training=False):
        input_ids, attention_mask = inputs
        # pooler_output is the [CLS] representation after BERT's pooling layer
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        pooled = self.dropout(pooled, training=training)
        return self.dense(pooled)

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = SentimentModel()
```
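Before training, a one-batch smoke test (an optional check, not part of the original) confirms the pretrained weights loaded and the output has the expected shape:
```
# Encode a single sentence and run a forward pass
sample = tokenizer('测试句子', padding='max_length', truncation=True,
                   max_length=256, return_tensors='tf')
probs = model([sample['input_ids'], sample['attention_mask']])
print(probs.shape)  # (1, 2); each row sums to 1 because of the softmax layer
```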
Training the model:
```
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

class SentimentDataset(tf.keras.utils.Sequence):
    def __init__(self, df, tokenizer, batch_size):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def __len__(self):
        # Number of full batches per epoch
        return len(self.df) // self.batch_size

    def __getitem__(self, idx):
        batch = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        texts = batch['text'].tolist()
        labels = batch['label'].values
        # Tokenize the whole batch, padding/truncating to a fixed length
        inputs = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='tf'
        )
        return inputs['input_ids'], inputs['attention_mask'], labels

train_dataset = SentimentDataset(df_train, tokenizer, batch_size=32)
optimizer = Adam(learning_rate=1e-5)
# The model ends in softmax, so the loss consumes probabilities
# (from_logits=False, the default)
loss_fn = SparseCategoricalCrossentropy()

for epoch in range(10):
    for i in range(len(train_dataset)):
        input_ids, attention_mask, labels = train_dataset[i]
        with tf.GradientTape() as tape:
            # training=True enables dropout during fine-tuning
            probs = model([input_ids, attention_mask], training=True)
            loss = loss_fn(labels, probs)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        if i % 100 == 0:
            print('Epoch: {}/{} | Batch: {}/{} | Loss: {:.4f}'.format(
                epoch + 1, 10, i + 1, len(train_dataset), loss.numpy()))
```
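After training, accuracy on held-out data can be measured with the same `Sequence` class. This sketch assumes the `df_test` split from the earlier `train_test_split` example:
```
test_dataset = SentimentDataset(df_test, tokenizer, batch_size=32)
correct = total = 0
for i in range(len(test_dataset)):
    input_ids, attention_mask, labels = test_dataset[i]
    probs = model([input_ids, attention_mask], training=False)
    preds = tf.argmax(probs, axis=-1).numpy()
    correct += int((preds == labels).sum())
    total += len(labels)
print('Test accuracy: {:.4f}'.format(correct / total))
```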
Testing the model:
```
def predict(model, text):
    # Tokenize one sentence; attention_mask marks real tokens vs. padding
    inputs = tokenizer(text, add_special_tokens=True, max_length=256,
                       padding='max_length', truncation=True, return_tensors='tf')
    # The model's final layer is softmax, so the output is already a
    # probability distribution over {negative, positive}
    probs = model([inputs['input_ids'], inputs['attention_mask']], training=False)
    return probs.numpy()[0]

text = '这部电影真的很好看,值得一看!'
probs = predict(model, text)
print('Positive probability:', probs[1])
```
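To reuse the fine-tuned model without retraining, the weights can be saved in TensorFlow's checkpoint format (the path below is a placeholder):
```
# Save weights; the architecture itself is recreated from the class when loading
model.save_weights('./checkpoints/sentiment')

# Later: rebuild the model and run one forward pass to create the variables,
# then restore the saved weights
restored = SentimentModel()
dummy = tokenizer('占位', padding='max_length', truncation=True,
                  max_length=256, return_tensors='tf')
_ = restored([dummy['input_ids'], dummy['attention_mask']])
restored.load_weights('./checkpoints/sentiment')
```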