Meaning of the parameters in keras.preprocessing.text.Tokenizer
Keras's `Tokenizer` class (in `keras.preprocessing.text`) turns raw text into lists of integer word indices. Its main parameters are:
- `num_words`: integer or `None`. The maximum vocabulary size, ranked by word frequency. Only the most frequent `num_words - 1` words are kept when converting texts to sequences; `fit_on_texts` still records every word in `word_index`.
- `filters`: a string whose characters are stripped from the text before tokenizing. Defaults to ``!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n`` and can be changed as needed.
- `lower`: boolean; whether to lowercase the text first. Defaults to `True`.
- `split`: string; the token separator. Defaults to `' '`.
- `char_level`: boolean; if `True`, every character is treated as a token. Defaults to `False`.
- `oov_token`: string or `None`; a token substituted for out-of-vocabulary words in `texts_to_sequences` calls. Defaults to `None`, in which case unknown words are silently dropped.
For example, to convert texts to integer sequences while keeping only the most frequent words (`num_words=1000`):
```python
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
```
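To see what `num_words` and `oov_token` actually do, here is a minimal sketch; the sample sentences are invented for illustration:
```python
from tensorflow.keras.preprocessing.text import Tokenizer

texts = ['the cat sat on the mat', 'the dog ate my homework']
tokenizer = Tokenizer(num_words=6, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)

# word_index holds the full fitted vocabulary; '<OOV>' always gets index 1
print(tokenizer.word_index)
# Words ranked outside the top num_words - 1, and unseen words, map to the OOV index
print(tokenizer.texts_to_sequences(['the cat ate my lunch']))
```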
Related questions
Complete code for sentiment classification of the Weibo dataset weibo_senti_900.csv with an LSTM model
Below is a complete example of classifying Weibo sentiment with an LSTM model:
```python
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import LSTM, Embedding
from keras.models import Model
from sklearn.model_selection import train_test_split
# Load the data
df = pd.read_csv('weibo_senti_900.csv')
# Convert the texts to integer sequences
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.asarray(df['label'])
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
# Build the model
embedding_layer = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = LSTM(128, return_sequences=True)(embedded_sequences)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=128)
# Evaluate on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', accuracy)
```
Note that this code builds the LSTM with Keras, so Keras must be installed, and the data file weibo_senti_900.csv must sit in the same directory as the script for it to load.
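To score a new post after training, the same fitted tokenizer and padding length must be reused; a minimal sketch (the sample sentence is made up):
```python
new_texts = ['今天天气真好，心情舒畅']  # hypothetical new Weibo post
new_data = pad_sequences(tokenizer.texts_to_sequences(new_texts),
                         maxlen=MAX_SEQUENCE_LENGTH)
prob = float(model.predict(new_data)[0][0])
print('positive' if prob > 0.5 else 'negative', prob)
```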
Python code for BERT-based joint entity-relation extraction under TensorFlow, using bert4keras
Below is an example of BERT joint entity-relation extraction implemented with bert4keras:
```python
import json
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import DataGenerator, sequence_padding
from keras.layers import Dense, Lambda
from keras.models import Model
# Model hyperparameters
maxlen = 128
epochs = 10
batch_size = 16
learning_rate = 2e-5
categories = ["疾病和诊断", "影像检查", "实验室检验", "药物"]
num_classes = len(categories)
# BERT configuration files
config_path = '/path/to/bert_config.json'
checkpoint_path = '/path/to/bert_model.ckpt'
dict_path = '/path/to/vocab.txt'
# Load a JSON-lines file and flatten each spo_list entry into
# (subject, predicate, object) triples
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            d = {'text': l['text'], 'spo_list': []}
            for spo in l['spo_list']:
                for o in spo['object']:
                    d['spo_list'].append((spo['subject'], spo['predicate'], o))
            D.append(d)
    return D
# Load the datasets
train_data = load_data('/path/to/train_data.json')
valid_data = load_data('/path/to/valid_data.json')
test_data = load_data('/path/to/test_data.json')
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
    """Data generator: yields one sample per (subject, predicate, object) triple.
    Inputs are token/segment ids; targets are the subject span mask and the
    one-hot predicate label. Batching, shuffling and forfit() are inherited
    from bert4keras's DataGenerator.
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        batch_subjects, batch_predicates = [], []
        for is_end, d in self.sample(random):
            text = d['text'][:maxlen]
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            for subject, predicate, _ in d['spo_list']:
                # Mark the subject span over token positions; the +1 offset
                # accounts for [CLS] (assumes one token per character, which
                # holds for Chinese BERT vocabularies)
                s = np.zeros(maxlen)
                start = text.find(subject)
                if start != -1:
                    s[start + 1:min(start + len(subject) + 1, maxlen)] = 1
                # One-hot encode the predicate of this triple
                y = np.zeros(num_classes)
                y[categories.index(predicate)] = 1
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_subjects.append(s)
                batch_predicates.append(y)
            if batch_token_ids and (len(batch_token_ids) >= self.batch_size or is_end):
                yield (
                    [sequence_padding(batch_token_ids, length=maxlen),
                     sequence_padding(batch_segment_ids, length=maxlen)],
                    [np.array(batch_subjects), np.array(batch_predicates)],
                )
                batch_token_ids, batch_segment_ids = [], []
                batch_subjects, batch_predicates = [], []
# Build the model: a per-token head predicts the subject span and a
# sentence-level head on the [CLS] vector predicts the predicate class
bert_model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='bert',
)
sequence_output = bert_model.output  # (batch, maxlen, hidden_size)
subject_preds = Dense(1, activation='sigmoid')(sequence_output)
subject_preds = Lambda(lambda x: x[..., 0], name='subject')(subject_preds)
cls_vector = Lambda(lambda x: x[:, 0])(sequence_output)
predicate_preds = Dense(num_classes, activation='sigmoid', name='predicate')(cls_vector)
model = Model(bert_model.input, [subject_preds, predicate_preds])
model.summary()
# Both heads are multi-label sigmoid outputs, so binary cross-entropy
# is used for each
model.compile(
    loss=['binary_crossentropy', 'binary_crossentropy'],
    optimizer=Adam(learning_rate),
)
# Train the model
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)
model.fit_generator(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
validation_data=valid_generator.forfit(),
validation_steps=len(valid_generator)
)
# Evaluate on the test set
model.evaluate_generator(test_generator.forfit(), steps=len(test_generator))
# Save the model weights
model.save_weights('/path/to/model.weights')
```
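As a usage note, here is a hedged inference sketch for the trained two-head model; the 0.5 threshold and the helper name `predict_one` are illustrative choices, not part of the original code:
```python
def predict_one(text):
    """Return the predicted subject token mask and predicate for one sentence."""
    token_ids, segment_ids = tokenizer.encode(text[:maxlen], maxlen=maxlen)
    token_ids = sequence_padding([token_ids], length=maxlen)
    segment_ids = sequence_padding([segment_ids], length=maxlen)
    subject_probs, predicate_probs = model.predict([token_ids, segment_ids])
    subject_mask = subject_probs[0] > 0.5  # token positions flagged as subject
    predicate = categories[int(np.argmax(predicate_probs[0]))]
    return subject_mask, predicate
```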