How to define a word_dict in Jupyter Notebook
To define a word_dict in Jupyter Notebook, follow these steps:
1. First, create an empty dictionary, for example:
```python
word_dict = {}
```
2. Then add key-value pairs to the dictionary, for example:
```python
word_dict['apple'] = 1
word_dict['banana'] = 2
word_dict['orange'] = 3
```
Here, we use words as keys and numbers as values.
3. Finally, you can look up a value through its key, for example:
```python
print(word_dict['apple'])   # prints 1
print(word_dict['banana'])  # prints 2
print(word_dict['orange'])  # prints 3
```
With that, you have defined a word_dict in Jupyter Notebook and added some key-value pairs to it, as shown below.
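In practice, a word_dict is often built from an existing list of words rather than entry by entry. The following minimal sketch maps each distinct word to a running index; the sample sentence and the word-to-index scheme are illustrative assumptions, not part of the original question:
```python
# Build a word -> index dictionary from a tokenized sentence
words = 'the quick brown fox jumps over the lazy dog'.split()

word_dict = {}
for word in words:
    if word not in word_dict:
        # Assign the next unused index to each new word
        word_dict[word] = len(word_dict)

print(word_dict)  # e.g. {'the': 0, 'quick': 1, 'brown': 2, ...}
```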
Related questions
In the full text of 《西游记》 (Journey to the West), count the distribution of vocabulary across frequency bands: how many words have frequencies in [0-100], [100-1000], [1000-5000], and [5000+]? The file is at D:\xiyouji.txt. I need the complete Jupyter code, using codecs, nltk, and Counter.
To accomplish this task, we can use Python's `codecs` library to read the text file, the `nltk` library for natural language processing (tokenization and stop-word filtering), and `collections.Counter` to compute word frequencies. Below is a simple Jupyter Notebook code example:
```python
import codecs
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# On first run, download the NLTK resources:
# nltk.download('punkt')
# nltk.download('stopwords')  # recent NLTK versions include a Chinese list

# Frequency bands, each interpreted as [low, high)
freq_bins = [(0, 100), (100, 1000), (1000, 5000), (5000, float('inf'))]

# Load the text of Journey to the West
filename = "D:\\xiyouji.txt"
with codecs.open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

# Remove ASCII punctuation and lowercase any Latin characters
# (note: this does not strip Chinese punctuation such as "，" or "。")
text = text.translate(str.maketrans('', '', string.punctuation)).lower()

# Tokenize and remove stop words
stop_words = set(stopwords.words('chinese'))
tokens = word_tokenize(text)
filtered_tokens = [word for word in tokens if word not in stop_words]

# Count each word's frequency, then group words by frequency band
word_freq = Counter(filtered_tokens)
bin_word_counts = []
for low, high in freq_bins:
    words_in_bin = {word: freq for word, freq in word_freq.items()
                    if low <= freq < high}
    bin_word_counts.append(((low, high), words_in_bin))

# Report how many distinct words fall into each band
for i, (bin_range, word_count) in enumerate(bin_word_counts):
    print(f"Band {i + 1}: {bin_range} - {len(word_count)} distinct words")
```
Related questions
Write Python code for a Chinese seq2seq chatbot, as generic code that runs in a Jupyter Notebook
Sure. Below is Python code for a Chinese chatbot based on a Seq2Seq model, using the TensorFlow and Keras libraries. You can save the code in Jupyter Notebook format and run it there.
First, install the following libraries, e.g. with the pip command shown after this list:
- TensorFlow
- Keras
- Jieba (a Chinese word-segmentation library)
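In a notebook, the dependencies can typically be installed with pip directly from a cell (a minimal sketch; package versions are not pinned here):
```python
# Install the dependencies from a notebook cell (run once)
!pip install tensorflow keras jieba
```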
Here is the code:
```python
# Import the required libraries
import os

import numpy as np
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.callbacks import EarlyStopping

# Hyperparameters
BATCH_SIZE = 64            # batch size
EPOCHS = 100               # number of training epochs
LATENT_DIM = 256           # number of LSTM hidden units
NUM_SAMPLES = 10000        # number of samples to use
MAX_SEQUENCE_LENGTH = 100  # maximum sentence length
MAX_NUM_WORDS = 20000      # maximum vocabulary size
EMBEDDING_DIM = 100        # word-embedding dimension

# Read the data: one "question\tanswer" pair per line
input_texts = []
target_texts = []
with open('data/chatbot_data.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(NUM_SAMPLES, len(lines) - 1)]:
    if '\t' not in line:
        continue
    input_text, target_text = line.split('\t')
    # Segment the Chinese text with jieba; mark the target with
    # '\t' as the start token and '\n' as the end token
    input_texts.append(' '.join(jieba.cut(input_text)))
    target_texts.append('\t ' + ' '.join(jieba.cut(target_text)) + ' \n')

# Tokenize the input and target sequences
# (filters='' so the '\t'/'\n' start and end tokens are not stripped)
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_word_index = input_tokenizer.word_index

target_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_word_index = target_tokenizer.word_index

# Reverse dictionaries, used when decoding
reverse_input_word_index = dict((i, word) for word, i in input_word_index.items())
reverse_target_word_index = dict((i, word) for word, i in target_word_index.items())

# Pad the input and target sequences
encoder_inputs = pad_sequences(input_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
decoder_inputs = pad_sequences(target_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# One-hot encode the decoder targets, shifted one timestep ahead of
# the decoder inputs (teacher forcing: predict the next word)
decoder_targets = np.zeros((len(decoder_inputs), MAX_SEQUENCE_LENGTH, len(target_word_index) + 1), dtype='float32')
for i, target_sequence in enumerate(target_sequences):
    for t, word in enumerate(target_sequence[:MAX_SEQUENCE_LENGTH]):
        if t > 0:
            decoder_targets[i, t - 1, word] = 1.

# Define the encoder
encoder_inputs_placeholder = Input(shape=(MAX_SEQUENCE_LENGTH,))
encoder_embedding = Embedding(len(input_word_index) + 1, EMBEDDING_DIM)
encoder_inputs_embedded = encoder_embedding(encoder_inputs_placeholder)
encoder_lstm = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs_embedded)
encoder_states = [state_h, state_c]

# Define the decoder
decoder_inputs_placeholder = Input(shape=(MAX_SEQUENCE_LENGTH,))
decoder_embedding = Embedding(len(target_word_index) + 1, EMBEDDING_DIM)
decoder_inputs_embedded = decoder_embedding(decoder_inputs_placeholder)
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_embedded, initial_state=encoder_states)
decoder_dense = Dense(len(target_word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define and compile the model
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit([encoder_inputs, decoder_inputs], decoder_targets,
                    batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_split=0.2, callbacks=[early_stopping])

# Save the model
os.makedirs('model', exist_ok=True)
model.save('model/chatbot_model.h5')

# Build separate encoder and decoder models for inference
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_embedded = decoder_embedding(decoder_inputs_single)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs_single_embedded, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs_single] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Generate a reply for a given input sentence
def reply(input_text):
    # Segment and encode the input, then take the encoder states
    input_seq = input_tokenizer.texts_to_sequences([' '.join(jieba.cut(input_text))])
    input_seq = pad_sequences(input_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    states_value = encoder_model.predict(input_seq)
    # Start decoding from the start token '\t'
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['\t']
    reply_text = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index.get(sampled_token_index, '')
        # Stop at the end token or when the reply grows too long
        if sampled_word == '\n' or len(reply_text) > MAX_SEQUENCE_LENGTH:
            break
        reply_text += sampled_word
        # Feed the sampled word back in as the next decoder input
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    return reply_text

# Try the reply function
input_text = '你好'
reply_text = reply(input_text)
print('Input text:', input_text)
print('Reply text:', reply_text)
```
Note: training a good chatbot requires much more data and a more sophisticated model; the code above is only a simple demonstration.
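Because the script saves the trained network to model/chatbot_model.h5, a later session can reload it with Keras's `load_model`; the inference-time encoder_model and decoder_model would still need to be rebuilt as in the script. A minimal sketch:
```python
from keras.models import load_model

# Reload the trained seq2seq model saved by the training script
model = load_model('model/chatbot_model.h5')
model.summary()  # verify the architecture loaded correctly
```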