对新闻语句“深航客机攀枝花机场遇险:机腹轮胎均疑受损,跑道灯部分损坏”使用HMM进行中文分词。步骤:1. 定义train函数,用于将初始概率、转移概率和发射概率写入JSON文件中(10分);2. 定义viterbi函数,用于实现维特比算法(10分);3. 定义cut函数实现分词(10分)。
时间: 2023-05-27 15:02:10 浏览: 125
1.定义train函数,用于将初始概率、转移概率和发射概率写入JSON文件中(10分)
import json
def train(text):
    """Estimate HMM parameters from tagged tokens and save them as JSON.

    Hidden states are the tags and observations are the words, so the saved
    tables are keyed the way ``viterbi`` reads them: ``pi[tag]``,
    ``A[prev_tag][next_tag]`` and ``B[tag][word]``.

    Args:
        text: Sequence of ``(word, tag)`` pairs in sentence order.

    Side effects:
        Writes ``{"pi": ..., "A": ..., "B": ...}`` to ``hmm_params.json`` in
        the current working directory.
    """
    pi = {}
    A = {}
    B = {}

    # Count tag frequencies and tag -> word emissions for every token,
    # including the last one.
    for pair in text:
        word, tag = pair[0], pair[1]
        pi[tag] = pi.get(tag, 0) + 1
        emissions = B.setdefault(tag, {})
        emissions[word] = emissions.get(word, 0) + 1

    # Count tag -> tag transitions between adjacent tokens.
    for current, following in zip(text, text[1:]):
        transitions = A.setdefault(current[1], {})
        transitions[following[1]] = transitions.get(following[1], 0) + 1

    def _normalize(counts):
        # Compute the total once: dividing in place while re-summing would
        # shrink the denominator after every entry.
        total = sum(counts.values())
        return {key: value / total for key, value in counts.items()}

    # The input is one flat token list with no sentence boundaries, so the
    # "initial" distribution is the overall relative frequency of each tag.
    token_count = len(text)
    pi = {tag: count / token_count for tag, count in pi.items()}
    A = {tag: _normalize(row) for tag, row in A.items()}
    B = {tag: _normalize(row) for tag, row in B.items()}

    # json.dump keeps its default ASCII escaping, so the file can be read
    # back regardless of the reader's locale encoding.
    with open("hmm_params.json", "w", encoding="utf-8") as f:
        json.dump({"pi": pi, "A": A, "B": B}, f)
# Hand-labelled training data: (word, tag) pairs in sentence order.
text = [
    ("深航", "nz"),
    ("客机", "n"),
    ("攀枝花", "ns"),
    ("机场", "n"),
    ("遇险", "v"),
    ("机腹", "n"),
    ("轮胎", "n"),
    ("均", "d"),
    ("疑", "v"),
    ("受损", "v"),
    ("跑道", "n"),
    ("灯", "n"),
    ("部分", "m"),
    ("损坏", "v"),
]
# Estimate the parameters and write them to hmm_params.json.
train(text)
2.定义viterbi函数,用于实现维特比算法(10分)
import json
def viterbi(obs, states, params=None):
    """Return the most probable state sequence for ``obs`` (Viterbi decoding).

    Args:
        obs: Sequence of observations to decode.
        states: Sequence of candidate hidden-state names.
        params: Optional dict with keys ``"pi"`` (state -> probability),
            ``"A"`` (state -> next state -> probability) and ``"B"``
            (state -> observation -> probability). When omitted, the
            parameters are loaded from ``hmm_params.json`` in the current
            working directory.

    Returns:
        A list with one state per observation; an empty list when ``obs`` or
        ``states`` is empty.

    Raises:
        FileNotFoundError: ``params`` is omitted and ``hmm_params.json`` does
            not exist.
    """
    if params is None:
        with open("hmm_params.json", encoding="utf-8") as f:
            params = json.load(f)
    pi = params["pi"]
    A = params["A"]
    B = params["B"]

    if not obs or not states:
        return []

    # Initialisation: states or observations missing from the trained tables
    # get probability 0 instead of raising KeyError.
    V = [{}]
    path = {}
    for state in states:
        V[0][state] = pi.get(state, 0) * B.get(state, {}).get(obs[0], 0)
        path[state] = [state]

    # Recursion.
    for t in range(1, len(obs)):
        V.append({})
        new_path = {}
        # Only predecessors that are still reachable are considered. If an
        # unseen observation zeroed every path, fall back to all states so
        # max() never receives an empty sequence.
        live = [s for s in states if V[t - 1][s] > 0] or list(states)
        for state1 in states:
            emission = B.get(state1, {}).get(obs[t], 0)
            prob, previous_state = max(
                (V[t - 1][state2] * A.get(state2, {}).get(state1, 0) * emission,
                 state2)
                for state2 in live
            )
            V[t][state1] = prob
            new_path[state1] = path[previous_state] + [state1]
        path = new_path

    # Termination: take the best-scoring final state (ties resolve to the
    # greater state name through tuple comparison).
    prob, state = max((V[-1][s], s) for s in states)
    return path[state]
# Candidate hidden states handed to the decoder.
states = "nz n ns v d m".split()
words = "深航客机攀枝花机场遇险机腹轮胎均疑受损跑道灯部分损坏"
# Slice the sentence into consecutive two-character observations.
obs = [words[start:start + 2] for start in range(0, len(words), 2)]
result = viterbi(obs, states)
print(result)
3.定义cut函数实现分词(10分)
import json
def cut(text):
    """Split ``text`` into words using the tag sequence decoded by ``viterbi``.

    The text is sliced into consecutive two-character chunks, the chunks are
    tagged with ``viterbi``, and a chunk whose tag starts with ``"n"`` is
    merged with the following chunk when that one is tagged ``"n"``.

    Args:
        text: The string to segment.

    Returns:
        A list of words whose concatenation equals ``text``; an empty list
        for empty input.
    """
    if not text:
        return []

    obs = [text[i:i + 2] for i in range(0, len(text), 2)]
    states = ["nz", "n", "ns", "v", "d", "m"]
    # viterbi loads the trained parameters itself, so they are not read here.
    result = viterbi(obs, states)

    words = []
    i = 0
    count = min(len(obs), len(result))
    while i < count:
        merge = (
            i + 1 < count
            and result[i].startswith("n")
            and result[i + 1] == "n"
        )
        if merge:
            words.append(obs[i] + obs[i + 1])
            # Skip the chunk that was just merged so it is not emitted twice.
            i += 2
        else:
            words.append(obs[i])
            i += 1
    return words
# Segment the sample sentence and print the resulting list.
text = "深航客机攀枝花机场遇险机腹轮胎均疑受损跑道灯部分损坏"
segments = cut(text)
print(segments)
阅读全文