icwb2 msr 基于规则的中文分词 python代码
时间: 2023-11-12 09:32:48 浏览: 28
以下是基于规则(词典最大匹配)的中文分词 Python 代码示例,所用词典可以由 icwb2 数据包中的 MSR 训练语料整理得到:
```python
import os
# 加载字典
def load_dict(dict_path):
    """Load a word list from a text file that holds one word per line.

    Args:
        dict_path: Path to the dictionary file (UTF-8, with or without BOM).

    Returns:
        A set containing every non-empty line of the file with surrounding
        whitespace removed.
    """
    # 'utf-8-sig' also decodes plain UTF-8, but drops a leading byte-order
    # mark so the first entry is not stored with an invisible prefix.
    with open(dict_path, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        # Blank lines would otherwise put the empty string into the set.
        return {word for word in stripped if word}
# 正向最大匹配
def forward_max_match(text, words):
    """Segment *text* left to right, greedily taking the longest known word.

    Args:
        text: The string to segment.
        words: A container of dictionary words supporting ``in``.

    Returns:
        A list of tokens that concatenate back to *text*. A character that
        starts no dictionary word is emitted as a one-character token.
    """
    tokens = []
    rest = text
    while rest:
        # Longest prefix first; lengths down to 2 are checked against the
        # dictionary and a single character is the unconditional fallback.
        size = next(
            (n for n in range(len(rest), 1, -1) if rest[:n] in words),
            1,
        )
        tokens.append(rest[:size])
        rest = rest[size:]
    return tokens
# 逆向最大匹配
def backward_max_match(text, words):
    """Segment *text* right to left, greedily taking the longest known word.

    Args:
        text: The string to segment.
        words: A container of dictionary words supporting ``in``.

    Returns:
        A list of tokens in reading order that concatenate back to *text*.
        A character that ends no dictionary word is emitted as a
        one-character token.
    """
    if not text:
        return []

    tokens = []
    end = len(text)
    while end > 0:
        # Longest suffix ending at `end` first; stop at a single character,
        # which is accepted whether or not it is in the dictionary.
        start = 0
        while start < end - 1 and text[start:end] not in words:
            start += 1
        tokens.append(text[start:end])
        end = start

    # Tokens were collected from the tail; one reverse at the end replaces
    # an O(n) insert at the front of the list for every token.
    tokens.reverse()
    return tokens
# 双向最大匹配
def bidirectional_max_match(text, words):
    """Pick the better of the forward and backward maximum-match results.

    Selection rules, applied in order:
      1. The segmentation with fewer tokens wins.
      2. On a tie, the one with fewer single-character tokens wins.
      3. If still tied, the forward result is returned.

    For example, with the words 研究 / 研究生 / 生命 / 起源, the text
    "研究生命起源" gives three tokens in both directions, but the forward
    result contains the stray single character "命", so the backward
    result is chosen.

    Args:
        text: The string to segment.
        words: A container of dictionary words supporting ``in``.

    Returns:
        The list of tokens selected by the rules above.
    """
    forward_result = forward_max_match(text, words)
    backward_result = backward_max_match(text, words)

    if len(forward_result) != len(backward_result):
        return min(forward_result, backward_result, key=len)

    # Comparing total character counts cannot break the tie: both results
    # cover the whole text, so those sums are always equal. Count the
    # single-character tokens instead.
    forward_singles = sum(1 for word in forward_result if len(word) == 1)
    backward_singles = sum(1 for word in backward_result if len(word) == 1)
    if backward_singles < forward_singles:
        return backward_result
    return forward_result
if __name__ == '__main__':
    # The dictionary is looked up in the current working directory.
    vocabulary = load_dict(os.path.join(os.getcwd(), 'dict.txt'))

    # Sample sentence to segment.
    sample = '今天天气很好,我们一起去外面玩吧。'

    # Run each strategy in turn and print its label followed by the tokens.
    strategies = (
        ('正向最大匹配:', forward_max_match),
        ('逆向最大匹配:', backward_max_match),
        ('双向最大匹配:', bidirectional_max_match),
    )
    for label, segment in strategies:
        print(label, segment(sample, vocabulary))
```
其中,`load_dict` 函数用于加载字典,`forward_max_match` 函数用于正向最大匹配,`backward_max_match` 函数用于逆向最大匹配,`bidirectional_max_match` 函数用于双向最大匹配。这里使用的字典文件是当前工作目录下的 `dict.txt`(UTF-8 编码,每行一个词),可根据实际情况替换成其他字典文件。