def dataProsessing(filename, stop_path='StopWord.txt'):
    """Tokenize a labelled Chinese corpus and group the tokens by label.

    Every non-blank line of *filename* is read as ``line[0]`` = a one-digit
    class label and ``line[2:]`` = the text.  Only characters in the range
    U+4E00..U+9FA5 are kept, the remainder is segmented with ``jieba.cut``,
    and tokens that are a single character long or appear in the stop-word
    file are discarded.

    Args:
        filename: Path of the UTF-8 encoded corpus file.
        stop_path: Path of the UTF-8 encoded stop-word file, one word per
            line.  Defaults to ``'StopWord.txt'`` in the working directory.

    Returns:
        A 7-tuple ``(contentList, classVec, label0tip, label1tip, label2tip,
        label3tip, Numberset)`` where

        * ``contentList`` holds one token list per processed line,
        * ``classVec`` holds the integer label of each processed line,
        * ``labelNtip`` is the flat list of tokens from lines labelled N,
        * ``Numberset[N]`` is the number of lines labelled N (N in 0..3).

        Lines whose label is a digit other than 0-3 still contribute to
        ``contentList`` and ``classVec`` but are not counted per label.

    Raises:
        ValueError: If a non-blank line does not start with a digit.
        OSError: If either file cannot be opened.
    """
    # Load the stop words once, up front; a set gives O(1) membership tests
    # and the ``with`` block guarantees the handle is closed.
    with open(stop_path, 'r', encoding='utf-8') as stop_file:
        stop_words = {row.strip() for row in stop_file}

    # Matches runs of CJK characters; everything else is filtered out.
    chinese_run = re.compile('[\u4e00-\u9fa5]+')
    counted_labels = ('0', '1', '2', '3')

    contentList = []
    classVec = []
    label_tips = [[], [], [], []]
    Numberset = [0, 0, 0, 0]

    with open(filename, encoding='UTF-8') as corpus:
        for line in corpus:
            # Skip blank lines (e.g. a trailing newline) instead of crashing
            # on int('\n').
            if not line.strip():
                continue

            label = line[0]
            classVec.append(int(label))

            # Step 1: keep only the Chinese characters, then segment.
            contentstr = "".join(chinese_run.findall(line[2:]))
            content = [
                item for item in jieba.cut(contentstr)
                if len(item) > 1 and item not in stop_words
            ]
            contentList.append(content)

            if label in counted_labels:
                index = int(label)
                Numberset[index] += 1
                label_tips[index].extend(content)

    return (contentList, classVec, label_tips[0], label_tips[1],
            label_tips[2], label_tips[3], Numberset)
时间: 2024-02-14 11:07:57 浏览: 24
好的,这段代码可以进行如下优化:
1. 将文件读取和关闭操作放在 with 语句中,可以自动关闭文件。
2. 可以使用列表推导式来简化代码。
3. 可以使用 defaultdict 来统计每个标签对应的单词。
下面是优化后的代码:
```
import re
import jieba
from collections import defaultdict
def data_processing(filename, stop_path='StopWord.txt'):
    """Tokenize a labelled Chinese corpus and group the tokens by label.

    Every non-blank line of *filename* is read as ``line[0]`` = a one-digit
    class label and ``line[2:]`` = the text.  Only characters in the range
    U+4E00..U+9FA5 are kept, the remainder is segmented with ``jieba.cut``,
    and tokens that are a single character long or appear in the stop-word
    file are discarded.

    Args:
        filename: Path of the UTF-8 encoded corpus file.
        stop_path: Path of the UTF-8 encoded stop-word file, one word per
            line.  Defaults to ``'StopWord.txt'`` in the working directory.

    Returns:
        A 7-tuple ``(content_list, class_vec, label0_tip, label1_tip,
        label2_tip, label3_tip, numberset)`` where

        * ``content_list`` holds one token list per processed line,
        * ``class_vec`` holds the integer label of each processed line,
        * ``labelN_tip`` is the flat list of tokens from lines labelled N,
        * ``numberset[N]`` is the number of lines labelled N (N in 0..3).

    Raises:
        ValueError: If a non-blank line does not start with a digit.
        OSError: If either file cannot be opened.
    """
    # Read the stop words inside ``with`` so the handle is closed, and keep
    # them in a set for O(1) membership tests.
    with open(stop_path, 'r', encoding='utf-8') as stop_file:
        stop = {row.strip() for row in stop_file}

    # Matches runs of CJK characters; everything else is filtered out.
    chinese_run = re.compile('[\u4e00-\u9fa5]+')
    counted_labels = ('0', '1', '2', '3')

    content_list = []
    class_vec = []
    label_tip = defaultdict(list)
    numberset = [0, 0, 0, 0]

    with open(filename, encoding='UTF-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing
            # on int('\n').
            if not line.strip():
                continue

            label = line[0]
            class_vec.append(int(label))

            content_str = "".join(chinese_run.findall(line[2:]))
            content = [
                item for item in jieba.cut(content_str)
                if len(item) > 1 and item not in stop
            ]
            content_list.append(content)
            label_tip[label].extend(content)

            # Only labels 0-3 have a counter; other digits must not raise
            # IndexError on ``numberset``.
            if label in counted_labels:
                numberset[int(label)] += 1

    label0_tip = label_tip['0']
    label1_tip = label_tip['1']
    label2_tip = label_tip['2']
    label3_tip = label_tip['3']
    return (content_list, class_vec, label0_tip, label1_tip, label2_tip,
            label3_tip, numberset)
```
优化后的代码将函数名和变量名统一改为 snake_case 风格(例如 dataProsessing 改为 data_processing,原有的调用处需要同步修改),更符合 Python 的 PEP 8 编码规范。同时,通过使用 defaultdict,可以避免手动创建各个标签对应的列表,代码更加简洁。