stopwords = stopwordslist()
stopwords.append('|')
This is not a question but a piece of code. It calls a function to obtain a stop-word list and then appends the vertical-bar character '|' to that list. Stop-word removal is a common text-preprocessing step in natural language processing: the list typically contains frequent, low-information words such as 'the', 'a', and 'an', which are ignored during text analysis. Adding '|' to the stop-word list here is presumably meant to make the vertical bar be ignored as well during text processing.
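As a rough illustration, here is a minimal sketch of how such a stopwordslist() helper is often written and used; the file name stopwords.txt and the token-filtering loop below are assumptions, not part of the original code:

import re

def stopwordslist(path='stopwords.txt'):
    # Load one stop word per line from a text file (assumed file name).
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]

stopwords = stopwordslist()
stopwords.append('|')  # also treat the vertical bar as a stop word

tokens = ['the', 'cat', '|', 'sat']
filtered = [t for t in tokens if t not in stopwords]  # tokens kept for analysis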
Related question
Given the above, how should this code be improved?

# Remove stop words
def deleteStop(sentence):
    stopwords = stopwordslist()
    outstr = ''
    for i in sentence:
        if i not in stopwords and i != '\n':
            outstr += i
    return outstr

def wordCut(Review):
    Mat = []
    for rec in Review:
        seten = []
        rec = re.sub('[%s]' % re.escape(string.punctuation), '', rec)
        fenci = jieba.lcut(rec)       # precise-mode segmentation
        stc = deleteStop(fenci)       # remove stop words
        seg_list = pseg.cut(stc)      # part-of-speech tagging
        for word, flag in seg_list:
            if flag not in ['nr', 'ns', 'nt', 'nz', 'm', 'f', 'ul', 'l', 'r', 't']:
                seten.append(word)
        Mat.append(seten)
    return Mat

trainCut = wordCut(trainReview)
testCut = wordCut(testReview)
wordCut = trainCut + testCut
This code can be improved in several ways (a sketch applying some of these suggestions follows the list):
1. The way the stop-word list is obtained can be improved: use a more comprehensive list, or build a custom list for the specific dataset.
2. The segmentation step could use a more capable tool or mode, such as jieba's other modes or another segmenter.
3. Punctuation removal can be streamlined, for example with a pre-compiled regular expression.
4. Part-of-speech tagging could use a more accurate tagger.
5. The code style can be made consistent, e.g. naming and indentation conventions.
6. Variable names can be more descriptive, which makes the code easier to read and understand.
7. The segmented training and test sets can be stored separately for later use; note that the last line, wordCut = trainCut + testCut, overwrites the wordCut function itself.
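A minimal sketch of how the segmentation code might look after these changes; stopwordslist(), trainReview, and testReview are assumed to exist exactly as in the original code, and the part-of-speech filter is kept unchanged:

import re
import string

import jieba
import jieba.posseg as pseg

# Part-of-speech flags to drop, same set as in the original filter.
DROP_FLAGS = {'nr', 'ns', 'nt', 'nz', 'm', 'f', 'ul', 'l', 'r', 't'}
PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))

def remove_stopwords(tokens, stopwords):
    # Keep tokens that are neither stop words nor newlines.
    return ''.join(t for t in tokens if t not in stopwords and t != '\n')

def cut_reviews(reviews, stopwords):
    matrix = []
    for review in reviews:
        review = PUNCT_RE.sub('', review)           # strip punctuation
        tokens = jieba.lcut(review)                 # precise-mode segmentation
        text = remove_stopwords(tokens, stopwords)  # remove stop words
        kept = [word for word, flag in pseg.cut(text) if flag not in DROP_FLAGS]
        matrix.append(kept)
    return matrix

stopwords = set(stopwordslist())                    # set membership tests are faster than a list
train_cut = cut_reviews(trainReview, stopwords)
test_cut = cut_reviews(testReview, stopwords)
all_cut = train_cut + test_cut                      # keep train/test separate, combine only when needed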
Optimize this code:

import requests
from bs4 import BeautifulSoup
import jieba

url = "http://xc.hfut.edu.cn/1955/list{}.htm"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

news_list = []
for i in range(1, 6):  # crawl the news titles on the first 5 pages
    res = requests.get(url.format(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    news = soup.find_all("span", {"class": "news_title"})
    for n in news:
        news_list.append(n.a.string)

# Segment the news titles
words_list = []
for news in news_list:
    words = jieba.cut(news)
    for word in words:
        words_list.append(word)

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# Load the background image
image = Image.open("C:\\xhktSoft\huahua.jpg")
graph = np.array(image)

# Set the stop words
stop_words = ["的", "是", "在", "了", "和", "与", "也", "还", "有", "就", "等", "中", "及", "对", "是"]

# Generate the word cloud
wc = WordCloud(font_path="msyh.ttc", background_color='white', max_words=200, mask=graph, stopwords=stop_words, max_font_size=200, random_state=42)
wc.generate_from_text(" ".join(words_list))

# Draw the word cloud
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# Fetch the news titles from the first five list pages
def get_news_titles(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    news_list = []
    for i in range(1, 6):
        res = requests.get(url.format(i), headers=headers)
        soup = BeautifulSoup(res.text, "html.parser")
        news = soup.find_all("span", {"class": "news_title"})
        for n in news:
            if n.a and n.a.string:  # skip entries without a usable title
                news_list.append(n.a.string)
    return news_list

# Segment the news titles with jieba
def cut_words(news_list):
    words_list = []
    for news in news_list:
        words = jieba.cut(news)
        for word in words:
            words_list.append(word)
    return words_list

# Generate and display the word cloud
def generate_wordcloud(words_list, graph):
    stop_words = {"的", "是", "在", "了", "和", "与", "也", "还", "有", "就", "等", "中", "及", "对"}
    wc = WordCloud(font_path="msyh.ttc", background_color='white', max_words=200, mask=graph,
                   stopwords=stop_words, max_font_size=200, random_state=42)
    wc.generate_from_text(" ".join(words_list))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Main entry point
if __name__ == '__main__':
    url = "http://xc.hfut.edu.cn/1955/list{}.htm"
    news_list = get_news_titles(url)
    words_list = cut_words(news_list)
    graph = np.array(Image.open("C:\\xhktSoft\\huahua.jpg"))  # escape both backslashes in the Windows path
    generate_wordcloud(words_list, graph)
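If the crawl needs to be more robust, one further hedged option (a sketch, not part of the original answer) is to give each request a timeout and fail fast on HTTP error responses; fetch_page below is an illustrative helper, not from the original code:

import requests

def fetch_page(page_url, headers):
    # Fail fast on network stalls and on 4xx/5xx responses.
    res = requests.get(page_url, headers=headers, timeout=10)
    res.raise_for_status()  # raises requests.HTTPError on error status codes
    return res.text

get_news_titles could then call fetch_page instead of calling requests.get directly, keeping the rest of the pipeline unchanged.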