帮我用Python代码写一个可以批量下载人民日报的PDF文件并且统计其中的词语重复率的语句代码
时间: 2023-05-13 12:06:09 浏览: 72
以下是可以批量下载人民日报的PDF文件并统计其中词语重复率的Python代码:
```python
import requests
from bs4 import BeautifulSoup
import os
import PyPDF2
import re
# Collect PDF download links from People's Daily (人民日报) e-paper index pages.
def get_pdf_links(year_month='2022-01', days=range(1, 6)):
    """Scrape the e-paper index pages and collect PDF download links.

    Args:
        year_month: 'YYYY-MM' path segment of the target month
            (default '2022-01', matching the original hard-coded value).
        days: iterable of day-of-month integers to fetch (default 1-5,
            matching the original hard-coded range).

    Returns:
        list[str]: href values of anchors whose text ends with '版PDF'.

    Raises:
        requests.HTTPError: if an index page returns a non-2xx status.
    """
    url = 'http://paper.people.com.cn/rmrb/html/{}/nbs.D110000renmrb_01.htm'
    pdf_links = []
    for day in days:
        # Zero-pad the day so the path matches the site's URL scheme.
        # The original "'0{}'.format(i)" hack produced '010', '011', ...
        # for days >= 10 and only worked for the first nine days.
        date = '{}/{:02d}'.format(year_month, day)
        # Timeout so a stalled server cannot hang the whole batch;
        # raise_for_status so we never try to parse an HTTP error page.
        res = requests.get(url.format(date), timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        for link in soup.select('div.right_title-list ul li a'):
            if link.text.endswith('版PDF'):
                # NOTE(review): these hrefs may be relative to the page URL —
                # verify against the live site before downloading directly.
                pdf_links.append(link['href'])
    return pdf_links
# Download a single PDF to a local file.
def download_pdf(url, path):
    """Download the file at *url* and write its bytes to *path*.

    Args:
        url: direct URL of the PDF.
        path: destination file path (the parent directory must exist).

    Raises:
        requests.HTTPError: if the server returns a non-2xx status.
    """
    # Timeout so one dead link cannot hang the batch; raise_for_status so a
    # 404/500 HTML error page is never silently saved to disk as a ".pdf".
    res = requests.get(url, timeout=60)
    res.raise_for_status()
    with open(path, 'wb') as f:
        f.write(res.content)
# Compute how often *word* occurs relative to the whitespace-token count.
def count_word_frequency(pdf_path, word):
    """Return the ratio of occurrences of *word* to whitespace-separated tokens.

    Args:
        pdf_path: path to a local PDF file.
        word: substring to count (counted via str.count, so overlapping
            whitespace boundaries are not required).

    Returns:
        float: word occurrences divided by total tokens, or 0.0 when the PDF
        yields no extractable text.
    """
    with open(pdf_path, 'rb') as f:
        # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
        # legacy PyPDF2 1.x API, removed in PyPDF2 3.x (there it is
        # PdfReader/len(reader.pages)/reader.pages[i]/extract_text) — confirm
        # the installed PyPDF2 version before running.
        pdf_reader = PyPDF2.PdfFileReader(f)
        pages = []
        for i in range(pdf_reader.getNumPages()):
            # extractText may return '' for pages without a text layer.
            pages.append(pdf_reader.getPage(i).extractText())
    # Join once instead of quadratic += concatenation in the loop.
    text = re.sub(r'\s+', ' ', ''.join(pages))
    total_words = len(text.split())
    if total_words == 0:
        # Guard: scanned/image-only PDFs produce no text; the original code
        # crashed here with ZeroDivisionError.
        return 0.0
    return text.count(word) / total_words
if __name__ == '__main__':
    # Ensure the output directory exists up front — the original script
    # crashed with FileNotFoundError when ./pdfs was missing.
    os.makedirs('pdfs', exist_ok=True)
    pdf_links = get_pdf_links()
    for link in pdf_links:
        # NOTE(review): assumes each href is an absolute URL ending in the
        # file name — confirm against what get_pdf_links actually returns.
        filename = link.split('/')[-1]
        path = os.path.join('pdfs', filename)
        download_pdf(link, path)
        # Report the relative frequency of '中国' in each downloaded issue.
        frequency = count_word_frequency(path, '中国')
        print('{} 中词语重复率:{}'.format(filename, frequency))
```
请注意,这段代码仅供参考,可能需要根据实际情况进行修改和调整。