```python
''' '''
import re
from urllib.request import urlopen

def getPage(url):   # fetch the page source as a string
    response = urlopen(url)
    return response.read().decode('utf-8')

def parsePage(s):
    ret = com.finditer(s)   # find every match of the com pattern in the page source s, returned as an iterator
    for i in ret:
        yield {
            'id': i.group('id'),
            'title': i.group('title'),
            'rating_num': i.group('rating_num'),
            'comment_num': i.group('comment_num')
        }

def main(num):   # pagination, executed 10 times
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)   # the HTML source for this url, as a str
    ret = parsePage(response_html)
    print(ret)
    f = open('move_info7', 'a', encoding='utf8')
    for obj in ret:
        print(obj)
        data = str(obj)
        f.write(data + '\n')
    f.close()

com = re.compile(
    '<div class = "item">.*?<div class="pic">.*?<em.*?>(?P<id>\d+).*?<span class = "title">(?P<title>.*?)</span>'
    '.*?<span class="rating_num".*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>', re.S)

count = 0
for i in range(10):
    main(count)
    count += 25
```
Review this code and fix it.
This code scrapes the Douban Movie Top 250 list and saves the results to a file. However, it contains a few bugs and rough spots; the following changes are recommended:
1. Rename the output file from 'move_info7' to 'movie_info.txt', and manage it with a `with` statement so it is closed reliably instead of being opened and closed by hand.
2. Rename 'getPage' to 'get_page' (and likewise 'parsePage' to 'parse_page') to follow Python's snake_case naming convention.
3. Rename the parameter 's' to 'html', which is more descriptive.
4. Remove the stray spaces inside the regular expression (e.g. `<div class = "item">` should be `<div class="item">`); with the spaces, the pattern never matches Douban's actual markup, as the sketch after this list shows.
5. There is no need to create or check for the output file before writing: opening it in append mode ('a') creates it if it does not exist.
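To see concretely why the spaces break matching, here is a minimal sketch (the HTML fragment is invented for illustration; in the real page the class attribute is written without spaces, just like this):

```python
import re

html = '<div class="item"><em class="">1</em>'

# Pattern as written in the buggy code: note the spaces around '='
broken = re.compile(r'<div class = "item">.*?<em.*?>(?P<id>\d+)', re.S)
# The same pattern with the spaces removed
fixed = re.compile(r'<div class="item">.*?<em.*?>(?P<id>\d+)', re.S)

print(broken.search(html))             # None -- the literal '<div class = ' never occurs
print(fixed.search(html).group('id'))  # '1'
```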
The corrected code is as follows:
```python
import re
from urllib.request import urlopen


def get_page(url):
    # Fetch the page and return its source as a str
    response = urlopen(url)
    return response.read().decode('utf-8')


def parse_page(html):
    # Raw strings avoid DeprecationWarnings for escapes such as \d
    com = re.compile(
        r'<div class="item">.*?<div class="pic">.*?<em.*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
        r'.*?<span class="rating_num".*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>', re.S)
    ret = com.finditer(html)  # every match of the pattern in the page source, as an iterator
    for i in ret:
        yield {
            'id': i.group('id'),
            'title': i.group('title'),
            'rating_num': i.group('rating_num'),
            'comment_num': i.group('comment_num')
        }


def main(num):
    # Fetch and parse one page of 25 results
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    html = get_page(url)    # the HTML source for this url, as a str
    ret = parse_page(html)  # a generator of dicts, one per movie
    with open('movie_info.txt', 'a', encoding='utf8') as f:
        for obj in ret:
            print(obj)
            f.write(str(obj) + '\n')


if __name__ == '__main__':
    count = 0
    for i in range(10):  # 10 pages cover the full Top 250
        main(count)
        count += 25
```
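One practical caveat: Douban frequently rejects requests that arrive with urllib's default User-Agent, so `urlopen` may fail with an HTTP error. If that happens, a browser-like header can be sent instead; here is a minimal sketch of a drop-in replacement for `get_page` (the User-Agent string is only an example):

```python
from urllib.request import Request, urlopen


def get_page(url):
    # Attach a browser-like User-Agent; the exact string is illustrative
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urlopen(req).read().decode('utf-8')
```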