运行下面代码,运行结果没有保存文件,请帮我找出原因 # -- coding: utf-8 -- # 指定文件编码格式为utf-8 import urllib.request import re def getNovertContent(): url = 'http://www.quannovel.com/read/640/' req = urllib.request.Request(url) req.add_header( 'User-Agent', ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36') data = urllib.request.urlopen(req).read().decode('gbk') str1 = str(data) # 将网页数据转换为字符串 reg = r'<li><a href="(.?)" title=".?">(.?)</a><li>' reg = re.compile(reg) urls = reg.findall(str1) for url in urls: novel_url = url[0] novel_title = url[1] chapt = urllib.request.urlopen(novel_url).read() chapt_html = chapt.decode('gbk') reg = r'</script> (.?)</script> type="text/javascript">' reg = re.compile(reg, re.S) chapt_content = reg.findall(reg, chapt_html) chapt_content = chapt_content[0].replace( " ", "") chapt_content = chapt_content.replace("<br />", "") print("正在保存 %s" % novel_title) with open("{}.txt".format(novel_title), 'w') as f: f.write(chapt_content) getNovertContent()
时间: 2024-02-29 11:54:18 浏览: 81
python按行读取文件并找出其中指定字符串
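代码中存在三个问题导致没有保存文件:
1. 目录页正则表达式中的 `(.?)` 只能匹配零个或一个字符,真实的链接和标题都不止一个字符,所以 `reg.findall(str1)` 返回空列表,`for` 循环体一次也不会执行——这是“没有保存文件、也没有报错”的直接原因。应改为非贪婪匹配 `(.+?)`(章节内容的正则同理)。
2. 在正则表达式匹配章节内容时,应该调用 `reg.findall(chapt_html)` 而不是 `reg.findall(reg, chapt_html)`。已编译的正则对象的 `findall` 签名是 `findall(string[, pos[, endpos]])`,多传一个参数会抛出 TypeError。
3. 在打开文件时,建议显式指定 `encoding='utf-8'`。注意:代码开头的 `# -- coding: utf-8 --` 只声明源文件本身的编码,并不影响 `open()`;`open()` 在不指定 `encoding` 时使用系统区域编码(中文 Windows 上通常是 gbk),遇到该编码无法表示的字符会抛出 UnicodeEncodeError。可以将 `with open("{}.txt".format(novel_title), 'w') as f:` 改为 `with open("{}.txt".format(novel_title), 'w', encoding='utf-8') as f:`。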
修改后的代码如下:
```python
# -- coding: utf-8 --
# 指定文件编码格式为utf-8
import urllib.request
import re
def getNovertContent():
url = 'http://www.quannovel.com/read/640/'
req = urllib.request.Request(url)
req.add_header('User-Agent', ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36')
data = urllib.request.urlopen(req).read().decode('gbk')
str1 = str(data) # 将网页数据转换为字符串
reg = r'<li><a href="(.+?)" title=".+?">(.+?)</a><li>'
reg = re.compile(reg)
urls = reg.findall(str1)
for url in urls:
novel_url = url[0]
novel_title = url[1]
chapt = urllib.request.urlopen(novel_url).read()
chapt_html = chapt.decode('gbk')
reg = r'</script> (.+?)</script> type="text/javascript">'
reg = re.compile(reg, re.S)
chapt_content = reg.findall(chapt_html)
chapt_content = chapt_content[0].replace(" ", "")
chapt_content = chapt_content.replace("<br />", "")
print("正在保存 %s" % novel_title)
with open("{}.txt".format(novel_title), 'w', encoding='utf-8') as f:
f.write(chapt_content)
# Script entry point: start the download as soon as this file is executed.
getNovertContent()
```
阅读全文