注释以下代码from re import findall from urllib.parse import urljoin from urllib.request import urlopen, Request url = r'http://jwc.sdtbu.edu.cn/info/2002/5418.htm' headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36', 'Referer': url, } # 不加这一项会有防盗链提示 req = Request(url=url, headers=headers) with urlopen(req) as fp: content = fp.read().decode() pattern = r'<a href="(.+?)"><span>(.+?)</span>' for fileUrl, fileName in findall(pattern, content): if 'javascript' in fileUrl: continue fileUrl = urljoin(url, fileUrl) req = Request(url=fileUrl, headers=headers) with urlopen(req) as fp1: with open(fileName, 'wb') as fp2: fp2.write(fp1.read())
时间: 2024-03-29 22:36:04 浏览: 105
以下是对代码的注释:
```python
# Import the required modules
import shutil
from re import findall, sub
from urllib.parse import urljoin
from urllib.request import urlopen, Request

# Address of the page to crawl
url = r'http://jwc.sdtbu.edu.cn/info/2002/5418.htm'
# Request headers that imitate a browser visit, to avoid anti-crawler checks
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
    'Referer': url,  # without this entry the server shows a hotlink-protection notice
}
# Captures (href, link text) for every <a href="..."><span>...</span> in the page
pattern = r'<a href="(.+?)"><span>(.+?)</span>'
# Seconds to wait on a network operation before giving up instead of hanging
TIMEOUT = 30


def extract_links(content, base_url):
    """Return the downloadable links found in an HTML page.

    Args:
        content: HTML text of the page.
        base_url: URL the page was fetched from; relative hrefs are
            resolved against it.

    Returns:
        A list of ``(absolute_url, link_text)`` tuples in page order.
        Links whose href contains ``'javascript'`` are left out.
    """
    links = []
    for file_url, file_name in findall(pattern, content):
        # Skip pseudo-links such as javascript: handlers; they are not files
        if 'javascript' in file_url:
            continue
        links.append((urljoin(base_url, file_url), file_name))
    return links


def safe_filename(name):
    """Turn link text into a file name that stays in the current directory.

    The text comes from a remote page, so any directory part (``/`` or
    ``\\``) is dropped and characters that are not allowed in file names
    on common platforms are replaced with ``_``.

    Args:
        name: Link text to use as the file name.

    Returns:
        The sanitised name, or ``'download'`` when nothing usable is left.
    """
    # Keep only the last path component, whichever separator was used
    name = name.replace('\\', '/').rsplit('/', 1)[-1]
    name = sub(r'[<>:"|?*\x00-\x1f]', '_', name).strip()
    if name in ('', '.', '..'):
        return 'download'
    return name


def fetch_page(page_url):
    """Download ``page_url`` with the module's headers and return its text.

    The charset declared in the response headers is used for decoding;
    UTF-8 is the fallback when none is declared.
    """
    req = Request(url=page_url, headers=headers)
    with urlopen(req, timeout=TIMEOUT) as fp:
        charset = fp.headers.get_content_charset() or 'utf-8'
        return fp.read().decode(charset)


def download(file_url, file_name):
    """Download ``file_url`` and save it as ``file_name`` (sanitised)."""
    req = Request(url=file_url, headers=headers)
    # The request is opened first, so a failed request leaves no empty file
    with urlopen(req, timeout=TIMEOUT) as fp1, \
            open(safe_filename(file_name), 'wb') as fp2:
        # Stream in chunks instead of holding the whole file in memory
        shutil.copyfileobj(fp1, fp2)


def main():
    """Fetch the page at ``url`` and download every file it links to."""
    content = fetch_page(url)
    for file_url, file_name in extract_links(content, url):
        download(file_url, file_name)


if __name__ == '__main__':
    main()
```
这段代码的作用是从指定网页中提取文件链接,并将对应的文件逐个下载到本地(以链接文字作为文件名,保存在当前工作目录下)。其中,使用了正则表达式对网页内容进行匹配,并使用 urllib 库发送请求和获取响应。在请求头中设置了 User-Agent 字段以模拟浏览器访问,避免被反爬虫机制拦截;Referer 字段则用于通过服务器的防盗链检查。
阅读全文