import urllib.request as urllib2from bs4 import BeautifulSoup as bsfrom urllib.error import URLError, HTTPError, ContentTooShortErrora = []def download(url): print("download..."+url) head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"} request = urllib2.Request(url, headers=head) try: response = urllib2.urlopen(request) except(URLError, HTTPError, ContentTooShortError) as e: print("download error:"+str(e)) html = None return response.read().decode("UTF-8")def parser_html(response): soup = bs(response, 'html.parser') return soup# 打印信息def out_information(soup): # 记录爬取了多少数据 indexSpan = 1 # 打印出title标签的内容 print(soup.title) # 将a标签的信息存储在a列表里 for link in soup.find_all('a'): a.append(str(indexSpan) + link.text) indexSpan += 1 # 打印出a列表的内容 print(a)if __name__ == "__main__": # 需要访问的url url = "https://www.baidu.com/" # 下载url response = download(url) # 解析网页 soup = parser_html(response) # 打印内容 out_information(soup) 优化这段代码
时间: 2024-04-28 21:19:33 浏览: 90
urllib的使用以及360翻译爬取源码.md
这段代码已经很简洁了,但可以进一步优化:
1. 将导入的模块放在代码开头,便于查看和维护;
2. 在函数中使用异常处理来处理错误:下载失败时直接返回 None,并在主程序中先判断返回值再解析,避免原代码在出错后仍访问未赋值的 response 而报错,提高代码的健壮性;
3. 将需要访问的url作为函数参数传入,提高代码的复用性;同时把原来的全局列表 a 改为 out_information 内的局部变量,避免重复调用时结果累积。
优化后的代码如下:
```
import urllib.request as urllib2
from bs4 import BeautifulSoup as bs
from urllib.error import URLError, HTTPError, ContentTooShortError
def download(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request fails with a
        ``URLError`` (a message describing the error is printed first).
    """
    print("download..." + url)
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"}
    request = urllib2.Request(url, headers=head)
    try:
        # The context manager closes the response even if read() fails,
        # so the underlying connection is never leaked.
        with urllib2.urlopen(request) as response:
            body = response.read()
    except URLError as e:
        # HTTPError and ContentTooShortError are subclasses of URLError,
        # so catching the base class covers all three.
        print("download error:" + str(e))
        return None
    return body.decode("UTF-8")
def parser_html(response):
    """Parse HTML text into a BeautifulSoup tree.

    Args:
        response: HTML markup to parse, e.g. the text returned by
            ``download``.

    Returns:
        The BeautifulSoup object built with the ``'html.parser'`` parser.
    """
    return bs(response, 'html.parser')
# Print information
def out_information(soup):
    """Print the page's title tag and the numbered text of every ``<a>`` tag.

    Args:
        soup: Parsed document exposing ``.title`` and ``.find_all('a')``,
            where each returned link has a ``.text`` string.

    Returns:
        None. The title and the list of numbered link texts are printed.
    """
    # Print the content of the title tag.
    print(soup.title)
    # Prefix each link's text with its 1-based position. The list is local,
    # so repeated calls never accumulate entries from earlier pages.
    link_texts = [
        str(index) + link.text
        for index, link in enumerate(soup.find_all('a'), start=1)
    ]
    # Print the collected link texts.
    print(link_texts)
if __name__ == "__main__":
    # URL to visit.
    url = "https://www.baidu.com/"
    # Download the page; download() returns None when the request fails.
    response = download(url)
    if response is not None:
        # Parse the page.
        soup = parser_html(response)
        # Print the content.
        out_information(soup)
```
阅读全文