```
import urllib.request as urllib2
from bs4 import BeautifulSoup as bs
from urllib.error import URLError, HTTPError, ContentTooShortError

a = []

def download(url):
    print("download..." + url)
    try:
        html = urllib2.urlopen(url)
    except(URLError, HTTPError, ContentTooShortError) as e:
        print("download error:" + e.reason)
        html = None
    return html

def parser_html(response):
    soup = bs(response, 'html.parser')
    return soup

# Print the scraped information
def out_information(soup):
    # Counter for how much data has been scraped
    indexSpan = 1
    # Print the content of the title tag
    print(soup.title.string)
    # Store the text of each a tag in the list a
    for link in soup.find_all('a'):
        a.append(str(indexSpan) + link.text)
        indexSpan += 1
    # Print the contents of the list a
    print(a)

if __name__ == "__main__":
    # URL to visit
    url = "https://www.baidu.com/"
    # Download the url
    response = download(url)
    # Parse the page
    soup = parser_html(response)
    # Print the results
    out_information(soup)
```
This code downloads the page at the given URL, parses it with BeautifulSoup, and then prints the page's title tag and the text of every link (a tag).
However, the code has a few problems:
1. In Python 3 there is no standalone `urllib2` module, so the import must be written as `import urllib.request as urllib2`; a bare `import urllib2` raises an ImportError.
2. In the `except` clause, `e.reason` should be replaced with `str(e)` (or with `e.reason.strerror`), because `reason` is not guaranteed to be a string and concatenating it onto `"download error:"` can raise a TypeError.
3. The `out_information` function appends to the module-level list `a`, so `a = []` must be defined before the function is called; keep that line at the top of the file, as shown above.
Here is the revised code:
```
import urllib.request as urllib2
from bs4 import BeautifulSoup as bs
from urllib.error import URLError, HTTPError, ContentTooShortError

a = []

def download(url):
    print("download..." + url)
    try:
        html = urllib2.urlopen(url)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print("download error:" + str(e))
        html = None
    return html

def parser_html(response):
    soup = bs(response, 'html.parser')
    return soup

# Print the scraped information
def out_information(soup):
    # Counter for how much data has been scraped
    indexSpan = 1
    # Print the content of the title tag
    print(soup.title.string)
    # Store the text of each a tag in the list a
    for link in soup.find_all('a'):
        a.append(str(indexSpan) + link.text)
        indexSpan += 1
    # Print the contents of the list a
    print(a)

if __name__ == "__main__":
    # URL to visit
    url = "https://www.baidu.com/"
    # Download the url
    response = download(url)
    # Parse the page
    soup = parser_html(response)
    # Print the results
    out_information(soup)
```
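One weak spot remains: when `download` fails it returns `None`, and `bs(None, 'html.parser')` inside `parser_html` will then raise an exception. The revised code also hands the raw `HTTPResponse` object to BeautifulSoup, which works but leaves encoding detection entirely to the parser. Below is a minimal, defensive sketch of the same flow; the `User-Agent` header, the explicit `decode`, and the `None` guard are illustrative additions, not part of the original answer.
```
# Defensive variant (sketch): guard against a failed download,
# send a browser-like User-Agent, and decode the body explicitly.
import urllib.request as urllib2
from urllib.error import URLError, HTTPError, ContentTooShortError
from bs4 import BeautifulSoup as bs

def download(url):
    print("download..." + url)
    # Some sites return a stripped-down page to the default urllib
    # User-Agent, so an explicit header is set here (illustrative choice).
    req = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        response = urllib2.urlopen(req)
        return response.read().decode("utf-8", errors="replace")
    except (URLError, HTTPError, ContentTooShortError) as e:
        print("download error:" + str(e))
        return None

if __name__ == "__main__":
    url = "https://www.baidu.com/"
    html = download(url)
    if html is None:
        # Nothing was downloaded, so skip parsing instead of crashing
        print("nothing to parse")
    else:
        soup = bs(html, "html.parser")
        print(soup.title.string)
        for index, link in enumerate(soup.find_all("a"), start=1):
            print(str(index) + link.text)
```
With the guard in place, a network failure prints a message and exits cleanly instead of failing inside the parsing step.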