import requests from bs4 import BeautifulSoup def getHTMLText(url): try: r = requests.get(url,timeout=30) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except: return "产生异常" def jiexi(html_doc): soup = BeautifulSoup(html_doc,"html.parser") links = soup.find_all('a') for link in links: print(link.name,link["nref"],link.get.next()) if __name__ == 'main': url = "https://wallhaven.cc/toplist?page=7" print(getHTMLText(url))
时间: 2024-04-27 11:21:18 浏览: 7
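这段代码是用Python编写的，主要功能是从指定的网页上获取HTML页面内容；另外定义了函数jiexi，用BeautifulSoup库解析HTML页面，提取其中的所有超链接，并打算输出超链接的名称、地址和文本内容（但主程序并没有调用它，且其中的 link["nref"] 应为 link["href"]，link.get.next() 应为 link.get_text()）。如果获取HTML页面内容失败，则返回字符串"产生异常"。其中，if __name__ == 'main': 的本意是只有当这个.py文件被作为主程序运行时才执行下面的代码，但正确的写法是 if __name__ == '__main__':；按原文的 'main' 写法，条件永远不成立，下面的代码不会被执行。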
相关问题
import requests from bs4 import BeautifulSoup def getHTMLText(url): try: r = requests.get(url,timeout=30) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except: return "产生异常" def jiexi(html_doc): soup = BeautifulSoup(html_doc,"html.parser") links = soup.find_all('a') for link in links: print(link.name,link["nref"],link.get.next()) if __name__ == 'main': url = "https://wallhaven.cc/toplist?page=7" print(getHTMLText(url))改进代码
这段代码可以改进的地方有:
1. 函数名jiexi不太直观,可以改为更明确的parse_links。
2. 在解析链接时,可以加上判断,如果链接是以http或https开头的,才输出链接信息,否则忽略。
3. 在主程序中,应该调用parse_links函数,而不是直接打印HTML页面内容。
改进后的代码如下:
```
import requests
from bs4 import BeautifulSoup
def get_html_text(url):
    """Download ``url`` and return the response body as text.

    The request uses a 30-second timeout, and the text is decoded with the
    encoding that requests detects from the body (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded body, or the literal string "产生异常" when the request
        fails (connection error, timeout, invalid URL, non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are mapped to the sentinel string; a bare
        # ``except`` would also hide KeyboardInterrupt and programming errors.
        return "产生异常"
def parse_links(html_doc):
    """Print every absolute http(s) link found in an HTML document.

    For each ``<a>`` element whose ``href`` starts with ``http://`` or
    ``https://`` one line is printed: tag name, href, and the link text.
    Anchors without an ``href``, or with any other kind of target, are
    skipped.

    Args:
        html_doc: HTML markup to parse.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    for link in soup.find_all('a'):
        # One lookup instead of has_attr() plus repeated indexing; yields
        # None when the attribute is absent.
        href = link.get('href')
        # A tuple lets startswith test both schemes in one call. The old
        # second test ('https') could never matter because 'http' already
        # matched it.
        if href and href.startswith(('http://', 'https://')):
            print(link.name, href, link.get_text())
if __name__ == '__main__':
    # Script entry point: fetch one toplist page and print its links.
    parse_links(get_html_text("https://wallhaven.cc/toplist?page=7"))
```
这样修改后,代码更加清晰易懂,功能也更加完善。
import requests from bs4 import BeautifulSoup allUniv = [] def getHTMLText(url): try: r = requests.get(url,timeout=30) r.raise_for_status() r.encoding = 'utf-8' return r.text except: return "" def fillUnivList(soup): data = soup.find_all('tr') for tr in data: ltd = tr.find_all('td') if len(ltd) == 0: continue singleUniv = [] for td in ltd: singleUniv.append(td.string) allUniv.append(singleUniv) def printUnivList(num): print("{:^4}{:^10}{:^5}{:^8}{:^10}".format("排名","学校名称","省份","总分","培养规模")) for i in range(num): u=allUniv[i] print("{:^4}{:^10}{:^5}{:^8}{:^10}".format(u[0],u[1],u[2],u[3],u[6])) def main(num): url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html' html = getHTMLText(url) soup = BeautifulSoup(html,"html.parser") fillUnivList(soup) printUnivList(num) main(10)数组超出范围
根据你提供的代码，问题出现在访问 `allUniv` 数组时，数组索引超出了范围。具体来说，当 `num` 大于 `allUniv` 的长度时，执行 `printUnivList` 函数时会抛出 `IndexError` 异常。此外，如果某一行的单元格不足 7 个，`u[6]` 也会抛出同样的异常。
你可以在调用 `printUnivList` 函数前,先判断一下 `num` 是否超出了 `allUniv` 的长度,如果超出了则将 `num` 赋值为 `allUniv` 的长度,这样就可以避免数组越界的问题。修改后的代码如下:
```python
import requests
from bs4 import BeautifulSoup
# Module-level accumulator: fillUnivList appends one list of cell values per
# table row, and printUnivList reads the rows back by index.
allUniv = []
def getHTMLText(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    The request uses a 30-second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded body, or an empty string when the request fails
        (connection error, timeout, invalid URL, non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request failures become ""; a bare ``except`` would also hide
        # KeyboardInterrupt and programming errors.
        return ""
def fillUnivList(soup):
    """Append the cell values of every table row in ``soup`` to ``allUniv``.

    Each ``<tr>`` that contains at least one ``<td>`` contributes one list
    holding the ``.string`` of each of its cells, in document order. Rows
    without any ``<td>`` are skipped.

    Args:
        soup: Parsed document exposing ``find_all``.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # Nothing to record for a row that has no <td> cells.
            continue
        allUniv.append([cell.string for cell in cells])
def printUnivList(num):
    """Print a header line followed by the first ``num`` rows of ``allUniv``.

    Columns 0-3 and 6 of each row are printed centred in fixed-width fields.
    If fewer than ``num`` rows are available, all of them are printed. A
    column that is missing from a row, or whose value is ``None``, is shown
    as an empty field instead of raising.

    Args:
        num: Maximum number of rows to print; values <= 0 print only the
            header.
    """
    row_format = "{:^4}{:^10}{:^5}{:^8}{:^10}"
    columns = (0, 1, 2, 3, 6)
    print(row_format.format("排名", "学校名称", "省份", "总分", "培养规模"))
    # Slicing clamps num to the list length; max() keeps a negative num from
    # being read as "all but the last rows".
    for u in allUniv[:max(num, 0)]:
        # Short rows would raise IndexError on u[6], and None cells would
        # raise TypeError in the "{:^N}" format spec, so both become "".
        cells = [
            u[i] if i < len(u) and u[i] is not None else ""
            for i in columns
        ]
        print(row_format.format(*cells))
def main(num):
    """Fetch the ranking page, collect its table rows and print ``num`` of them.

    Args:
        num: Number of rows passed on to ``printUnivList``.
    """
    page = getHTMLText('http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html')
    fillUnivList(BeautifulSoup(page, "html.parser"))
    printUnivList(num)
# Run the scraper, asking for 10 rows to be printed.
main(10)
```
这样就可以避免因 `num` 超过 `allUniv` 长度而导致的数组越界问题了。