Optimize this code:
```python
import requests
from bs4 import BeautifulSoup
import csv

# Target URL
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"
}

# Send the request
response = requests.get(url, headers=headers)

# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")

# Collect the links to all heroes
hero_links = []
for hero in soup.select(".herolist > li > a"):
    hero_links.append(hero["href"])

# Scrape each hero's attributes
heroes = []
for link in hero_links:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    # Extract the hero attributes
    name = soup.select(".cover-name")[0].text
    survive = soup.select(".")[0].text
    attack = soup.select(".cover-list-bar data-bar2 fl")[0].text
    skill = soup.select(".skill")[0].text
    difficulty = soup.select(".difficulty")[0].text
    # Store the hero attributes
    heroes.append({"name": name, "survive": survive, "attack": attack, "skill": skill, "difficulty": difficulty})

# Write the data to a CSV file
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Write the header row
    writer.writeheader()
    # Write the data rows
    for hero in heroes:
        writer.writerow(hero)
```
To speed up the crawl, use multiple threads or async I/O, for example `ThreadPoolExecutor` from the `concurrent.futures` library. You can also cut unnecessary HTTP requests by deduplicating `hero_links` so no hero page is fetched twice, and wrap the attribute extraction in a function to improve readability and maintainability. Two bugs in the original are worth fixing along the way: the hero links are scraped from a single hero's detail page instead of the hero-list page, and the collected hrefs are relative paths that need the site's base URL prepended; several of the CSS selectors (`.`, `.cover-list-bar data-bar2 fl`) are also invalid. The optimized code is as follows:
```python
import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"
}

BASE_URL = "https://pvp.qq.com/web201605/"

def get_hero_attrs(link):
    """Fetch one hero's detail page and return its name and four ratings."""
    response = requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    name = soup.select(".cover-name")[0].text
    # The detail page draws the four ratings (survivability, attack damage,
    # skill effect, difficulty) as bars. The class names .data-bar1..4 and
    # the width style on the inner <i> are assumptions about the page's
    # markup -- verify against the live DOM and adjust if needed.
    def bar_value(n):
        bar = soup.select(f".cover-list-bar.data-bar{n}.fl i")
        return bar[0].get("style", "") if bar else ""
    return {
        "name": name,
        "survive": bar_value(1),
        "attack": bar_value(2),
        "skill": bar_value(3),
        "difficulty": bar_value(4),
    }

# Request the hero-list page (not a single hero's detail page)
url = BASE_URL + "herolist.shtml"
response = requests.get(url, headers=headers)

# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")

# Collect all hero links, deduplicated; the hrefs are relative paths
# ("herodetail/527.shtml"), so join them with the base URL
hero_links = {BASE_URL + a["href"] for a in soup.select(".herolist > li > a")}

# Scrape the heroes concurrently
with ThreadPoolExecutor(max_workers=8) as executor:
    heroes = list(executor.map(get_hero_attrs, hero_links))

# Write the data to a CSV file
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Header row
    writer.writeheader()
    # Data rows
    for hero in heroes:
        writer.writerow(hero)
```
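If you prefer the asynchronous route mentioned above, here is a minimal sketch using `asyncio` with `aiohttp` (a third-party package, `pip install aiohttp`). It reuses the `headers` and `hero_links` from the script above and makes the same unverified selector assumptions as `get_hero_attrs`; treat it as an illustration, not a drop-in replacement:

```python
import asyncio
import aiohttp
from bs4 import BeautifulSoup

def parse_hero(html):
    """Parse one detail page; same (assumed) selectors as get_hero_attrs."""
    soup = BeautifulSoup(html, "html.parser")
    name = soup.select(".cover-name")[0].text
    def bar_value(n):
        bar = soup.select(f".cover-list-bar.data-bar{n}.fl i")
        return bar[0].get("style", "") if bar else ""
    return {"name": name, "survive": bar_value(1), "attack": bar_value(2),
            "skill": bar_value(3), "difficulty": bar_value(4)}

async def fetch(session, link):
    # Return raw bytes so BeautifulSoup can sniff the page encoding itself
    async with session.get(link, headers=headers) as resp:
        return await resp.read()

async def crawl(links):
    # Cap concurrent connections at 8, mirroring max_workers above
    connector = aiohttp.TCPConnector(limit=8)
    async with aiohttp.ClientSession(connector=connector) as session:
        pages = await asyncio.gather(*(fetch(session, link) for link in links))
    return [parse_hero(page) for page in pages]

heroes = asyncio.run(crawl(hero_links))
```

Both versions are I/O-bound, so throughput should be similar at this scale; pick whichever style fits the rest of your codebase.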