```python
import csv
import time

import requests
from bs4 import BeautifulSoup

# Spoofed request headers so the site does not flag the client as a crawler
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

list1 = []

# Loop over the paginated URLs and fetch each page
for i in range(0, 10):
    link1 = "https://www.tzvcst.edu.cn/Home/list/xyyw?page=" + str(i + 1)
    r1 = requests.get(link1, headers=headers)
    r1.encoding = 'utf-8'
    # Uncomment to check that the page was fetched
    # print(r1.text)
    # time.sleep(3)

    # Parse the page and pull out the title/date blocks
    soup1 = BeautifulSoup(r1.text, 'lxml')
    # print(soup1)
    div_list1 = soup1.find_all("div", class_="list-t")  # title plus date
    for eachone in div_list1:
        print(eachone.text)
        eachtwo = eachone.li.a['href']
        # print("https://www.nchs.net.cn" + eachtwo)
```
This code uses Python to scrape article titles and links from a website. It sends HTTP requests with the requests library to fetch each page, parses the HTML with BeautifulSoup, locates the relevant tags and attributes, and extracts the desired information. The headers dictionary spoofs a browser User-Agent so the site is less likely to identify the client as a crawler. The for loop walks through the paginated URLs: range generates the page numbers, str converts each number to a string, and the result is concatenated into the full URL. Finally, each article's title and link are printed.
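The script imports csv and defines list1 but never uses either; a natural completion is to collect each title/link pair and write it to a CSV file. Below is a minimal sketch of that step under the same assumptions about the page structure (div.list-t blocks containing an li > a link); the output filename news.csv is an arbitrary choice.

```python
import csv

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

rows = []
for i in range(0, 10):
    link = "https://www.tzvcst.edu.cn/Home/list/xyyw?page=" + str(i + 1)
    r = requests.get(link, headers=headers, timeout=10)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    # Each div.list-t is assumed to hold one article entry: title text plus a relative link
    for item in soup.find_all("div", class_="list-t"):
        title = item.get_text(strip=True)
        href = item.li.a['href'] if item.li and item.li.a else ''
        rows.append([title, href])

# utf-8-sig keeps non-ASCII text readable when the CSV is opened in Excel
with open('news.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'link'])
    writer.writerows(rows)
```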
Related questions
```python
import csv

import requests
from bs4 import BeautifulSoup

# Target URL
url = 'https://s.weibo.com/top/summary'
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Send the request
response = requests.get(url, headers=headers)
# Parse the HTML page
soup = BeautifulSoup(response.text, 'lxml')
# Grab the top-10 hot-search rows (skip the header row)
hot_list = soup.find_all('tr')[1:11]
# Print the results and save them to a CSV file
with open('weibo_hot.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['排名', '标题', '日期', '内容'])
    for i, item in enumerate(hot_list):
        title = item.find_all('a')[0].text
        date = item.find_all('td')[1].text
        content = item.find_all('td')[2].text.strip()
        writer.writerow([i + 1, title, date, content])
        print(f"{i+1}. 标题:{title} 日期:{date} 内容:{content}")
```
This is a Python program that scrapes the Weibo hot-search ranking. It first sends a GET request with the requests library to fetch the HTML page, then parses it with BeautifulSoup. It locates the table rows that hold the hot-search entries and takes the first ten with find_all plus slicing. Finally, it writes each entry's rank, title, date, and content to a CSV file and prints the details to the console.
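One thing the script does not do is check whether the request actually returned the ranking table. A slightly more defensive version of the fetch step, shown as a sketch below, adds a timeout, checks the HTTP status, and skips rows that lack the expected cells; the parsing assumptions are the same as above.

```python
import requests
from bs4 import BeautifulSoup

url = 'https://s.weibo.com/top/summary'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page

soup = BeautifulSoup(response.text, 'lxml')
rows = soup.find_all('tr')[1:11]
if not rows:
    raise RuntimeError("No hot-search rows found; the page layout may have changed or a login may be required")

for i, item in enumerate(rows, start=1):
    cells = item.find_all('td')
    links = item.find_all('a')
    if len(cells) < 3 or not links:
        continue  # skip rows that do not match the expected structure
    print(i, links[0].text, cells[1].text, cells[2].text.strip())
```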
Optimize this code:

```python
import csv

import requests
from bs4 import BeautifulSoup

# Target URL
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"
# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"}
# Send the request
response = requests.get(url, headers=headers)
# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")
# Collect the links to every hero page
hero_links = []
for hero in soup.select(".herolist > li > a"):
    hero_links.append(hero["href"])
# Scrape each hero's attributes
heroes = []
for link in hero_links:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    # Pull out the hero attributes
    name = soup.select(".cover-name")[0].text
    survive = soup.select(".")[0].text
    attack = soup.select(".cover-list-bar data-bar2 fl")[0].text
    skill = soup.select(".skill")[0].text
    difficulty = soup.select(".difficulty")[0].text
    # Store the attributes
    heroes.append({"name": name, "survive": survive, "attack": attack, "skill": skill, "difficulty": difficulty})
# Write the data to a CSV file
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Write the header row
    writer.writeheader()
    # Write the data rows
    for hero in heroes:
        writer.writerow(hero)
```
You can use multithreading or asynchronous requests to speed up the crawl, for example with `ThreadPoolExecutor` from the `concurrent.futures` library. The code can also avoid unnecessary HTTP requests by deduplicating the links in `hero_links`, so the same hero page is never fetched twice. In addition, wrapping the attribute extraction in a function improves readability and maintainability. The optimized code looks like this:
```python
import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"
}

# Scrape one hero's attributes.
# NOTE: the class selectors below follow the original snippet and are assumptions
# about the page markup; verify them against the live page before relying on them.
def get_hero_attrs(link):
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    name = soup.select(".cover-name")[0].text
    survive = soup.select(".cover-list-bar.data-bar1.fl")[0].text  # the original used the invalid selector "."; data-bar1 is an assumption
    attack = soup.select(".cover-list-bar.data-bar2.fl")[0].text   # compound class selector: classes joined with dots, not spaces
    skill = soup.select(".skill")[0].text
    difficulty = soup.select(".difficulty")[0].text
    return {"name": name, "survive": survive, "attack": attack, "skill": skill, "difficulty": difficulty}

# Target URL
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"
# Send the request
response = requests.get(url, headers=headers)
# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")
# Collect every hero link, resolve it against the page URL (the hrefs are relative), and deduplicate
hero_links = set(urljoin(url, hero["href"]) for hero in soup.select(".herolist > li > a"))
# Scrape each hero's attributes in parallel
heroes = []
with ThreadPoolExecutor(max_workers=8) as executor:
    hero_attrs = executor.map(get_hero_attrs, hero_links)
    for attr in hero_attrs:
        heroes.append(attr)
# Write the data to a CSV file
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Write the header row
    writer.writeheader()
    # Write the data rows
    for hero in heroes:
        writer.writerow(hero)
```
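Since the threaded fan-out is the core of the optimization, note that if any single page fails, `executor.map` re-raises the exception while iterating and the remaining results are lost. A hedged variant using `submit` and `as_completed`, sketched below and reusing the `get_hero_attrs` helper and `hero_links` set defined above, keeps the successful results and reports the failures instead:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

heroes = []
with ThreadPoolExecutor(max_workers=8) as executor:
    # Map each future back to the link it was created for, so failures can be reported
    future_to_link = {executor.submit(get_hero_attrs, link): link for link in hero_links}
    for future in as_completed(future_to_link):
        link = future_to_link[future]
        try:
            heroes.append(future.result())
        except Exception as exc:  # one bad page no longer aborts the whole crawl
            print(f"Failed to scrape {link}: {exc}")
```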