import requests
import time
from bs4 import BeautifulSoup
import csv

# Browser-like request headers, so the site is less likely to reject the
# default python-requests User-Agent.
list1 = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

# Build the URL of list pages 1..10 in turn and scrape each one.
for i in range(0, 10):
    link1 = "https://www.tzvcst.edu.cn/Home/list/xyyw?page=" + str(i + 1)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r1 = requests.get(link1, headers=headers, timeout=10)
    r1.encoding = 'utf-8'

    soup1 = BeautifulSoup(r1.text, 'lxml')

    # The original wrapped this in `for j in range:`, which iterates the
    # `range` type itself and raises TypeError; `j` was never used, so the
    # loop is dropped.
    # Per the original author's note, each "list-t" div carries the entry
    # together with its date.
    div_list1 = soup1.find_all("div", class_="list-t")
    for eachone in div_list1:
        print(eachone.text)

        # Skip divs without an <li><a href=...> instead of crashing with
        # AttributeError / KeyError.
        first_item = eachone.li
        anchor = first_item.a if first_item is not None else None
        if anchor is None or anchor.get('href') is None:
            continue
        # NOTE(review): the original's commented-out debug print prefixed
        # this href with "https://www.nchs.net.cn", a different host from the
        # one being crawled -- confirm the correct base URL before building
        # absolute links from it.
        eachtwo = anchor['href']

时间: 2024-02-26 21:53:11 浏览: 128

这段代码用Python爬取某个网站列表页中的条目文本和链接。使用requests库发送HTTP请求获取网页内容，使用BeautifulSoup库解析网页内容，找到对应的标签和属性，再把所需的信息提取出来。其中headers是伪装的请求头部，可以降低被网站识别为爬虫的概率。for循环用来获取多页网页网址：range函数生成数字序列，str函数将数字转换为字符串，再拼接成完整的网址。最后，每个条目的文本被打印出来，对应的链接被提取到变量中（打印链接的语句已被注释掉）。

import requests
from bs4 import BeautifulSoup
import csv

# Request URL
url = 'https://s.weibo.com/top/summary'

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Send the request. The timeout prevents an indefinite hang, and the status
# check stops the script before an error page overwrites the CSV with a
# header-only file.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# Parse the HTML page
soup = BeautifulSoup(response.text, 'lxml')

# Hot-search rows: skip the first <tr> and keep the next ten.
hot_list = soup.find_all('tr')[1:11]

# Print each entry and save it to the CSV file.
with open('weibo_hot.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['排名', '标题', '日期', '内容'])
    for rank, item in enumerate(hot_list, start=1):
        anchors = item.find_all('a')
        cells = item.find_all('td')
        # Skip rows that lack an anchor or the three expected cells rather
        # than raising IndexError; the rank still reflects the row position.
        if not anchors or len(cells) < 3:
            continue
        title = anchors[0].text
        # NOTE(review): the "date" and "content" columns are simply the 2nd
        # and 3rd <td> of the row -- confirm they hold what the headers say.
        date = cells[1].text
        content = cells[2].text.strip()
        writer.writerow([rank, title, date, content])
        print(f"{rank}. 标题：{title} 日期：{date} 内容：{content}")

这是一个爬取微博热搜榜的Python程序。首先，使用requests库发送GET请求获取HTML页面，然后使用BeautifulSoup库解析HTML页面。接着，找到热搜列表所在的HTML标签，并且用find_all方法获取前10个热搜。最后，将热搜的排名、标题、日期和内容写入CSV文件，并且在控制台输出热搜的详细信息。

# Request from the asker: optimize the following code.
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin

# Request URL
# NOTE(review): this path points at a single hero's detail page, yet the
# selector below expects a hero list -- confirm the page really contains
# a `.herolist` element.
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"}


def _first_text(page, selector):
    """Return the text of the first element matching *selector*, or ""."""
    node = page.select_one(selector)
    return node.text if node is not None else ""


# Send the request; the timeout prevents an indefinite hang.
response = requests.get(url, headers=headers, timeout=10)

# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")

# Collect every hero link. urljoin resolves relative hrefs against the page
# URL, because requests.get() rejects URLs without a scheme.
hero_links = [
    urljoin(url, hero["href"])
    for hero in soup.select(".herolist > li > a[href]")
]

# Scrape the attributes of each hero.
heroes = []
for link in hero_links:
    detail_response = requests.get(link, headers=headers, timeout=10)
    detail = BeautifulSoup(detail_response.content, "html.parser")

    name = _first_text(detail, ".cover-name")
    # The original selector "." is not valid CSS and makes select() raise.
    # NOTE(review): "data-bar1" is assumed by analogy with the attack bar
    # ("data-bar2") -- confirm against the page markup.
    survive = _first_text(detail, ".cover-list-bar.data-bar1.fl")
    # ".cover-list-bar data-bar2 fl" was a descendant selector for <data-bar2>
    # and <fl> tags; an element carrying all three classes needs the classes
    # chained with dots.
    attack = _first_text(detail, ".cover-list-bar.data-bar2.fl")
    skill = _first_text(detail, ".skill")
    difficulty = _first_text(detail, ".difficulty")

    heroes.append({"name": name, "survive": survive, "attack": attack, "skill": skill, "difficulty": difficulty})

# Write the data to a CSV file.
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Header row, then one row per hero.
    writer.writeheader()
    writer.writerows(heroes)

# Answer: crawling can be sped up with multithreading or async I/O, for
# example with ThreadPoolExecutor from concurrent.futures. Unnecessary HTTP
# requests can also be avoided by de-duplicating the links in hero_links, so
# the same hero is never fetched twice. Finally, the hero-attribute selectors
# can be wrapped in a function to improve readability and maintainability.
# The optimized code follows.
import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"
}


def _first_text(page, selector):
    """Return the text of the first element matching *selector*, or ""."""
    node = page.select_one(selector)
    return node.text if node is not None else ""


def get_hero_attrs(link):
    """Fetch one hero detail page and return its attributes.

    Args:
        link: Absolute URL of the hero detail page.

    Returns:
        A dict with the keys "name", "survive", "attack", "skill" and
        "difficulty". A value is "" when its element is missing from the page.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    name = _first_text(soup, ".cover-name")
    # The original selector "." is not valid CSS and makes select() raise.
    # NOTE(review): "data-bar1" is assumed by analogy with the attack bar
    # ("data-bar2") -- confirm against the page markup.
    survive = _first_text(soup, ".cover-list-bar.data-bar1.fl")
    # Classes on one element are chained with dots; separating them with
    # spaces would instead search for <data-bar2> and <fl> descendant tags.
    attack = _first_text(soup, ".cover-list-bar.data-bar2.fl")
    skill = _first_text(soup, ".skill")
    difficulty = _first_text(soup, ".difficulty")
    return {"name": name, "survive": survive, "attack": attack, "skill": skill, "difficulty": difficulty}


# Request URL
# NOTE(review): this path points at a single hero's detail page, yet the
# selector below expects a hero list -- confirm the page really contains
# a `.herolist` element.
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"

# Send the request; the timeout prevents an indefinite hang.
response = requests.get(url, headers=headers, timeout=10)

# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")

# Collect all hero links and de-duplicate them. dict.fromkeys keeps first-seen
# order, so the CSV row order is deterministic (a set would not guarantee it).
# urljoin resolves relative hrefs, because requests.get() needs a scheme.
hero_links = list(dict.fromkeys(
    urljoin(url, hero["href"])
    for hero in soup.select(".herolist > li > a[href]")
))

# Scrape every hero concurrently; map() yields results in input order and
# re-raises any worker exception here.
with ThreadPoolExecutor(max_workers=8) as executor:
    heroes = list(executor.map(get_hero_attrs, hero_links))

# Write the data to a CSV file.
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Header row, then one row per hero.
    writer.writeheader()
    writer.writerows(heroes)

阅读全文

相关推荐

'''模拟浏览器头部信息'''headers = 'User-Agent': 'Mozilla/5.0 (

import reimport requestsfrom bs4 import BeautifulSoupimport t

requests-random-user-agent:配置请求库以随机选择桌面用户代理

怎样用python的beautifulsoup抓取https://detail.zol.com.cn/gpswatch/huawei/所有页数据

改成获取这个网站的数据https://www.yoojia.com/rank/1-0-0-0-0-0.ht

爬取http://www.ccgp.gov.cn/cr/list网站的所有信息并写入csv文件中？

爬取上海肯德基所有门店数据，包括餐厅名称、餐厅地址、详情信息，存入csv文件中。** 目标网页地址：https://www.kfc.com.cn/kfccda/storelist/index.aspx

用python编写爬取海南招标网站中标公告的代码，爬取页数为10，字段为链接和标题和发布时间和中标金额和全文，并写入excel或csv，网址为https://www.ccgp-hainan.gov.cn/cgw/cgw_list.jsp

大家在看

ASP.NET在线播放器代码大全

Keysight N6705C直流电源分析仪.pdf

CAD二次开发-界面加载框架-代码模板

zotero各种插件，包含翻译，预览，文献管理，影响因子等等

大型滑坡变形稳定性与降雨关系研究

最新推荐

基于遗传算法的动态优化物流配送中心选址问题研究（Matlab源码+详细注释）,遗传算法与免疫算法在物流配送中心选址问题的应用详解（源码+详细注释，Matlab编写，含动态优化与迭代，结果图展示）,遗传

PHP集成Autoprefixer让CSS自动添加供应商前缀

揭秘数字音频编码的奥秘：非均匀量化A律13折线的全面解析

arduino PAJ7620U2

网站啄木鸟：深入分析SQL注入工具的效率与限制

【GPStoolbox使用技巧大全】：20个实用技巧助你精通GPS数据处理

spring boot怎么配置maven

我的个人简历HTML模板解析与应用

3GPP架构深度解析：掌握网络功能与服务框架的关键

Failed to restart vntoolsd.service: Unit vntoolsd.service not found.