# Question (translated from the original post): the spider kept only the last
# page's 30 rows because every call to save() built a brand-new workbook and
# overwrote the output file. The version below accumulates the rows of every
# page on the instance, so each save() writes everything collected so far.
import requests
from bs4 import BeautifulSoup
import openpyxl


class LianJiaSpider():
    """Scrape second-hand housing listings from bj.lianjia.com into a workbook."""

    def __init__(self):
        # `{0}` is replaced by the page number in start().
        self.url = 'https://bj.lianjia.com/ershoufang/pg{0}/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.12022 SLBChan/109'}
        # Rows from every page parsed so far; save() always writes all of them.
        self.rows = []

    def send_request(self, url):
        """Fetch `url` and return the response, or None when the status is not 200."""
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(url, headers=self.headers, timeout=10)
        if resp.status_code == 200:
            return resp
        return None

    def parse_html(self, resp):
        """Extract one tuple per listing from a result page and pass them to save().

        Each tuple is (title, positionInfo, address, followInfo, tag,
        totalPrice, unitPrice), taken from the text of the matching <div>s.
        """
        lst = []
        bs = BeautifulSoup(resp.text, 'lxml')
        ul = bs.find('ul', class_='sellListContent')
        if ul is None:
            # No listing container on this page, so there is nothing to extract.
            return
        field_classes = (
            'title',
            'positionInfo',
            'address',
            'followInfo',
            'tag',
            'totalPrice totalPrice2',
            'unitPrice',
        )
        for item in ul.find_all('li'):
            nodes = [item.find('div', class_=name) for name in field_classes]
            # Skip <li> entries that lack one of the fields instead of crashing
            # on `None.text`.
            if any(node is None for node in nodes):
                continue
            lst.append(tuple(node.text for node in nodes))
        print(lst)
        self.save(lst)

    def save(self, lst):
        """Add `lst` to the collected rows and rewrite the output file with all of them."""
        self.rows.extend(lst)
        wb = openpyxl.Workbook()
        sheet = wb.active
        for row in self.rows:
            sheet.append(row)
        # NOTE: openpyxl writes XLSX-format data whatever the extension is, so
        # this ".csv" file is really a workbook; the path is left unchanged.
        wb.save('D:/爬虫/链家.csv')

    def start(self):
        """Crawl result pages 1 to 4, parsing and saving each page that loads."""
        for i in range(1, 5):
            full_url = self.url.format(i)
            resp = self.send_request(full_url)
            if resp is None:
                # A non-200 page is skipped rather than passed on as None.
                print('request failed, skipping: ' + full_url)
                continue
            self.parse_html(resp)


if __name__ == '__main__':
    lianjia = LianJiaSpider()
    lianjia.start()

# Request (translated from the original post): give the code below a sensible
# request header and add a delay between requests.
import os
import time

import requests
from bs4 import BeautifulSoup


class NovelDownloader:
    """Download every chapter of every book linked from a site's root page."""

    def __init__(self, root_url, delay=1.0, max_retries=5):
        """Store the root URL and the request settings.

        Args:
            root_url: Site root; also used as the prefix for relative links.
            delay: Seconds to sleep before every HTTP request.
            max_retries: Attempts per URL before get_url() gives up.
        """
        self.root_url = root_url
        self.book_list = []
        self.chapter_list = []
        self.delay = delay
        self.max_retries = max_retries
        # Browser-like headers sent with every request.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/92.0.4515.131 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Referer": root_url,
        }

    def get_url(self, url):
        """Return the body of `url`, retrying up to `max_retries` times.

        Raises:
            RuntimeError: If no attempt returned HTTP 200.
        """
        for _ in range(self.max_retries):
            # Sleep before every attempt so requests (and retries) are spaced out.
            time.sleep(self.delay)
            try:
                res = requests.get(url, headers=self.headers, timeout=10)
            except requests.RequestException:
                print("页面获取错误！")
                continue
            if res.status_code == 200:
                print("页面获取成功！")
                return res.text
            print("页面返回异常！", res.status_code)
        raise RuntimeError(
            "failed to fetch %s after %d attempts" % (url, self.max_retries)
        )

    def get_book_list(self):
        """Collect the absolute URL of every `a.name` link on the root page."""
        res = self.get_url(self.root_url)
        html = BeautifulSoup(res, "html.parser")
        for a in html.find_all("a", {"class": "name"}):
            # Prefix only the new links, so a second call does not prepend the
            # root URL to entries that are already absolute.
            self.book_list.append(self.root_url + a["href"])
        # Drop the bare site root if present; list.remove() would raise
        # ValueError when it is absent.
        if 'http://www.biquge5200.cc/' in self.book_list:
            self.book_list.remove('http://www.biquge5200.cc/')

    def get_chapter_list(self, url):
        """Append (href, title) for every `a.chapter` link on a book page."""
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        for a in html.find_all("a", {"class": "chapter"}):
            self.chapter_list.append((a["href"], a.text.replace("\n", "")))

    def get_content(self, chapter):
        """Download one chapter and write it to `<book_name>/<chapter title>`.

        Args:
            chapter: An (href, title) tuple as stored in `chapter_list`.
        """
        url = self.root_url + chapter[0]
        print(url)
        # The book directory is the second "/"-separated piece of the href.
        book_name = chapter[0].split("/")[1]
        print(book_name)
        if not os.path.exists(book_name):
            os.mkdir(book_name)
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        node = html.find("div", {"id": "content"})
        if node is None:
            # No content container on the page; skip rather than crash.
            print("content not found:", url)
            return
        content = node.text
        print(content)
        path = os.path.join(book_name, chapter[1])
        with open(path, "w", encoding="utf8") as f:
            f.write(content)

    def main(self):
        """Fetch the book list, then download each book's chapters once."""
        self.get_book_list()
        for book in self.book_list:
            # chapter_list accumulates across books; remember where this
            # book's chapters start so earlier ones are not downloaded again.
            first_new = len(self.chapter_list)
            try:
                self.get_chapter_list(book)
            except RuntimeError as err:
                print(err)
                continue
            for chapter in self.chapter_list[first_new:]:
                try:
                    self.get_content(chapter)
                except RuntimeError as err:
                    # One unreachable chapter should not abort the whole run.
                    print(err)


if __name__ == '__main__':
    root_url = "http://www.biquge5200.cc/"
    nd = NovelDownloader(root_url)
    nd.main()

from bs4 import BeautifulSoup class NovelDownloader: def __init__(self, root_url): self.root_url = root_url self.book_list = [] self.chapter_list = [] self.headers = { "User-Agent": "Mozilla/...

# Request (translated from the original post): this is a piece of crawler
# code; find the errors in it and correct them.
import os
import time

import requests
from bs4 import BeautifulSoup


class book_spider():
    """Three-level crawler: root page -> book pages -> chapter pages."""

    def __init__(self, root_url):
        self.root_url = root_url
        # Second-level page URLs collected from the first-level page.
        self.book_list = []
        # (third-level page href, chapter name) pairs from second-level pages.
        self.chapter_list = []

    @staticmethod
    def get_url(url, retries=5, delay=1.0):
        """Return the decoded body of `url`, retrying a bounded number of times.

        Declared static because it reads no instance state; calls of the form
        `self.get_url(url)` still work.

        Raises:
            RuntimeError: If no attempt returned HTTP 200.
        """
        for attempt in range(retries):
            if attempt:
                # Pause between retries instead of hammering the server.
                time.sleep(delay)
            try:
                res = requests.get(url, timeout=10)
            except requests.RequestException:
                print("页面获取错误")
                continue
            if res.status_code == 200:
                # Use the encoding detected from the body for decoding.
                res.encoding = res.apparent_encoding
                print("页面获取成功")
                return res.text
            print("页面返回异常", res.status_code)
        raise RuntimeError("failed to fetch %s after %d attempts" % (url, retries))

    def get_book_list(self, url):
        """Collect the absolute URL of every `a.name` link found at `url`."""
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        for a in html.find_all("a", {"class": "name"}):
            # Prefix only the new links, so a repeated call does not prepend
            # the root URL to entries that are already absolute.
            self.book_list.append(self.root_url + a["href"])
        # Exclude this one book if present; list.remove() would raise
        # ValueError when it is absent.
        if 'http://10.1.88.252:7000/庆余年' in self.book_list:
            self.book_list.remove('http://10.1.88.252:7000/庆余年')
        print(self.book_list)

    def get_chapter_list(self, url):
        """Append (href, name) for every `a.chapter` link on a book page."""
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        for a in html.find_all("a", {"class": "chapter"}):
            self.chapter_list.append((a["href"], a.text.replace("\n", "")))

    def get_content(self, chapter):
        """Download one chapter and write it to `<book_name>/<chapter name>`.

        Args:
            chapter: An (href, name) tuple as stored in `chapter_list`.
        """
        url = self.root_url + chapter[0]
        print(url)
        # The book directory is the second "/"-separated piece of the href.
        book_name = chapter[0].split("/")[1]
        print(book_name)
        if not os.path.exists(book_name):
            os.mkdir(book_name)
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        node = html.find("div", {"id": "content"})
        if node is None:
            # No content container on the page; skip rather than crash.
            print("content not found:", url)
            return
        content = node.text
        print(content)
        path = os.path.join(book_name, chapter[1])
        with open(path, "w", encoding="utf8") as f:
            f.write(content)

    def main(self):
        """Fetch the book list, then download each book's chapters once."""
        self.get_book_list(self.root_url)
        for book in self.book_list:
            # chapter_list accumulates across books; remember where this
            # book's chapters start so earlier ones are not downloaded again.
            first_new = len(self.chapter_list)
            try:
                self.get_chapter_list(book)
            except RuntimeError as err:
                print(err)
                continue
            for chapter in self.chapter_list[first_new:]:
                try:
                    self.get_content(chapter)
                except RuntimeError as err:
                    # One unreachable chapter should not abort the whole run.
                    print(err)


book_s = book_spider("http://10.1.88.252:7000")
book_s.main()

from bs4 import BeautifulSoup class book_spider(): def __init__(self,root_url): self.root_url=root_url self.book_list=[] self.chapter_list=[] @staticmethod def get_url(url): while True: try...

# NOTE(review): the first three imports are never used below and look like
# editor auto-import leftovers. `lib2to3` is deprecated and absent from
# Python 3.13+, where this import fails. They are kept because other code may
# rely on them; confirm and remove.
from fileinput import filename
from lib2to3.pgen2 import driver
from xml.etree.ElementInclude import include
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests


class Downloader(object):
    """Open a page in Chrome, locate its <video> source and save it locally."""

    def __init__(self, url):
        """Store `url` and immediately start the download."""
        self.url = url
        self.urlist = []
        self.DownloadUrl()

    # Find the download link.
    def DownloadUrl(self):
        """Load `self.url` in Chrome, read the video `src`, and save the video.

        Raises:
            RuntimeError: If the video element has no `src` attribute.
        """
        browser = webdriver.Chrome()
        try:
            browser.maximize_window()
            # Use the URL given to this instance, not a module-level global.
            browser.get(self.url)
            # Fixed wait before looking the element up.
            time.sleep(5)
            # find_element returns one element; find_elements returns a list,
            # which has no get_attribute().
            video = browser.find_element(By.XPATH, "//video[@class='wbpv-tech']")
            sp = video.get_attribute("src")
        finally:
            # Always close the browser, even when the lookup fails.
            browser.quit()
        if not sp:
            raise RuntimeError("video element has no src attribute")
        print("开始保存链接……%s" % sp)
        self.SavePicture(sp)

    # Save the media file locally.
    def SavePicture(self, sp):
        """Download `sp` with requests and write the bytes to photo.mp4.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # requests does the download, so no second browser is opened here.
        resp = requests.get(sp, timeout=30)
        # Fail loudly instead of saving an error page as the video.
        resp.raise_for_status()
        with open("photo.mp4", "wb") as f:
            f.write(resp.content)
        print("保存完成")


if __name__ == "__main__":
    url = 'https://s.weibo.com/weibo?q=%23%E5%A4%A9%E5%92%8C%E8%A7%86%E8%A7%92%E4%B8%8B%E7%9A%84%E7%A5%9E%E5%8D%81%E4%B8%89%E6%92%A4%E7%A6%BB%23&Refer=top'
    Downloader(url)

其中，class Downloader 是一个包含了初始化函数和下载链接函数的类，它在初始化时传入一个url参数，并在 DownloadUrl 函数中通过使用 Selenium 打开浏览器获取视频的链接，最后在 SavePicture 函数中使用 requests ...

BeautifulSoup速成课：2小时精通HTML_XML文档解析

[BeautifulSoup速成课：2小时精通HTML_XML文档解析](https://www.jeveuxetredatascientist.fr/wp-content/uploads/2022/06/BeautifulSoup-1080x428.jpg) # 1. BeautifulSoup解析基础入门本章将带您进入...

BeautifulSoup高级玩家技巧：提升解析效率与性能优化

[BeautifulSoup高级玩家技巧：提升解析效率与性能优化](https://img-blog.csdnimg.cn/20201221215514430.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0...

【深度剖析】：weipu_qikan_spider源码，专家级爬虫优化策略

# 摘要随着网络信息的指数级增长，网络爬虫技术的应用变得越来越广泛。本文首先介绍了weipu_qikan_spider源码的基本情况，随后深入探讨了爬虫技术的基础和实践，包括请求响应机制、网页解析技术、数据提取与存储...

【Python爬虫必备】：weipu_qikan_spider实战指南，提升数据抓取效率

# 摘要随着互联网信息的爆炸性增长，Python爬虫技术已成为自动化获取网络数据的重要手段。本文旨在提供对Python爬虫技术的基础介绍和实践指导，从weipu_qikan_spider框架的详尽解析开始，到使用该框架进行数据抓取...

网络爬虫实战：requests、BeautifulSoup、Scrapy的综合运用

[网络爬虫实战：requests、BeautifulSoup、Scrapy的综合运用](https://img-blog.csdnimg.cn/20200223002339879.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2...

Python bs4深度解析：掌握HTML_XML文档高效解析的5大技巧

![Python bs4深度解析：掌握HTML_XML文档高效解析的5大技巧](https://img-blog.csdnimg.cn/20190120164642154.png?x-oss-process=image/watermark,type_...Python的第三方库bs4（BeautifulSoup4）是一个强大的库，用于

多线程_异步IO结合使用：提升BeautifulSoup项目效率

[多线程_异步IO结合使用：提升BeautifulSoup项目效率](https://img-blog.csdnimg.cn/20210811201819239.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0...

深入解析BeautifulSoup源码：揭示其强大的内部机制

[深入解析BeautifulSoup源码：揭示其强大的内部机制](https://cdn.educba.com/academy/wp-content/uploads/2022/10/Beautifulsoup-lxml.jpg) # 1. BeautifulSoup入门与基本使用在这一章中，我们将介绍如何开始使用...

【并发编程与列表】：Python列表在多线程_多进程中的应用与注意事项

!...# 1. 并发编程基础与Python列表概览并发编程是现代软件开发中一个重要的领域，它允许程序同时执行多个任务，极大地提高了应用程序的效率和响应能力。在Python中，列表作为一种内置的数据结构，它在并发编程中扮演...

1.访问 http://10.0.200.3:8089/goods_list，完成以下操作: (1)实现商品类，至少包含商品名称、商品价格、商品封面地址等属性(2)编写爬虫实现抓取全部商品，分别使用商品类保存(3)后台打印抓取的商品总条数及商品详情写出代码

from bs4 import BeautifulSoup from typing import List # 定义商品类 class Product: def __init__(self, name: str, price: float, cover_url: str): self.name = name self.price = price self.cover_url =...

网址：https://www.sizhengwang.cn/szzx/index.shtml 解析“工作动态”栏目下的所有新闻标题，并翻页爬取1-5页的新闻标题和新闻来源。格式化打印输出所有标题及来源。Python不使用bs4

self.news_sources.append(data.strip()) # 假设来源信息紧跟在标题之后 def fetch_and_parse_news(url): response = requests.get(url) parser = MyHTMLParser() parser.feed(response.text) for i in range...

通过类继承，实现一个线程类，从https://www.51voa.com/VOA_Standard_3.html (其中 "3"可被替换为其他数字，对应翻页操作)中获取新的链接地址列表。

from bs4 import BeautifulSoup import threading class LinkGetter(threading.Thread): def __init__(self, page_num): threading.Thread.__init__(self) self.page_num = page_num self.links = [] def run...

1. 通过类继承，实现一个线程类，从https://www.51voa.com/VOA_Standard_3.html (其中 "3"可被替换为其他数字，对应翻页操作)中获取新的链接地址列表。2. 通过类继承，实现一个线程类，从1中获取的链接（如https://www.51voa.com/VOA_Standard_English/u-s-supports-diversity-of-energy-sources-in-europe-79541.html）获取mp3文件链接。 3. 通过类继承，实现一个线程类，利用2中的mp3文件链接（如https://files.51voa.cn/201806/fighting-tb-in-uzbekistan.mp3），将文件保存到本地。 4. 通过类继承，实现一个线程类，对存储的音频文件计算语速。 5. 设计一种同步策略（比如用线程池，或锁，或队列等），实现1，2，3，4中几种不同功能线程的配合，实现多线程的mp3文件下载功能，并进行语速的计算和输出。

from bs4 import BeautifulSoup from typing import List class LinkGetterThread(Thread): def __init__(self, page_number: int): super().__init__() self.page_number = page_number self.links = [] def...

面向对象用python登录以下网站https://jywg.18.cn/Login?el=1&clear=&returl=%2fTrade%2fBuy

from bs4 import BeautifulSoup class Login(): def __init__(self, username, password): self.username = username self.password = password self.session = requests.Session() def login(self): url = ...

相关推荐

import sys import os import urllib from bs4 import BeautifulSoup

import re import requests from bs4 import BeautifulSoup import t

python3_cheatsheet:带有python常用术语和示例的网页

BeautifulSoup速成课：2小时精通HTML_XML文档解析

BeautifulSoup高级玩家技巧：提升解析效率与性能优化

【深度剖析】：weipu_qikan_spider源码，专家级爬虫优化策略

【Python爬虫必备】：weipu_qikan_spider实战指南，提升数据抓取效率

网络爬虫实战：requests、BeautifulSoup、Scrapy的综合运用

Python bs4深度解析：掌握HTML_XML文档高效解析的5大技巧

多线程_异步IO结合使用：提升BeautifulSoup项目效率

深入解析BeautifulSoup源码：揭示其强大的内部机制

【并发编程与列表】：Python列表在多线程_多进程中的应用与注意事项

1.访问 http://10.0.200.3:8089/goods_list，完成以下操作: (1)实现商品类，至少包含商品名称、商品价格、商品封面地址等属性(2)编写爬虫实现抓取全部商品，分别使用商品类保存(3)后台打印抓取的商品总条数及商品详情 写出代码

网址：https://www.sizhengwang.cn/szzx/index.shtml 解析“工作动态”栏目下的所有新闻标题，并翻页爬取1-5页的新闻标题和新闻来源。 格式化打印输出所有标题及来源。Python不使用bs4

通过类继承，实现一个线程类， 从https://www.51voa.com/VOA_Standard_3.html (其中 "3"可被替换为其他数字，对应翻页操作)中获取新的链接地址列表。

面向对象用python登录以下网站https://jywg.18.cn/Login?el=1&clear=&returl=%2fTrade%2fBuy

大家在看

RTL8197F_Datasheet-V3.2.pdf

05-北京迅为itop-3568开发板源码编译手册【底板v1.7版】v1.4

Stateflow建模规范

YUV色彩空间深入浅出

ultrascale-plus-fpga-product-selection-guide.pdf

最新推荐

Python Requests模拟登录实现图书馆座位自动预约

租赁合同编写指南及下载资源

【项目管理精英必备】：信息系统项目管理师教程习题深度解析（第四版官方教材全面攻略）

最具代表性的改进过的UNet有哪些？

惠普P1020Plus驱动下载：办公打印新选择

数字电路实验技巧：10大策略，让你的实验效率倍增！

altium designer布线

Rust与OpenGL共同打造的迷宫游戏

数字电路设计基础：9大技巧带你从理论飞跃到实践

ubuntu 安装opencv2

1.访问 http://10.0.200.3:8089/goods_list，完成以下操作: (1)实现商品类，至少包含商品名称、商品价格、商品封面地址等属性(2)编写爬虫实现抓取全部商品，分别使用商品类保存(3)后台打印抓取的商品总条数及商品详情写出代码

网址：https://www.sizhengwang.cn/szzx/index.shtml 解析“工作动态”栏目下的所有新闻标题，并翻页爬取1-5页的新闻标题和新闻来源。格式化打印输出所有标题及来源。Python不使用bs4

通过类继承，实现一个线程类，从https://www.51voa.com/VOA_Standard_3.html (其中 "3"可被替换为其他数字，对应翻页操作)中获取新的链接地址列表。