import requests import os from bs4 import BeautifulSoup class book_spider(): def init(self,root_url): self.root_url=root_url self.book_list=[] #一级页面中获取的数据（二级页面地址）存放在此列表 self.chapter_list=[] #二级页面中获取的数据（三级页面网址和章节名称）存放此列表 def get_url(self,url): while True: try: res=requests.get(url) if res.status_code==200: #返回http请求状态码 200为正常 res.encoding = res.apparent_encoding #页面编码方式使用页面声明的编码方式 print("页面获取成功！") return res.text else: print("页面返回异常！",res.status_code) except: print("页面获取错误！") def get_book_list(self,url): res =self.get_url(url) html=BeautifulSoup(res,"html.parser") a_list = html.find_all("a",{"class":"name"}) for a in a_list: self.book_list.append(a["href"]) self.book_list=[self.root_url+i for i in self.book_list] self.book_list.remove("http://10.1.88.252:7000/庆余年") def get_chapter_list(self,url): res =self.get_url(url) html = BeautifulSoup(res,"html.parser") a_list = html.find_all("a",{"class":"chapter"}) for a in a_list: self.chapter_list.append((a["href"],a.text.replace("\n",""))) def get_content(self,chapter): url =self.root_url +chapter[0] print(url) book_name = chapter[0].split("/")[1] print(book_name) if not os.path.exists(book_name): os.mkdir(book_name) res =self.get_url(url) html = BeautifulSoup(res,"html.parser") content = html.find("div",{"id":"content"}).text print(content) path=os.path.join(book_name,chapter[1]) with open(path,"w",encoding="utf8")as f: f.write(content) def main(self): self.get_book_list(self.root_url) for book in self.book_list: self.get_chapter_list(book) for chapter in self.chapter_list: self.get_content(chapter) book_s= book_spider("http://10.1.88.252:7000") book_s.main()代码解读

时间: 2024-04-28 09:25:13 浏览: 181

这段代码是一个爬虫程序，其目的是爬取一本小说的章节内容并保存到本地文件中。首先定义了一个book_spider类，初始化时需要传入一个根URL作为参数。该类包含三个方法： 1. get_url(self, url)：获取指定URL的页面内容，并返回页面的HTML文本。 2. get_book_list(self, url)：获取一级页面中的数据（即小说章节列表），并将二级页面地址存入self.book_list列表中。 3. get_chapter_list(self, url)：获取二级页面中的数据（即章节名和章节内容所在的网址），并将其存入self.chapter_list列表中。另外还有一个主函数main()，其中调用了上述三个方法，遍历self.chapter_list列表中的每个元素（即章节名和网址），调用get_content()方法获取章节内容，并将其保存到本地文件中。整个爬虫程序的流程是先获取小说章节列表，再通过章节列表获取每个章节的内容并保存到本地文件中。

import requests import os from bs4 import BeautifulSoup class book_spider(): def init(self,root_url): self.root_url=root_url self.book_list=[] #一级页面中获取的数据（二级页面地址）存放于此列表 self.chapter_list=[] #二级页面中获取的数据（三级页面地址和章节名）存放于此列表 def get_url(url): while True: try: res=requests.get(url) if res.status_code==200: res.encoding =res.apparent_encoding print("页面获取成功") return res.text else: print("页面返回异常",res.status_code) except: print("页面获取错误") def get_book_list(self.url): res = self.get_url(url) html = BeautifulSoup(res,"html.parser") a_list = html.find_all("a",{"class":"name"}) for a in a_list: self.book_list.append(a["href"]) self.book_list = [self.root_url+i for i in self.bbok_list] self.book_list.remove('http://10.1.88.252:7000/庆余年') print(book_list) def get_chapter_list(self,url): res = self.get_url(url) html = BeautifulSoup(res,"html.parser") a_list = html.find_all("a",{"class":"chapter"}) for a in a_list: self.chapter_list.append((a["href"],a.text.replace("\n",""))) def get_content(self.chapter): url = self.root_url + chapter[0] print(url) book_name = chapter[0].split("/")[1] print(book_name) if not os.path.exists(book_name): os.mkdir(book_name) res = self.get_url(url) html = BeautifulSoup(res,"html.parser") content = html.find("div",{"id":"content"}).text print(content) path = os.path.join(book_name,chapter[1]) with open(path,"w",encoding="utf8") as f: f.write(content) def main(): self.get_book_list(self.root_url) for book in self.book_list: self.get_chapter_liat(book) for chapter in chapter_list: self.get_content(chapter) book_s = book_spider("http://10.1.88.252:7000") book_s.main()这是一段爬虫代码，找出里面的错误并改正

有以下几个错误： 1. 在 `get_url` 方法中，需要在 `self` 参数之前添加 `@staticmethod` 装饰器，使其成为静态方法。 2. 在 `get_book_list` 方法中，`print(book_list)` 应该改为 `print(self.book_list)`，因为 `book_list` 是类成员变量。 3. 在 `get_book_list` 方法中，将 `self.bbok_list` 改为 `self.book_list`。 4. 在 `get_chapter_list` 方法中，将 `self.chapter_list.append((a["href"],a.text.replace("\n","")))` 改为 `self.chapter_list.append((self.root_url+a["href"],a.text.replace("\n","")))`，因为需要获取完整的章节链接。 5. 在 `get_content` 方法中，将 `print(url)` 和 `print(book_name)` 改为注释，因为这些是调试信息。改正后的代码如下： ``` import requests import os from bs4 import BeautifulSoup class book_spider(): def __init__(self,root_url): self.root_url=root_url self.book_list=[] self.chapter_list=[] @staticmethod def get_url(url): while True: try: res=requests.get(url) if res.status_code==200: res.encoding =res.apparent_encoding print("页面获取成功") return res.text else: print("页面返回异常",res.status_code) except: print("页面获取错误") def get_book_list(self,url): res = self.get_url(url) html = BeautifulSoup(res,"html.parser") a_list = html.find_all("a",{"class":"name"}) for a in a_list: self.book_list.append(self.root_url + a["href"]) self.book_list.remove('http://10.1.88.252:7000/庆余年') print(self.book_list) def get_chapter_list(self,url): res = self.get_url(url) html = BeautifulSoup(res,"html.parser") a_list = html.find_all("a",{"class":"chapter"}) for a in a_list: self.chapter_list.append((self.root_url+a["href"],a.text.replace("\n",""))) def get_content(self,chapter): url = chapter[0] book_name = chapter[0].split("/")[1] if not os.path.exists(book_name): os.mkdir(book_name) res = self.get_url(url) html = BeautifulSoup(res,"html.parser") content = html.find("div",{"id":"content"}).text path = os.path.join(book_name,chapter[1]) with open(path,"w",encoding="utf8") as f: f.write(content) def main(self): self.get_book_list(self.root_url) for book in self.book_list: self.get_chapter_list(book) for chapter in self.chapter_list: self.get_content(chapter) book_s = book_spider("http://10.1.88.252:7000") book_s.main() ```

帮我把一下代码设置一个合理请求头，并加入一个延时import requests import os from bs4 import BeautifulSoup class NovelDownloader: def init(self, root_url): self.root_url = root_url self.book_list = [] self.chapter_list = [] def get_url(self, url): while True: try: res = requests.get(url) if res.status_code == 200: print("页面获取成功！") return res.text else: print("页面返回异常！", res.status_code) except: print("页面获取错误！") def get_book_list(self): res = self.get_url(self.root_url) html = BeautifulSoup(res, "html.parser") a_list = html.find_all("a", {"class": "name"}) for a in a_list: self.book_list.append(a["href"]) self.book_list = [self.root_url + i for i in self.book_list] self.book_list.remove('http://www.biquge5200.cc/') def get_chapter_list(self, url): res = self.get_url(url) html = BeautifulSoup(res, "html.parser") a_list = html.find_all("a", {"class": "chapter"}) for a in a_list: self.chapter_list.append((a["href"], a.text.replace("\n", ""))) def get_content(self, chapter): url = self.root_url + chapter[0] print(url) book_name = chapter[0].split("/")[1] print(book_name) if not os.path.exists(book_name): os.mkdir(book_name) res = self.get_url(url) html = BeautifulSoup(res, "html.parser") content = html.find("div", {"id": "content"}).text print(content) path = os.path.join(book_name, chapter[1]) with open(path, "w", encoding="utf8") as f: f.write(content) def main(self): self.get_book_list() for book in self.book_list: self.get_chapter_list(book) for chapter in self.chapter_list: self.get_content(chapter) if name == 'main': root_url = "http://www.biquge5200.cc/" nd = NovelDownloader(root_url) nd.main()

好的，我可以帮你添加一个合理的请求头和延时。在 `get_url` 方法中，你可以添加一个请求头字典，来模拟浏览器发送请求。同时，在每次请求后添加一个时间延时，避免频繁请求被网站屏蔽。修改后的代码如下所示： ```python import requests import os import time from bs4 import BeautifulSoup class NovelDownloader: def __init__(self, root_url): self.root_url = root_url self.book_list = [] self.chapter_list = [] self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } # 添加请求头 def get_url(self, url): while True: try: res = requests.get(url, headers=self.headers) if res.status_code == 200: print("页面获取成功！") return res.text else: print("页面返回异常！", res.status_code) except: print("页面获取错误！") time.sleep(1) # 添加延时 def get_book_list(self): res = self.get_url(self.root_url) html = BeautifulSoup(res, "html.parser") a_list = html.find_all("a", {"class": "name"}) for a in a_list: self.book_list.append(a["href"]) self.book_list = [self.root_url + i for i in self.book_list] self.book_list.remove('http://www.biquge5200.cc/') def get_chapter_list(self, url): res = self.get_url(url) html = BeautifulSoup(res, "html.parser") a_list = html.find_all("a", {"class": "chapter"}) for a in a_list: self.chapter_list.append((a["href"], a.text.replace("\n", ""))) def get_content(self, chapter): url = self.root_url + chapter[0] print(url) book_name = chapter[0].split("/")[1] print(book_name) if not os.path.exists(book_name): os.mkdir(book_name) res = self.get_url(url) html = BeautifulSoup(res, "html.parser") content = html.find("div", {"id": "content"}).text print(content) path = os.path.join(book_name, chapter[1]) with open(path, "w", encoding="utf8") as f: f.write(content) def main(self): self.get_book_list() for book in self.book_list: self.get_chapter_list(book) for chapter in self.chapter_list: self.get_content(chapter) if __name__ == '__main__': root_url = "http://www.biquge5200.cc/" nd = NovelDownloader(root_url) nd.main() ``` 这样就可以更好地模拟浏览器发送请求，避免被网站屏蔽。另外，可以根据需要调整延时时间，以达到更好的效果。

阅读全文

相关推荐

Requests_from_file-2.2: Python库文件下载与使用指南

Python库spider_utils-0.0.7：强大的数据抓取工具

Haproxy_exporter监控指标详解与示例

word源码java-baidu_paper_spider::spider:论文搜索引擎（含Scrapy-Redis分布式爬虫、Elasticsearch

music_spider_wo:某我音乐搜索下载爬虫

baidu_photo_spider:爬取百度图片

douban_list_spider:douban_list_spider.py是一个简单的爬虫，可以根据关键字抓取豆瓣电影、豆瓣读书或者豆瓣音乐的条目信息

douban_movie_spider:蜘蛛爬行豆瓣最好的250部电影

fake_book_web_scrape:一个名为books.toscrape.com的假书销售网站的网上抓取

spider_demo:使用requests和BeautifulSoup抓取页面

ebay_horror_movie_web_scrape:使用BeautifulSoup从eBay抓取恐怖电影数据

QM_Spider:QM_Spider

import reimport requestsfrom bs4 import BeautifulSoupimport t

zabbix_import_hosts:zabbix批量导入监控主机

MUST_ClassMenu_Spider

Njupt_AutoJudge_requests:分分钟解决期末测评

TFBS_extraction_from_JASPAR：访问JASPAR API

大家在看

GL3231S USB4.0读卡器Layout和原理图及相关的FW

keb变频器 f5中文说明书-维修安装调试

IPC-7351 使用说明

实验二DML语言一（数据插入、修改和删除.doc

ZYNQ_7020核心板原理图.pdf

最新推荐

如何基于python对接钉钉并获取access_token

Python爬虫实例_城市公交网络站点数据的爬取方法

免安装JDK 1.8.0_241：即刻配置环境运行

管理建模和仿真的文件

【提升效率与稳定性】：深入掌握单相整流器的控制策略

你看这是ashx映射的cs文件初始代码,你看这里边根本就没有写对action参数进行任何操作但你.ashx?action=submit这样去做他就能返回出数据这是为什么

机器学习预测葡萄酒评分：二值化品尝笔记的应用

"互动学习：行动中的多样性与论文攻读经历"

【单相整流器终极指南】：电气工程师的20年实用技巧大揭秘

OxyPlot CategoryAxis