```python
import os

if __name__ == '__main__':
    bookDict = {}
    bookList = []
    with open(r'C:\Users\17250\Desktop\图书清单.csv', 'r') as readfp, \
            open(r'C:\Users\17250\Desktop\result.txt', 'w') as writefp:
        title = tuple(readfp.readline().strip().split(','))
        # print(title, len(title))
        for line in readfp.readlines():
            try:
                linelist = line.strip('\n').split(',')
                # print(linelist)
                publisher = linelist[-4]
                # print("publisher:" + publisher)
                if publisher not in bookDict.keys():
                    bookDict[publisher] = 0
                bookDict[publisher] += 1
            except IndexError:
                pass
        bookList = [(key, value) for key, value in bookDict.items()]
        bookList.sort(key=lambda x: (-x[1], x[0]))
        for x in bookList:
            writefp.write("{}:{}\n".format(x[0], x[1]))
```
This code counts how many books each publisher has in a book-list CSV, sorts the publishers from the most to the fewest titles, and writes the result to a text file.
The implementation works as follows:
1. Import the os module (it is not actually used anywhere in this script).
2. Check whether the file is being run as the main program; the rest of the script only runs in that case.
3. Initialize an empty dictionary bookDict and an empty list bookList to hold the per-publisher counts and the sorted result.
4. Use a with statement to open the CSV file and the output TXT file at the given paths, one for reading the book-list data and one for writing the sorted result.
5. Read the first line of the CSV file (the header row) and convert it into the tuple title.
6. Loop over the remaining lines of the CSV file, splitting each line on commas into the list linelist.
7. Take the publisher field from linelist (the fourth column from the end) and use it as a key in bookDict; if the publisher is not yet in the dictionary, initialize its count to 0, then increment the count by 1 for every occurrence.
8. If an IndexError is raised (for example on a malformed or empty line), skip that line.
9. Convert the items of bookDict into (publisher, count) tuples and store them in the list bookList.
10. Sort bookList by count in descending order; publishers with equal counts are ordered by name in ascending order.
11. Iterate over the sorted bookList and write each entry to the TXT file in the format "publisher:count".
In short, the code tallies and ranks the publishers in the book list and writes the result to the TXT file.
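For comparison, the same statistics can be produced more compactly with the standard library's csv and collections modules. The sketch below is only illustrative, not the original author's code: it reuses the original file paths, assumes (as the original does) that the publisher is the fourth column from the end, and assumes a UTF-8 encoded file, which a Windows-exported CSV might not be (it could require 'gbk').

```python
import csv
from collections import Counter

SRC = r'C:\Users\17250\Desktop\图书清单.csv'   # same paths as the original script
DST = r'C:\Users\17250\Desktop\result.txt'

counts = Counter()
with open(SRC, 'r', newline='', encoding='utf-8') as readfp:  # encoding is an assumption
    reader = csv.reader(readfp)
    next(reader, None)                 # skip the header row
    for row in reader:
        if len(row) >= 4:              # guard against short or malformed rows
            counts[row[-4]] += 1       # publisher is assumed to be the 4th column from the end

# Sort by count (descending), then by publisher name (ascending).
ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))

with open(DST, 'w', encoding='utf-8') as writefp:
    for publisher, count in ranked:
        writefp.write("{}:{}\n".format(publisher, count))
```

A side benefit of csv.reader is that it handles quoted fields containing commas, which the plain split(',') in the original would break on.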
Related questions
Please set a reasonable request header for the following code and add a delay:

```python
import requests
import os
from bs4 import BeautifulSoup

class NovelDownloader:
    def __init__(self, root_url):
        self.root_url = root_url
        self.book_list = []
        self.chapter_list = []

    def get_url(self, url):
        while True:
            try:
                res = requests.get(url)
                if res.status_code == 200:
                    print("页面获取成功!")
                    return res.text
                else:
                    print("页面返回异常!", res.status_code)
            except:
                print("页面获取错误!")

    def get_book_list(self):
        res = self.get_url(self.root_url)
        html = BeautifulSoup(res, "html.parser")
        a_list = html.find_all("a", {"class": "name"})
        for a in a_list:
            self.book_list.append(a["href"])
        self.book_list = [self.root_url + i for i in self.book_list]
        self.book_list.remove('http://www.biquge5200.cc/')

    def get_chapter_list(self, url):
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        a_list = html.find_all("a", {"class": "chapter"})
        for a in a_list:
            self.chapter_list.append((a["href"], a.text.replace("\n", "")))

    def get_content(self, chapter):
        url = self.root_url + chapter[0]
        print(url)
        book_name = chapter[0].split("/")[1]
        print(book_name)
        if not os.path.exists(book_name):
            os.mkdir(book_name)
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        content = html.find("div", {"id": "content"}).text
        print(content)
        path = os.path.join(book_name, chapter[1])
        with open(path, "w", encoding="utf8") as f:
            f.write(content)

    def main(self):
        self.get_book_list()
        for book in self.book_list:
            self.get_chapter_list(book)
        for chapter in self.chapter_list:
            self.get_content(chapter)

if __name__ == '__main__':
    root_url = "http://www.biquge5200.cc/"
    nd = NovelDownloader(root_url)
    nd.main()
```
Sure, I can add a reasonable request header and a delay for you. In the `get_url` method you can add a header dictionary so the requests look like they come from a browser, and add a pause around each request so the site is not hit so frequently that it blocks you.
The modified code looks like this:
```python
import requests
import os
import time
from bs4 import BeautifulSoup

class NovelDownloader:
    def __init__(self, root_url):
        self.root_url = root_url
        self.book_list = []
        self.chapter_list = []
        # Request headers that mimic a real browser
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

    def get_url(self, url):
        while True:
            try:
                res = requests.get(url, headers=self.headers)
                if res.status_code == 200:
                    print("页面获取成功!")
                    time.sleep(1)  # pause after each successful request so the site is not hammered
                    return res.text
                else:
                    print("页面返回异常!", res.status_code)
            except requests.RequestException:
                print("页面获取错误!")
            time.sleep(1)  # also wait before retrying a failed request

    def get_book_list(self):
        res = self.get_url(self.root_url)
        html = BeautifulSoup(res, "html.parser")
        a_list = html.find_all("a", {"class": "name"})
        for a in a_list:
            self.book_list.append(a["href"])
        self.book_list = [self.root_url + i for i in self.book_list]
        self.book_list.remove('http://www.biquge5200.cc/')

    def get_chapter_list(self, url):
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        a_list = html.find_all("a", {"class": "chapter"})
        for a in a_list:
            self.chapter_list.append((a["href"], a.text.replace("\n", "")))

    def get_content(self, chapter):
        url = self.root_url + chapter[0]
        print(url)
        book_name = chapter[0].split("/")[1]
        print(book_name)
        if not os.path.exists(book_name):
            os.mkdir(book_name)
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        content = html.find("div", {"id": "content"}).text
        print(content)
        path = os.path.join(book_name, chapter[1])
        with open(path, "w", encoding="utf8") as f:
            f.write(content)

    def main(self):
        self.get_book_list()
        for book in self.book_list:
            self.get_chapter_list(book)
        for chapter in self.chapter_list:
            self.get_content(chapter)

if __name__ == '__main__':
    root_url = "http://www.biquge5200.cc/"
    nd = NovelDownloader(root_url)
    nd.main()
```
With the header the requests look much more like they come from a real browser, and the delay makes it less likely that the site blocks you. You can adjust the delay length as needed.
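If a fixed one-second pause is not enough, a randomized delay and a retry limit are common refinements. The snippet below is only a sketch of how `get_url` could be adjusted, not part of the original code: the `max_retries` parameter, the 10-second timeout, and the 1-3 second range are illustrative values.

```python
import random
import time

import requests

def get_url(url, headers, max_retries=3):
    """Fetch a page with a random pause between attempts and a bounded number of retries."""
    for attempt in range(1, max_retries + 1):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            if res.status_code == 200:
                time.sleep(random.uniform(1, 3))  # random 1-3 s pause to look less like a bot
                return res.text
            print("页面返回异常!", res.status_code)
        except requests.RequestException as exc:
            print("页面获取错误!", exc)
        time.sleep(random.uniform(1, 3))  # wait before the next attempt
    return None  # give up after max_retries failed attempts
```

A caller then has to handle the `None` case instead of relying on the original infinite retry loop.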
The following is a piece of crawler code; find the errors in it and correct them:

```python
import requests
import os
from bs4 import BeautifulSoup

class book_spider():
    def __init__(self,root_url):
        self.root_url=root_url
        self.book_list=[]      # data from the level-1 page (level-2 page URLs) is stored in this list
        self.chapter_list=[]   # data from the level-2 pages (level-3 page URLs and chapter names) is stored in this list

    def get_url(url):
        while True:
            try:
                res=requests.get(url)
                if res.status_code==200:
                    res.encoding =res.apparent_encoding
                    print("页面获取成功")
                    return res.text
                else:
                    print("页面返回异常",res.status_code)
            except:
                print("页面获取错误")

    def get_book_list(self.url):
        res = self.get_url(url)
        html = BeautifulSoup(res,"html.parser")
        a_list = html.find_all("a",{"class":"name"})
        for a in a_list:
            self.book_list.append(a["href"])
        self.book_list = [self.root_url+i for i in self.bbok_list]
        self.book_list.remove('http://10.1.88.252:7000/庆余年')
        print(book_list)

    def get_chapter_list(self,url):
        res = self.get_url(url)
        html = BeautifulSoup(res,"html.parser")
        a_list = html.find_all("a",{"class":"chapter"})
        for a in a_list:
            self.chapter_list.append((a["href"],a.text.replace("\n","")))

    def get_content(self.chapter):
        url = self.root_url + chapter[0]
        print(url)
        book_name = chapter[0].split("/")[1]
        print(book_name)
        if not os.path.exists(book_name):
            os.mkdir(book_name)
        res = self.get_url(url)
        html = BeautifulSoup(res,"html.parser")
        content = html.find("div",{"id":"content"}).text
        print(content)
        path = os.path.join(book_name,chapter[1])
        with open(path,"w",encoding="utf8") as f:
            f.write(content)

    def main():
        self.get_book_list(self.root_url)
        for book in self.book_list:
            self.get_chapter_liat(book)
        for chapter in chapter_list:
            self.get_content(chapter)

book_s = book_spider("http://10.1.88.252:7000")
book_s.main()
```
The code has the following errors:
1. `get_url` is defined without a `self` parameter, so add the `@staticmethod` decorator above it; it can then still be called as `self.get_url(url)`.
2. The method signatures `def get_book_list(self.url):`, `def get_content(self.chapter):` and `def main():` are invalid; they should be `def get_book_list(self, url):`, `def get_content(self, chapter):` and `def main(self):`.
3. In `get_book_list`, `self.bbok_list` is a typo for `self.book_list`, and `print(book_list)` should be `print(self.book_list)`, because `book_list` is an instance attribute.
4. In `get_chapter_list`, `self.chapter_list.append((a["href"],a.text.replace("\n","")))` should be `self.chapter_list.append((self.root_url+a["href"],a.text.replace("\n","")))`, so that the full chapter URL is stored. Correspondingly, `get_content` should use `url = chapter[0]` directly (otherwise the root URL is prepended twice) and take the book name with `chapter[0].split("/")[-2]` instead of `[1]`, assuming links of the form http://host/<book>/<chapter>.
5. In `main`, `self.get_chapter_liat(book)` is a typo for `self.get_chapter_list(book)`, and `for chapter in chapter_list:` should be `for chapter in self.chapter_list:`.
6. In `get_content`, the debug prints `print(url)`, `print(book_name)` and `print(content)` can be removed or commented out, since they are only diagnostic output.
The corrected code is as follows:
```python
import requests
import os
from bs4 import BeautifulSoup

class book_spider():
    def __init__(self, root_url):
        self.root_url = root_url
        self.book_list = []
        self.chapter_list = []

    @staticmethod
    def get_url(url):
        while True:
            try:
                res = requests.get(url)
                if res.status_code == 200:
                    res.encoding = res.apparent_encoding
                    print("页面获取成功")
                    return res.text
                else:
                    print("页面返回异常", res.status_code)
            except:
                print("页面获取错误")

    def get_book_list(self, url):
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        a_list = html.find_all("a", {"class": "name"})
        for a in a_list:
            self.book_list.append(self.root_url + a["href"])
        self.book_list.remove('http://10.1.88.252:7000/庆余年')
        print(self.book_list)

    def get_chapter_list(self, url):
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        a_list = html.find_all("a", {"class": "chapter"})
        for a in a_list:
            self.chapter_list.append((self.root_url + a["href"], a.text.replace("\n", "")))

    def get_content(self, chapter):
        url = chapter[0]
        book_name = chapter[0].split("/")[-2]  # assumes chapter URLs end in .../<book name>/<chapter>
        if not os.path.exists(book_name):
            os.mkdir(book_name)
        res = self.get_url(url)
        html = BeautifulSoup(res, "html.parser")
        content = html.find("div", {"id": "content"}).text
        path = os.path.join(book_name, chapter[1])
        with open(path, "w", encoding="utf8") as f:
            f.write(content)

    def main(self):
        self.get_book_list(self.root_url)
        for book in self.book_list:
            self.get_chapter_list(book)
        for chapter in self.chapter_list:
            self.get_content(chapter)

book_s = book_spider("http://10.1.88.252:7000")
book_s.main()
```
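As a quick illustration of why the `@staticmethod` fix in point 1 works: a static method takes no `self`, yet it can still be called through an instance, which is exactly how `self.get_url(url)` is used above. The toy class below is not part of the original code, just a minimal demonstration of that behavior.

```python
class Demo:
    @staticmethod
    def greet(name):
        # No 'self' parameter: the method does not touch instance state.
        return "hello, " + name

d = Demo()
print(d.greet("world"))     # called via the instance -> "hello, world"
print(Demo.greet("world"))  # called via the class    -> "hello, world"
```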