import requests from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor url_template = 'https://book.douban.com/tag/编程?start={}&type=T' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'} def get_book_list(start): url = url_template.format(start) response = requests.get(url, headers=headers) soup = BeautifulSoup(response.text, 'html.parser') book_list = soup.find_all('li', class_='subject-item') return book_list def get_book_info(book): title = book.find('div', class_='info').a.get_text().strip() rating = book.find('span', class_='rating_nums').get_text().strip() return title, rating if __name__ == '__main__': with ThreadPoolExecutor(max_workers=10) as executor: futures = [] for start in range(0, 100, 20): futures.append(executor.submit(get_book_list, start)) books = [] for future in futures: books.extend(future.result()) futures = [] for book in books: futures.append(executor.submit(get_book_info, book)) for future in futures: title, rating = future.result() print(title, rating)改成正确代码
时间: 2024-04-29 14:19:28 浏览: 136
Python项目开发实战:网站图片抓取工具(案例教程实例课程).pdf
原代码的问题在于所有语句被挤在同一行,缺少换行和缩进,Python 无法解析。下面是修改后的代码(注意函数体、`with` 和 `for` 语句块内的代码都需要缩进):
```python
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
# Listing-page URL for the Douban Books tag "编程" (programming). The `{}`
# placeholder is filled with the value of the `start` query parameter.
url_template = 'https://book.douban.com/tag/编程?start={}&type=T'
# HTTP headers sent with every page request: a desktop Chrome User-Agent string.
# NOTE(review): presumably needed so the site does not reject the default
# python-requests User-Agent — confirm.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
def get_book_list(start):
    """Fetch one listing page and return the book entries found on it.

    Args:
        start: Value substituted into the ``start`` query parameter of
            ``url_template``.

    Returns:
        The result of ``find_all('li', class_='subject-item')`` on the parsed
        page: one element per ``<li class="subject-item">``, empty when the
        page contains none.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = url_template.format(start)
    # Without a timeout a stalled connection would block the calling thread
    # indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    # Surface HTTP errors instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all('li', class_='subject-item')
def get_book_info(book):
    """Extract the title and rating text from one book entry.

    Args:
        book: A parsed element exposing ``find(name, class_=...)``, as
            returned by ``get_book_list``.

    Returns:
        A ``(title, rating)`` tuple of stripped strings. ``rating`` is an
        empty string when the entry has no ``<span class="rating_nums">``.
    """
    title = book.find('div', class_='info').a.get_text().strip()
    # find() returns None when nothing matches; an entry without a rating
    # span must not abort the whole run with an AttributeError.
    rating_tag = book.find('span', class_='rating_nums')
    rating = rating_tag.get_text().strip() if rating_tag is not None else ''
    return title, rating
if __name__ == '__main__':
    # One thread pool serves both stages; leaving the `with` block waits for
    # all submitted work to finish.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Stage 1: request the listing pages for start = 0, 20, 40, 60, 80
        # concurrently. executor.map yields results in submission order, so
        # `books` keeps page order.
        books = []
        for page_items in executor.map(get_book_list, range(0, 100, 20)):
            books.extend(page_items)

        # Stage 2: extract (title, rating) from every entry and print the
        # pairs in the same order as `books`.
        for title, rating in executor.map(get_book_info, books):
            print(title, rating)
```
这段代码使用了多线程的方式爬取豆瓣读书中“编程”标签下的书籍信息。其中,`get_book_list`函数用于获取每一页中的书籍列表,返回的是由 `find_all` 得到的结果列表,其中每个元素是一个对应 `<li class="subject-item">` 的 `Tag` 对象(而不是 `BeautifulSoup` 对象);`get_book_info`函数用于从每本书对应的 `Tag` 对象中提取出书名和评分。最后,使用`ThreadPoolExecutor`类来创建线程池,同时处理多个任务,提高爬取效率。
阅读全文