```python
import requests
from bs4 import BeautifulSoup
from threading import Thread

def crawl_books(start, end):
    session = requests.Session()
    for i in range(start, end):
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i)
        try:
            response = session.get(url, timeout=10)
        except requests.exceptions.Timeout:
            print('Timeout occurred when accessing: ' + url)
            continue
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        books = soup.find('ul', class_='bigimg')
        for book in books.find_all('li'):
            title = book.find('a', class_='pic').get('title')
            author = book.find('p', class_='search_book_author').text
            price = book.find('p', class_='price').find('span', class_='search_now_price').text
            book_link = 'https:' + book.find('a', class_='pic').get('href')
            try:
                response = session.get(book_link, timeout=10)
            except requests.exceptions.Timeout:
                print('Timeout occurred when accessing: ' + book_link)
                continue
            page = response.text
            soup = BeautifulSoup(page, 'lxml')
            comment_num_tag = soup.find('a', class_='review_num')
            if comment_num_tag:
                comment_num = comment_num_tag.text
            else:
                comment_num = '0'
            print(title, author, price, comment_num)

threads = []
for i in range(1, 101, 10):
    t = Thread(target=crawl_books, args=(i, i + 10))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
```
Optimize this once more so that the comment count is actually retrieved.
You can optimize the code with a thread pool, using the `ThreadPoolExecutor` class from the `concurrent.futures` module. Each task saves the comment count of every book it scrapes into a list, and once all tasks have finished, the counts are summed. The code is as follows:
```python
import re

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor


def crawl_books(start, end):
    session = requests.Session()
    comments = []
    for i in range(start, end):
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i)
        try:
            response = session.get(url, timeout=10)
        except requests.exceptions.Timeout:
            print('Timeout occurred when accessing: ' + url)
            continue
        soup = BeautifulSoup(response.text, 'lxml')
        books = soup.find('ul', class_='bigimg')
        for book in books.find_all('li'):
            title = book.find('a', class_='pic').get('title')
            author = book.find('p', class_='search_book_author').text
            price = book.find('p', class_='price').find('span', class_='search_now_price').text
            book_link = 'https:' + book.find('a', class_='pic').get('href')
            try:
                response = session.get(book_link, timeout=10)
            except requests.exceptions.Timeout:
                print('Timeout occurred when accessing: ' + book_link)
                continue
            soup = BeautifulSoup(response.text, 'lxml')
            comment_num_tag = soup.find('a', class_='review_num')
            if comment_num_tag:
                # The link text is usually not a bare number (e.g. "1234条评论"),
                # so keep only the digits before converting to int.
                digits = re.sub(r'\D', '', comment_num_tag.text)
                comment_num = int(digits) if digits else 0
            else:
                comment_num = 0
            comments.append(comment_num)
            print(title, author, price, comment_num)
    return comments


if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=10) as executor:
        # crawl_books takes (start, end), so map() needs one iterable per
        # parameter: starts 1, 11, ..., 91 paired with ends 11, 21, ..., 101.
        results = executor.map(crawl_books, range(1, 101, 10), range(11, 111, 10))
        comments = [num for result in results for num in result]
    total_comments = sum(comments)
    print('Total comments:', total_comments)
```
This crawls the data more efficiently and also yields the total number of comments.
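If you would rather not let one failed page range abort the whole run, the same thread pool can be driven with `submit` and `as_completed` instead of `map`. A minimal sketch, reusing the `crawl_books` function defined above:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def main():
    total = 0
    with ThreadPoolExecutor(max_workers=10) as executor:
        # One future per 10-page chunk; as_completed yields them as they finish.
        futures = [executor.submit(crawl_books, i, i + 10) for i in range(1, 101, 10)]
        for future in as_completed(futures):
            try:
                total += sum(future.result())
            except Exception as exc:
                # An exception in one chunk is reported here instead of
                # propagating and discarding the other chunks' results.
                print('Chunk failed:', exc)
    print('Total comments:', total)

if __name__ == '__main__':
    main()
```

With `map`, the first exception raised in any worker surfaces when its result is consumed; `as_completed` lets you handle each chunk's failure independently.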