简单优化这段代码import requests from bs4 import BeautifulSoup from threading import Thread def crawl_books(start, end): for i in range(start, end): url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i) response = requests.get(url) page = response.text soup = BeautifulSoup(page, 'lxml') books = soup.find('ul', class_='bigimg') for book in books.find_all('li'): title = book.find('a', class_='pic').get('title') author = book.find('p', class_='search_book_author').text price = book.find('p', class_='price').find('span', class_='search_now_price').text book_link = 'https:' + book.find('a', class_='pic').get('href') response = requests.get(book_link) page = response.text soup = BeautifulSoup(page, 'lxml') comment_num_tag = soup.find('a', class_='review_num') if comment_num_tag: comment_num = comment_num_tag.text else: comment_num = '0' print(title, author, price, comment_num) threads = [] for i in range(1, 101, 10): t = Thread(target=crawl_books, args=(i, i+10)) threads.append(t) t.start() for t in threads: t.join()
时间: 2024-02-29 08:53:54 浏览: 109
这段代码已经很简洁了,但是可以对程序进行一些小的优化,如下:
1. 由于书籍详情页有可能没有评论数量，因此在获取评论数量时保留对标签是否存在的判断，避免程序因缺失标签而报错。
2. 为了提高爬取效率,可以使用 `Session` 对象来复用连接,避免多次建立连接。
3. 在爬取时可以设置超时时间,防止程序长时间等待而阻塞。
优化后的代码如下:
```
import requests
from bs4 import BeautifulSoup
from threading import Thread
def crawl_books(start, end):
    """Crawl Dangdang search result pages [start, end) and print book info.

    For each result page, extracts title, author and price for every book,
    then fetches each book's detail page to read its review count (printed
    as '0' when the detail page has no review tag).

    Args:
        start: first page index (inclusive).
        end: last page index (exclusive).
    """
    # One Session per thread: reuses TCP connections instead of opening a
    # new one per request (requests.Session is not guaranteed thread-safe,
    # so it must stay thread-local).
    session = requests.Session()
    for page_index in range(start, end):
        # key=%BC%C6%CB%E3%BB%FA is the GBK/URL-encoded search keyword.
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(page_index)
        page = _fetch(session, url)
        if page is None:
            continue
        soup = BeautifulSoup(page, 'lxml')
        books = soup.find('ul', class_='bigimg')
        if books is None:
            # Empty result page or layout change: skip instead of crashing
            # the worker thread with an AttributeError.
            continue
        for book in books.find_all('li'):
            title = book.find('a', class_='pic').get('title')
            author = book.find('p', class_='search_book_author').text
            price = book.find('p', class_='price').find('span', class_='search_now_price').text
            book_link = 'https:' + book.find('a', class_='pic').get('href')
            detail_page = _fetch(session, book_link)
            if detail_page is None:
                continue
            # Separate soup for the detail page; do not shadow the
            # search-result soup used by the enclosing loop.
            detail_soup = BeautifulSoup(detail_page, 'lxml')
            comment_num_tag = detail_soup.find('a', class_='review_num')
            comment_num = comment_num_tag.text if comment_num_tag else '0'
            print(title, author, price, comment_num)


def _fetch(session, url, timeout=10):
    """Return the response body of *url*, or None on any request failure.

    Catches the whole RequestException hierarchy (timeouts, connection
    errors, HTTP errors) so a single bad URL cannot kill the worker thread.
    """
    try:
        response = session.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of parsing an error page.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as exc:
        print('Request failed for {}: {}'.format(url, exc))
        return None
def main():
    """Fan pages 1-100 out across 10 threads, 10 pages per thread.

    Crawling is I/O-bound, so threads overlap the network waits even
    under the GIL.
    """
    threads = []
    for start in range(1, 101, 10):
        worker = Thread(target=crawl_books, args=(start, start + 10))
        threads.append(worker)
        worker.start()
    # Block until every slice has finished before the script exits.
    for worker in threads:
        worker.join()


# Guard the entry point so importing this module does not start crawling.
if __name__ == '__main__':
    main()
```
这样,我们对程序进行了一些小的优化,代码更加健壮和高效。
阅读全文