Modify this code:

```
import requests
from bs4 import BeautifulSoup
from threading import Thread

def crawl_books(start, end):
    for i in range(start, end):
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i)
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        books = soup.find('ul', class_='bigimg')
        for book in books.find_all('li'):
            title = book.find('a', class_='pic').get('title')
            author = book.find('p', class_='search_book_author').text
            price = book.find('p', class_='price').find('span', class_='search_now_price').text
            book_link = 'https:' + book.find('a', class_='pic').get('href')
            response = requests.get(book_link)
            page = response.text
            soup = BeautifulSoup(page, 'lxml')
            comment_num = soup.find('a', class_='review_num').text
            print(title, author, price, comment_num)

threads = []
for i in range(1, 101, 10):
    t = Thread(target=crawl_books, args=(i, i+10))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
```
This code is a simple crawler that scrapes book listings from dangdang.com for the search keyword 计算机 ("computer", GBK-encoded as `%BC%C6%CB%E3%BB%FA` in the URL), collecting each book's title, author, price, and review count. It uses multiple threads to speed up the crawl: the `crawl_books` function scrapes a range of result pages, several such workers are created and started via the `Thread` class, and `join` waits for all of them to finish.
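The opaque `key` parameter is just the search keyword percent-encoded in GBK, which dangdang's search endpoint expects instead of UTF-8. A minimal sketch of building the URL for an arbitrary keyword (the `build_search_url` helper is illustrative, not part of the original code):

```
from urllib.parse import quote

def build_search_url(keyword, page_index):
    # Dangdang expects the keyword percent-encoded in GBK, not UTF-8.
    key = quote(keyword, encoding='gbk')
    return 'http://search.dangdang.com/?key={}&act=input&page_index={}'.format(key, page_index)

# quote('计算机', encoding='gbk') yields '%BC%C6%CB%E3%BB%FA',
# matching the hard-coded URL in the code above.
print(build_search_url('计算机', 1))
```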
Related questions
Write code that adds scraping of the book review counts:

```
import requests
from bs4 import BeautifulSoup
from threading import Thread

def crawl_books(start, end):
    for i in range(start, end):
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i)
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        books = soup.find('ul', class_='bigimg')
        for book in books.find_all('li'):
            title = book.find('a', class_='pic').get('title')
            author = book.find('p', class_='search_book_author').text
            price = book.find('p', class_='price').find('span', class_='search_now_price').text
            print(title, author, price)

threads = []
for i in range(1, 101, 10):
    t = Thread(target=crawl_books, args=(i, i+10))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
```
The code above only scrapes the title, author, and price of books matching the search keyword (计算机, GBK-encoded in the URL); it does not fetch review counts. To add them, first collect each book's detail-page link from the search results, then visit each detail page and read the count there. The following example shows this approach:
```
import requests
from bs4 import BeautifulSoup
from threading import Thread

def get_book_links(start, end):
    """Collect detail-page links from search result pages start..end-1."""
    links = []
    for i in range(start, end):
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        books = soup.find('ul', class_='bigimg')
        if books is None:  # layout change or blocked request
            continue
        for book in books.find_all('li'):
            link = book.find('a', class_='pic').get('href')
            # Search results may use scheme-relative links ('//product.dangdang.com/...').
            if link.startswith('//'):
                link = 'https:' + link
            links.append(link)
    return links

def get_comment_count(url):
    """Fetch a detail page and return its review count, or None if not found."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Class name may vary; the question's code used 'review_num' instead.
    tag = soup.find('a', class_='comm_num')
    return tag.text if tag is not None else None

def crawl_books(start, end):
    links = get_book_links(start, end)
    for link in links:
        count = get_comment_count(link)
        print(link, count)

threads = []
for i in range(1, 101, 10):
    t = Thread(target=crawl_books, args=(i, i+10))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
```
The code above adds two functions: `get_book_links` collects the detail-page link of every book on the given range of search result pages, and `get_comment_count` fetches a detail page and extracts its review count. In `crawl_books`, we first gather the links for the page range, then call `get_comment_count` on each link and print the result. As before, multiple threads crawl different page ranges in parallel. Note that the selector class differs between the two snippets (`comm_num` here, `review_num` in the question's code), so it may need adjusting to match the live page.
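Manually creating and joining `Thread` objects works, but the standard library's `concurrent.futures.ThreadPoolExecutor` expresses the same fan-out more compactly and caps how many page ranges are crawled at once. A sketch reusing the `crawl_books` function defined above:

```
from concurrent.futures import ThreadPoolExecutor

# One task per 10-page range, at most 5 running at a time.
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(crawl_books, i, i + 10) for i in range(1, 101, 10)]
    for f in futures:
        f.result()  # re-raises any exception from a worker thread
# Leaving the with-block waits for all tasks, like the join() loop above.
```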
Add scraping of book sales counts and publication dates:

```
import requests
from bs4 import BeautifulSoup
from threading import Thread

def crawl_books(start, end):
    for i in range(start, end):
        url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(i)
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        books = soup.find('ul', class_='bigimg')
        for book in books.find_all('li'):
            title = book.find('a', class_='pic').get('title')
            author = book.find('p', class_='search_book_author').text
            price = book.find('p', class_='price').find('span', class_='search_now_price').text
            print(title, author, price)

threads = []
for i in range(1, 101, 10):
    t = Thread(target=crawl_books, args=(i, i+10))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
```
This code is a crawler that scrapes book information for a given keyword on dangdang.com. It uses the requests library to send HTTP requests and BeautifulSoup to parse the returned HTML, extracting each book's title, author, and price. For efficiency it crawls pages in parallel: the main block creates one thread per range of result pages, starts them all, and then waits for every thread to finish. As written, though, it does not yet touch sales counts or publication dates; a sketch of how those could be added follows.
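Dangdang's search results pack the author, publication date, and publisher into the `search_book_author` paragraph the code already reads, and they expose a comment count rather than a raw sales figure, so the count is often used as a popularity proxy. A hedged sketch of a single-page version, where the span order and the `search_comment_num` class are assumptions about the page markup:

```
import requests
from bs4 import BeautifulSoup

def crawl_page(page_index):
    url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index={}'.format(page_index)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    books = soup.find('ul', class_='bigimg')
    if books is None:  # layout change or blocked request
        return
    for book in books.find_all('li'):
        title = book.find('a', class_='pic').get('title')
        info = book.find('p', class_='search_book_author')
        spans = info.find_all('span') if info else []
        # Assumed span order inside search_book_author: author / date / publisher.
        pub_date = spans[1].text.strip(' /') if len(spans) > 1 else ''
        # No raw sales number on the search page; the comment count is a common
        # proxy. 'search_comment_num' is an assumed class name.
        sales_tag = book.find('a', class_='search_comment_num')
        sales = sales_tag.text if sales_tag is not None else 'N/A'
        print(title, pub_date, sales)

crawl_page(1)
```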