写一个爬取当当网书籍数据的爬虫程序
时间: 2023-12-17 20:21:26 浏览: 72
好的,以下是一个爬取当当网图书数据的示例爬虫程序:
```python
import requests
from bs4 import BeautifulSoup
import csv
def get_book_info(book_url, headers=None, timeout=10):
    """Fetch a book detail page and extract its basic metadata.

    Args:
        book_url: Absolute URL of the book's detail page.
        headers: Optional dict of HTTP headers for the request. When
            omitted, the same browser-like User-Agent that
            ``crawl_dangdang_books`` sends for listing pages is used, so
            detail requests are consistent with listing requests.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A dict with the keys ``title``, ``author``, ``publisher``,
        ``pub_date`` and ``price``. A field whose element cannot be found
        on the page is returned as an empty string instead of raising.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    res = requests.get(book_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as a book.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # CSS selector for each output field on the detail page.
    selectors = {
        'title': '.name_info h1',
        'author': '.author span a',
        'publisher': '.publisher_info a',
        'pub_date': '.publisher_info span',
        'price': '.price_info .price_r',
    }

    book_info = {}
    for field, selector in selectors.items():
        node = soup.select_one(selector)
        # select_one returns None when nothing matches; keep the row usable
        # by recording an empty value for that field.
        book_info[field] = node.text.strip() if node is not None else ''
    return book_info
def crawl_dangdang_books():
    """Crawl the Dangdang book category listing and save details to CSV.

    Fetches the category landing page to read the total page count, then
    walks every listing page, follows each book link found there, and
    writes one row per book to ``dangdang_books.csv`` (UTF-8) with the
    columns ``title``, ``author``, ``publisher``, ``pub_date``, ``price``.

    A listing page or book that cannot be fetched or parsed is reported on
    stdout and skipped, so one bad page does not abort the whole crawl.

    Raises:
        requests.RequestException: If the initial landing-page request
            fails.
        ValueError: If the total page count cannot be found in the
            landing page.
    """
    import re
    from urllib.parse import urljoin

    # Without a timeout, requests may wait forever on a stalled connection.
    timeout = 10

    # Request headers: present a browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # Query parameters sent with each listing request.
    params = {
        'category_path': '01.00.00.00.00.00',
        'sort_type': 'sort_default',
        'page_index': '1'
    }

    # Fetch the landing page to discover how many pages exist.
    res = requests.get('http://category.dangdang.com/', headers=headers,
                       params=params, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # Pull the digits out of the pager text instead of slicing fixed
    # positions, so surrounding characters or whitespace do not matter.
    total_node = soup.select_one('.page .total')
    total_match = re.search(r'\d+', total_node.text) if total_node is not None else None
    if total_match is None:
        raise ValueError("could not find the total page count ('.page .total') in the listing page")
    total_page = int(total_match.group())

    # Save the book data to a CSV file.
    with open('dangdang_books.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'author', 'publisher', 'pub_date', 'price']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Crawl every listing page.
        for page in range(1, total_page + 1):
            params['page_index'] = str(page)
            page_url = 'http://category.dangdang.com/pg' + str(page) + '-cp01.00.00.00.00.00.html'
            # NOTE(review): the page number and category are already part
            # of the URL path; the query params look redundant — confirm
            # before removing them.
            try:
                res = requests.get(page_url, headers=headers, params=params,
                                   timeout=timeout)
                res.raise_for_status()
            except requests.RequestException as err:
                print('跳过第 %d 页: %s' % (page, err))
                continue
            soup = BeautifulSoup(res.text, 'html.parser')

            # Crawl every book linked from this listing page.
            for book_item in soup.select('.bigimg .pic'):
                # The matched node may itself be the anchor or may wrap
                # one; handle both so a missing child does not crash.
                link = book_item if book_item.name == 'a' else book_item.select_one('a')
                href = link.get('href') if link is not None else None
                if not href:
                    continue
                # Resolve protocol-relative ("//host/...") and relative
                # links; requests rejects URLs that have no scheme.
                book_url = urljoin(page_url, href)
                try:
                    book_info = get_book_info(book_url)
                except (requests.RequestException, AttributeError) as err:
                    # Network failure or an unexpected detail-page layout:
                    # report it and move on to the next book.
                    print('跳过 %s: %s' % (book_url, err))
                    continue
                writer.writerow(book_info)
            print('已爬取第 %d 页的数据' % page)
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    crawl_dangdang_books()
```
此程序使用requests库发送HTTP请求,并使用BeautifulSoup库解析爬取到的HTML页面。它从当当网的图书分类页开始,爬取每一页的图书数据,并将结果保存到CSV文件中。程序中还定义了一个`get_book_info()`函数,用于从图书详情页中提取图书的详细信息。
阅读全文