python爬虫豆瓣读书top250
时间: 2023-06-21 20:08:35 浏览: 91
好的,以下是Python爬虫豆瓣读书TOP250的代码:
```
import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawler indefinitely.

    Returns:
        The decoded page text, or an empty string when the request fails
        or the server answers with an HTTP error status.
    """
    # Browser-like User-Agent; presumably the site rejects the default
    # python-requests one -- TODO confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network and HTTP errors yield an empty page
        # instead of aborting the whole crawl. Only request errors are
        # caught, so Ctrl-C and programming errors still propagate.
        return ""
    # Let requests guess the charset from the body rather than trusting
    # the response headers.
    r.encoding = r.apparent_encoding
    return r.text
def _node_text(node):
    """Return the text of *node* with whitespace collapsed, or "" if None."""
    if node is None:
        return ""
    # split()/join removes the newlines and padding that the markup may
    # contain, which would otherwise break the fixed-width table output.
    return " ".join(node.get_text().split())


def parse_html(html):
    """Extract the book entries from one listing page.

    Args:
        html: Page markup as a string; an empty string yields no entries.

    Returns:
        A list with one ``[name, author, rating, rating_count]`` list of
        strings per ``<tr class="item">`` row. A field whose element is
        missing becomes an empty string; rows without a title link are
        skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    book_list = []
    for book in soup.find_all('tr', class_='item'):
        title_box = book.find('div', class_='pl2')
        link = title_box.a if title_box is not None else None
        if link is None:
            # Without a title link there is nothing useful to report.
            continue
        # Prefer the title attribute; fall back to the visible link text
        # when the attribute is absent.
        book_name = link.get('title') or _node_text(link)
        # NOTE(review): <p class="pl"> presumably holds the
        # author/publisher line -- confirm against the live page.
        book_author = _node_text(book.find('p', class_='pl'))
        book_rating = _node_text(book.find('span', class_='rating_nums'))
        book_pl = _node_text(book.find('span', class_='pl'))
        book_list.append([book_name, book_author, book_rating, book_pl])
    return book_list
def print_book_list(book_list):
    """Print the books as a centred, tab-separated table on stdout.

    Args:
        book_list: Sequence of rows indexable as
            ``[name, author, rating, rating_count]``. The author field
            (index 1) is not printed.

    A header line is always printed, followed by one line per book
    numbered from 1.
    """
    # Field 4 supplies the fill character for the name column: the
    # full-width ideographic space (U+3000).
    tplt = "{0:^10}\t{1:{4}^20}\t{2:^10}\t{3:^10}"
    fill = chr(12288)
    print(tplt.format("序号", "书籍名称", "评分", "评价人数", fill))
    for number, book in enumerate(book_list, 1):
        print(tplt.format(number, book[0], book[2], book[3], fill))
def main():
    """Crawl the ten listing pages and print the combined book table.

    Pages are requested with ``start`` offsets 0, 25, ..., 225. A page
    that could not be downloaded is skipped, so the table may contain
    fewer than 250 rows.
    """
    start_url = 'https://book.douban.com/top250?start='
    book_list = []
    for i in range(0, 250, 25):
        html = get_html(start_url + str(i))
        if not html:
            # get_html returns "" on failure; there is nothing to parse.
            continue
        book_list.extend(parse_html(html))
    # Print once, after every page has been collected.
    print_book_list(book_list)
# Run the crawler only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
```
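这段代码会爬取豆瓣读书 TOP250 中每本书的书名、作者、评分和评价人数,并以表格形式输出序号、书名、评分和评价人数(作者信息已被抓取,但没有包含在打印结果中)。运行前需要先安装 requests 和 beautifulsoup4(例如 pip install requests beautifulsoup4);之后你可以直接运行这个代码,或者根据自己的需求对其进行修改。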