import requests from bs4 import BeautifulSoup def get_html(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} try: response = requests.get(url, headers=headers) response.raise_for_status() response.encoding = response.apparent_encoding return response.text except: return "" def parse_html(html): soup = BeautifulSoup(html, 'html.parser') book_list = soup.find_all('li', class_='subject-item') for book in book_list: title = book.find('div', class_='info').find('a')['title'] author_info = book.find('div', class_='pub').get_text().split('/') author = author_info[0].strip() publisher = author_info[-3].strip() print('书名:', title) print('作者:', author) print('出版社:', publisher) print('--------------------------') if __name__ == '__main__': for i in range(0,1000,20): url = 'https://book.douban.com/tag/%E5%8E%86%E5%8F%B2?start='+str(i)+'&type=T' html = get_html(url) parse_html(html) 输出结果转为Excel表格
时间: 2024-04-27 17:19:16 浏览: 161
好的,你可以使用pandas库将输出的结果转换成Excel表格。以下是修改后的代码:
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_html(url, timeout=10):
    """Fetch a page and return its decoded text, or "" if the request fails.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response body as text, decoded with the encoding detected from
        the content, or an empty string when the request fails or the server
        answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures yield an empty page,
        # while KeyboardInterrupt and programming errors still propagate.
        return ""
    # Use the encoding guessed from the body rather than the declared one.
    response.encoding = response.apparent_encoding
    return response.text
def parse_html(html):
    """Extract book rows from a listing page.

    Every ``<li class="subject-item">`` element is read as one book. The
    title comes from the ``title`` attribute of the first link inside
    ``<div class="info">``; author and publisher come from the
    ``/``-separated text of ``<div class="pub">``.

    Args:
        html: Page markup as a string. An empty string yields no rows.

    Returns:
        A list of ``[title, author, publisher]`` lists. Entries without a
        title link are skipped; a missing author or publisher field is
        returned as an empty string instead of raising.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    for book in soup.find_all('li', class_='subject-item'):
        info = book.find('div', class_='info')
        link = info.find('a') if info is not None else None
        if link is None or not link.has_attr('title'):
            # Malformed entry: nothing usable to record, keep crawling.
            continue

        pub = book.find('div', class_='pub')
        fields = []
        if pub is not None:
            fields = [part.strip() for part in pub.get_text().split('/')]

        author = fields[0] if fields else ''
        # The publisher is read as the third field from the end (presumably
        # the line ends with "publisher / date / price" -- verify against the
        # live page). Shorter lines have no such field.
        publisher = fields[-3] if len(fields) >= 3 else ''
        data.append([link['title'], author, publisher])
    return data
if __name__ == '__main__':
    # Crawl the listing pages 20 items at a time (offsets 0, 20, ..., 980)
    # and gather every parsed row into a single list.
    url_prefix = 'https://book.douban.com/tag/%E5%8E%86%E5%8F%B2?start='
    url_suffix = '&type=T'
    rows = []
    for offset in range(0, 1000, 20):
        page = get_html(url_prefix + str(offset) + url_suffix)
        rows.extend(parse_html(page))

    # Convert the collected rows to a DataFrame and write them to an Excel
    # workbook without the row index.
    frame = pd.DataFrame(rows, columns=['书名', '作者', '出版社'])
    frame.to_excel('history_books.xlsx', index=False)
```
在这段代码中,我们在`parse_html`函数中将每本书的信息存储在一个列表中,然后在主函数中将所有列表合并成一个大列表。最后,使用pandas库将这个大列表转换成DataFrame,并输出到Excel表格中。输出的Excel表格文件名为`history_books.xlsx`,不包含行索引。
阅读全文