爬取中华古诗词网站的 Python 代码
时间: 2024-03-02 12:44:52 浏览: 25
以下是爬取中华古诗词网站的Python代码:
```python
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_poet_info(url):
    """Fetch a poet page and extract the poet's name and introduction.

    Args:
        url: Address of the poet (author) page to download.

    Returns:
        A dict with two keys: ``'name'`` holds the text of the first element
        matching ``.son1 h1`` and ``'intro'`` holds the text of the first
        element matching ``.son2 p``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server replied with an HTTP error status.
        AttributeError: The page does not contain one of the expected
            elements (``select_one`` returned ``None``).
    """
    # Without a timeout a stalled server would block the crawler forever.
    res = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    poet_name = soup.select_one('.son1 h1').text
    poet_intro = soup.select_one('.son2 p').text
    return {'name': poet_name, 'intro': poet_intro}
def get_detail_links(url):
    """Collect the links to poem detail pages listed on a page.

    Args:
        url: Address of the page whose ``.typecont`` sections list the poems.

    Returns:
        A list of absolute URLs, one per ``<a href=...>`` found inside an
        element with class ``typecont``, in document order.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server replied with an HTTP error status.
    """
    # Without a timeout a stalled server would block the crawler forever.
    res = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # ``a[href]`` skips anchors without an href (which would raise KeyError),
    # and urljoin turns relative hrefs into absolute URLs so the results can
    # be passed straight to requests.get; absolute hrefs pass through as-is.
    detail_links = [
        urljoin(url, a['href']) for a in soup.select('.typecont a[href]')
    ]
    return detail_links
def get_poem_content(url):
    """Fetch a poem detail page and extract its title, dynasty, author, text.

    Args:
        url: Address of the poem detail page to download.

    Returns:
        A dict with the keys ``'title'`` (text of ``.cont h1``),
        ``'dynasty'`` (text of the first ``.cont p a``), ``'author'`` (text of
        ``.cont p a:nth-of-type(2)``) and ``'content'`` (text of
        ``.cont .contson`` with surrounding whitespace stripped).

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server replied with an HTTP error status.
        AttributeError: The page does not contain one of the expected
            elements (``select_one`` returned ``None``).
    """
    # Without a timeout a stalled server would block the crawler forever.
    res = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    poem_title = soup.select_one('.cont h1').text
    # NOTE(review): the first/second link order (dynasty, then author) is
    # taken from the original selectors; confirm against the live markup.
    poem_dynasty = soup.select_one('.cont p a').text
    poem_author = soup.select_one('.cont p a:nth-of-type(2)').text
    poem_content = soup.select_one('.cont .contson').text.strip()
    return {
        'title': poem_title,
        'dynasty': poem_dynasty,
        'author': poem_author,
        'content': poem_content,
    }
def crawl_poet(url):
    """Print a poet's profile followed by every poem linked from the page.

    The poet's name and introduction are printed first. Each link returned by
    ``get_detail_links(url)`` is then fetched with ``get_poem_content`` and
    its title, dynasty, author and content are printed, followed by a
    separator line.

    Args:
        url: Address of the poet (author) page to crawl.

    Returns:
        None. All results are written to standard output.
    """
    poet_info = get_poet_info(url)
    print('诗人:', poet_info['name'])
    print('简介:', poet_info['intro'])

    detail_links = get_detail_links(url)
    for link in detail_links:
        poem_content = get_poem_content(link)
        print('题目:', poem_content['title'])
        print('朝代:', poem_content['dynasty'])
        print('作者:', poem_content['author'])
        print('内容:', poem_content['content'])
        print('------------------------')
# Test run: crawl a single author page and print what crawl_poet finds on it.
crawl_poet('https://so.gushiwen.cn/authors/authorvsw_4d5dd6f7f59aA1.aspx')
```