python爬虫豆瓣top250详细代码
时间: 2023-10-25 09:07:31 浏览: 98
你好!以下是使用 Python 爬取豆瓣 Top250 电影信息的详细代码:
```python
import requests
from bs4 import BeautifulSoup
# Browser-like request headers sent with every HTTP call in this script.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}
def _node_text(node):
    """Return the text of a parsed node, or '' when the lookup found nothing."""
    return node.get_text() if node is not None else ''


def _value_after_label(label):
    """Return the text that follows a label <span>, up to the next <br>.

    Reading only ``label.next_sibling`` breaks when the value is wrapped in
    its own tags (the sibling is then a whitespace string or a Tag), so all
    siblings on the same visual line are concatenated instead.
    """
    pieces = []
    for sibling in label.next_siblings:
        tag_name = getattr(sibling, 'name', None)
        if tag_name == 'br':
            break
        # Tags contribute their visible text; bare strings are used as-is.
        pieces.append(sibling.get_text() if tag_name else str(sibling))
    return ''.join(pieces).strip().replace('\n', '')


def get_movie_info(url):
    """Download one movie detail page and print its main fields.

    Prints the title, rating, directors, actors, country/region, language,
    release date and synopsis. A field whose element is missing from the
    page is printed empty instead of aborting the run.

    Args:
        url: Address of the movie detail page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=header, timeout=10)
    # Fail loudly on error pages rather than parsing them as a movie page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title and rating.
    movie_name = _node_text(soup.find('span', attrs={'property': 'v:itemreviewed'}))
    movie_score = _node_text(soup.find('strong', class_='ll rating_num'))

    # Directors and actors are told apart by the first value of the link's rel attribute.
    directors = []
    actors = []
    subject = soup.find('div', class_='subject clearfix')
    for credit in (subject.find_all('a') if subject is not None else []):
        rel = credit.get('rel') or ['']
        if rel[0] == 'v:directedBy':
            directors.append(credit.get_text())
        elif rel[0] == 'v:starring':
            actors.append(credit.get_text())

    # Synopsis: first <span> inside the report container.
    report = soup.find('div', class_='indent', id='link-report')
    intro_span = report.span if report is not None else None
    movie_intro = _node_text(intro_span).strip().replace('\n', '')

    # Remaining fields are "label: value" lines inside the info box.
    movie_country = ''
    movie_language = ''
    movie_release_date = ''
    info = soup.find('div', id='info')
    for label in (info.find_all('span') if info is not None else []):
        label_text = label.get_text()
        if '制片国家/地区:' in label_text:
            movie_country = _value_after_label(label)
        elif '语言:' in label_text:
            movie_language = _value_after_label(label)
        elif '上映日期:' in label_text:
            movie_release_date = _value_after_label(label)

    # Report the collected fields.
    print('电影名称:', movie_name)
    print('电影评分:', movie_score)
    print('导演:', directors)
    print('演员:', actors)
    print('制片国家/地区:', movie_country)
    print('语言:', movie_language)
    print('上映日期:', movie_release_date)
    print('电影简介:', movie_intro)
    print('\n')
def main():
    """Walk the Top 250 listing pages and print the details of every movie.

    The listing is requested 25 entries at a time (start=0, 25, ..., 225);
    each entry's detail link is handed to get_movie_info.

    Raises:
        requests.RequestException: If a listing request fails, times out,
            or the server answers with an HTTP error status.
    """
    page_urls = ['https://movie.douban.com/top250?start=' + str(start) for start in range(0, 250, 25)]
    for page_url in page_urls:
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(page_url, headers=header, timeout=10)
        # Stop on error pages instead of silently finding no movies in them.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for title_block in soup.find_all('div', class_='hd'):
            link = title_block.a
            # Skip entries without a usable link rather than crashing on them.
            if link is None or not link.get('href'):
                continue
            get_movie_info(link['href'])
# Start the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
```
请注意,这只是一个简单的代码示例。如果您要爬取豆瓣 Top250 电影,请先了解相关法律法规,遵守爬虫道德规范,并尊重网站规则。
阅读全文