Python 爬取豆瓣 Top250 电影数据并分析电影类型的代码
时间: 2023-10-02 10:08:11 浏览: 86
好的,以下是相应的Python代码:
```python
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Base address of the ranking list; get_movies() appends the paging query to it.
url = 'https://movie.douban.com/top250'

# Header set passed to every requests.get() call in this script.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
def get_movie_type(movie_url: str) -> list:
    """Download a movie detail page and return its genre labels.

    Args:
        movie_url: URL of the movie's detail page.

    Returns:
        A list holding the ``.string`` of every ``span[property="v:genre"]``
        that is a direct child of ``div#info``, in document order. The list
        is empty when nothing matches.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(movie_url, headers=headers, timeout=10)
    # Surface HTTP errors instead of parsing an error page into an empty list.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    annotations = soup.select('div#info > span[property="v:genre"]')
    return [a.string for a in annotations]
def parse_movie_page(movie_url: str) -> dict:
    """Download one movie detail page and extract its basic fields.

    The page is requested exactly once; title, rating, director, cast and
    genres are all read from that single response.

    Args:
        movie_url: URL of the movie's detail page.

    Returns:
        A dict with the keys ``'电影名称'`` (title), ``'评分'`` (rating),
        ``'导演'`` (first director), ``'主演'`` (list of cast names) and
        ``'类型'`` (list of genre labels).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
        IndexError: If the title, rating or director element is not present
            in the page.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(movie_url, headers=headers, timeout=10)
    # Surface HTTP errors here rather than as an IndexError further down.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    name = soup.select('span[property="v:itemreviewed"]')[0].string
    rating_num = soup.select('strong[property="v:average"]')[0].string
    director = soup.select('a[rel="v:directedBy"]')[0].string
    actors = [a.string for a in soup.select('a[rel="v:starring"]')]
    # Read genres from the page already in hand; delegating to
    # get_movie_type() would download the same URL a second time.
    genres = [g.string for g in soup.select('div#info > span[property="v:genre"]')]
    return {'电影名称': name, '评分': rating_num, '导演': director, '主演': actors, '类型': genres}
def get_movies(pages=10):
    """Walk the ranking list pages and collect details for every linked movie.

    Args:
        pages: Number of list pages to request. Page ``i`` (zero-based) is
            fetched with the query ``?start={i * 25}&filter=``.

    Returns:
        A list of the dicts produced by ``parse_movie_page``, in the order
        the links were encountered.

    Raises:
        requests.RequestException: If a list-page request fails, times out,
            or returns a 4xx/5xx status. Exceptions raised by
            ``parse_movie_page`` are not caught and propagate as well.
    """
    movie_list = []
    for i in range(pages):
        url_index = f'{url}?start={i * 25}&filter='
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(url_index, headers=headers, timeout=10)
        # Stop on an HTTP error instead of parsing an error page as "no movies".
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
        # NOTE(review): '>' only matches an <a> that is a direct child of
        # div.item -- confirm this against the live page markup.
        movies = soup.select('.grid_view li div.item > a')
        for movie in movies:
            movie_url = movie.get('href')
            if not movie_url:
                # An anchor without href gives nothing to request.
                continue
            movie_list.append(parse_movie_page(movie_url))
    return movie_list
def analyze_movie_type(movie_list: list) -> dict:
    """Count how many movies carry each genre label.

    Args:
        movie_list: Movie dicts; each may hold an iterable of genre labels
            under the key ``'类型'``. A missing key, ``None`` or an empty
            value is treated as "no genres" for that movie.

    Returns:
        A plain dict mapping each genre label to the number of times it
        occurs, with keys in first-seen order.
    """
    counts = Counter()
    for movie in movie_list:
        # `or ()` keeps a movie without genre data from raising TypeError.
        counts.update(movie.get('类型') or ())
    # Hand back a plain dict, the same return type as the manual tally it replaces.
    return dict(counts)
if __name__ == '__main__':
    # Crawl the first 10 list pages, then tally the genres of what was found.
    movies = get_movies(10)
    types = analyze_movie_type(movies)
    # One row per genre, ordered from most to least frequent.
    types_df = pd.DataFrame(
        {'类型': [*types.keys()], '数量': [*types.values()]}
    ).sort_values(by='数量', ascending=False)
    print(types_df)
```
这段代码会依次请求豆瓣电影 Top250 的前 10 个列表页(每页以 25 条为步长翻页),再逐一访问列表中每部电影的详情页,提取名称、评分、导演、主演和类型;随后统计各类型出现的次数,按数量从大到小排序,最后输出结果。
阅读全文