Python爬虫:爬取豆瓣电影Top250并进行可视化的代码
时间: 2023-11-28 22:05:35 浏览: 122
以下是使用Python爬取豆瓣电影Top250并进行可视化的代码示例:
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
# Fetch the movie data from one listing page.
def get_movies(url):
    """Scrape one Douban Top 250 listing page into a list of movie rows.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with one ``[name, info, rating, comment_count, quote]`` entry
        (all strings) per ``<div class="info">`` element found on the page.
        ``quote`` is an empty string when the entry has no quote paragraph.

    Raises:
        requests.RequestException: If the request times out, fails at the
            network level, or the server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # A timeout keeps a stalled connection from hanging the scraper forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error response instead of parsing an error page
    # and silently returning no movies.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    movies = []
    for movie in soup.find_all('div', class_='info'):
        hd = movie.find('div', class_='hd')
        movie_name = hd.find('span', class_='title').text

        bd = movie.find('div', class_='bd')
        # Text of the first <p> in the body, kept verbatim.
        movie_info = bd.find('p').text

        star = bd.find('div', class_='star')
        rating_num = star.find('span', class_='rating_num').text
        # Fourth <span> of the star block with its last three characters
        # dropped (presumably the trailing "人评价" suffix after the count).
        comment_num = star.find_all('span')[3].text[:-3]

        # find() returns None when the entry has no quote paragraph, so
        # guard each step rather than chaining attribute access.
        quote_tag = bd.find('p', class_='quote')
        quote_span = quote_tag.find('span') if quote_tag is not None else None
        quote = quote_span.text if quote_span is not None else ''

        movies.append([movie_name, movie_info, rating_num, comment_num, quote])
    return movies
# Save the movie data to a CSV file.
def save_to_csv(movies):
    """Write the movie rows to ``douban_top250.csv`` in the working directory.

    Args:
        movies: Sequence of five-item rows in the order name, info, rating,
            comment count, quote.

    The file is UTF-8 encoded, starts with a header row, and omits the
    DataFrame index column.
    """
    column_names = ['电影名称', '电影信息', '评分', '评论数', '经典语录']
    frame = pd.DataFrame(data=movies, columns=column_names)
    frame.to_csv(
        path_or_buf='douban_top250.csv',
        encoding='utf-8',
        index=False,
    )
# Plot a histogram of the Douban Top 250 rating distribution.
def plot_rating_distribution():
    """Read ``douban_top250.csv`` and show a histogram of the rating column.

    The ratings come from the ``评分`` column and are drawn as a 20-bin
    histogram. The axis labels and title are Chinese, so any installed
    CJK-capable font from a short candidate list is put ahead of the
    configured sans-serif fonts while the figure is drawn. When none of the
    candidates is installed, the font settings are left untouched.
    """
    # Local import: only this function needs the font manager.
    from matplotlib import font_manager

    df = pd.read_csv('douban_top250.csv')
    ratings = df['评分'].astype(float).tolist()

    # Only list fonts that are actually installed, so Matplotlib does not
    # warn about each missing family.
    cjk_candidates = [
        'SimHei',
        'Microsoft YaHei',
        'PingFang SC',
        'Noto Sans CJK SC',
        'WenQuanYi Micro Hei',
        'Arial Unicode MS',
    ]
    installed = {font.name for font in font_manager.fontManager.ttflist}
    preferred = [name for name in cjk_candidates if name in installed]
    sans_serif = preferred + list(plt.rcParams['font.sans-serif'])

    # rc_context scopes the font override to this figure. show() stays
    # inside the block because fonts are resolved when the figure is drawn.
    with plt.rc_context({'font.sans-serif': sans_serif}):
        plt.hist(ratings, bins=20, facecolor='blue', alpha=0.5)
        plt.xlabel('电影评分')
        plt.ylabel('数量')
        plt.title('豆瓣电影Top250评分分布')
        plt.show()
if __name__ == '__main__':
    # The list is paged 25 entries at a time: start = 0, 25, ..., 225.
    page_urls = [
        'https://movie.douban.com/top250?start={}&filter='.format(offset)
        for offset in range(0, 250, 25)
    ]

    # Collect the rows of every page in listing order.
    movies = []
    for page_url in page_urls:
        movies.extend(get_movies(page_url))

    # Persist the combined result once, then plot from the saved file.
    save_to_csv(movies)
    plot_rating_distribution()
```
该代码会首先爬取豆瓣电影Top250的数据并保存到CSV文件中,然后绘制评分分布直方图并显示出来。您可以根据需要进行修改和调整。
阅读全文