写出用Python爬虫爬取豆瓣Top250数据并进行可视化的代码
时间: 2023-11-21 10:45:39 浏览: 125
基于python的豆瓣Top250爬虫数据分析可视化(期末大作业).zip
5星 · 资源好评率100%
当然,以下是Python爬虫爬取豆瓣Top250电影数据并进行可视化的代码:
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def get_html(url):
    """Download ``url`` and return the response body as text.

    A GET request is sent with a minimal browser-like ``User-Agent`` header.
    The body is decoded with the encoding requests detects from the content
    (``apparent_encoding``) rather than the one declared in the headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without an explicit timeout requests can block indefinitely on a
        # stalled connection.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Preserve the "empty string on failure" contract, but only for
        # network/HTTP errors: a bare ``except`` would also hide programming
        # errors and swallow KeyboardInterrupt.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
def _parse_vote_count(movie_item):
    """Return the rater count shown in one movie entry, or 0 if none is found."""
    # NOTE(review): assumes the count is rendered as "<digits>人评价" inside a
    # <span> of the entry; confirm against the live page markup.
    for span in movie_item.find_all('span'):
        label = span.get_text(strip=True)
        if label.endswith('人评价'):
            digits = ''.join(ch for ch in label if ch.isdigit())
            if digits:
                return int(digits)
    return 0


def get_movie_info(html):
    """Extract titles, ratings and vote counts from one list page.

    The page is expected to contain an ``<ol class="grid_view">`` whose
    ``<li>`` children each describe one movie.

    Args:
        html: Page markup as text. May be empty (``get_html`` returns ``""``
            on failure).

    Returns:
        A tuple ``(movie_names, movie_ratings, movie_votes)`` of three lists
        of equal length holding, per movie, the title (``str``), the rating
        (``float``) and the number of raters (``int``, 0 when it could not be
        found). All three lists are empty when the page has no movie list.
    """
    movie_names = []
    movie_ratings = []
    movie_votes = []

    soup = BeautifulSoup(html, 'html.parser')
    movie_list = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list is None:
        # Empty or unexpected page: report "no movies" instead of failing
        # with AttributeError on ``None.find_all``.
        return movie_names, movie_ratings, movie_votes

    for movie_item in movie_list.find_all('li'):
        title_tag = movie_item.find('span', attrs={'class': 'title'})
        rating_tag = movie_item.find('span', attrs={'class': 'rating_num'})
        if title_tag is None or rating_tag is None:
            # Skip the whole entry so the three lists stay aligned.
            continue
        movie_names.append(title_tag.get_text())
        movie_ratings.append(float(rating_tag.get_text()))
        # The previous exact-match lookup of a whitespace-padded text node
        # returned None whenever the page whitespace differed, and then
        # crashed on ``.split``; scan the entry's spans instead.
        movie_votes.append(_parse_vote_count(movie_item))
    return movie_names, movie_ratings, movie_votes
def _draw_ranking_chart(df_ranked, value_column, title):
    """Draw one horizontal bar chart of ``value_column``, first row on top."""
    y_pos = np.arange(len(df_ranked))
    fig, ax = plt.subplots(figsize=(10, 10))
    # No ``xerr``: the data carries no uncertainty, so error bars would be
    # fabricated.
    ax.barh(y_pos, df_ranked[value_column], align='center', color='blue')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(df_ranked['电影名称'])
    # Bars are placed bottom-up by default; inverting the axis puts row 0
    # (the best entry) at the top so the chart reads top-to-bottom.
    ax.invert_yaxis()
    ax.set_xlabel(value_column)
    ax.set_title(title)


def draw_top_250_chart(movie_names, movie_ratings, movie_votes):
    """Plot the 30 best-rated and the 30 most-voted movies as bar charts.

    Two figures are created (one per ranking) and shown together with a
    single blocking ``plt.show()`` call.

    Args:
        movie_names: Movie titles.
        movie_ratings: Ratings, aligned with ``movie_names``.
        movie_votes: Rater counts, aligned with ``movie_names``.
    """
    # Matplotlib's default font has no CJK glyphs, so the Chinese titles and
    # labels would render as empty boxes. Prefer common CJK-capable fonts and
    # keep the previously configured fonts as fallback.
    cjk_fonts = ['SimHei', 'Microsoft YaHei', 'PingFang SC',
                 'Noto Sans CJK SC', 'WenQuanYi Micro Hei']
    fallback = [name for name in plt.rcParams['font.sans-serif']
                if name not in cjk_fonts]
    plt.rcParams['font.sans-serif'] = cjk_fonts + fallback

    df_movies = pd.DataFrame({'电影名称': movie_names,
                              '电影评分': movie_ratings,
                              '观众人数': movie_votes})

    # A stable sort keeps the input order among equal values, which makes the
    # charts deterministic when many movies share the same rating.
    # The rows are NOT reversed afterwards: reversing them and inverting the
    # y axis cancelled out and put the best movie at the bottom.
    top_rated = df_movies.sort_values(
        by=['电影评分'], ascending=False, kind='mergesort').head(30)
    _draw_ranking_chart(top_rated, '电影评分', '豆瓣Top250高分电影评分排名')

    top_voted = df_movies.sort_values(
        by=['观众人数'], ascending=False, kind='mergesort').head(30)
    _draw_ranking_chart(top_voted, '观众人数', '豆瓣Top250高分电影观众人数排名')

    plt.show()
if __name__ == "__main__":
    # Script entry point: download every page of the Top 250 list, merge the
    # parsed results and plot them.
    top250_url = 'https://movie.douban.com/top250'
    # NOTE(review): the list is assumed to be paginated 25 movies per page,
    # selected with the ``start`` query parameter (0, 25, ..., 225); fetching
    # only the base URL yields just the first 25 entries.
    page_size = 25
    total_movies = 250

    movie_names = []
    movie_ratings = []
    movie_votes = []
    for start in range(0, total_movies, page_size):
        page_html = get_html(top250_url + '?start=' + str(start))
        if not page_html:
            # get_html returns "" on failure; skip the page instead of
            # handing empty markup to the parser.
            continue
        names, ratings, votes = get_movie_info(page_html)
        movie_names.extend(names)
        movie_ratings.extend(ratings)
        movie_votes.extend(votes)

    if movie_names:
        draw_top_250_chart(movie_names, movie_ratings, movie_votes)
    else:
        print('No movie data could be downloaded; nothing to plot.')
```
这段代码将会从豆瓣网站爬取Top250电影数据,并利用matplotlib进行可视化。希望这可以帮助您!
阅读全文