帮我用 Python 写一个豆瓣电影的爬虫,要求以面向对象的方法编写:遍历第 1 到第 10 页,将结果存入 Excel 表格,并将单条电影的详情存入 Word。
时间: 2023-05-21 08:05:13 浏览: 114
好的,我可以帮您编写一个豆瓣电影的爬虫。以下是代码示例:
```python
import requests
from bs4 import BeautifulSoup
import openpyxl
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
import os
from docx import Document
class DoubanMovieSpider:
    """Crawl the Douban Top 250 list and export the results to Excel and Word.

    Listing pages are fetched from ``self.url``; each movie's detail page is
    then fetched and parsed into a dict that is collected in ``self.movies``.
    """

    # Keys of the dict built by get_movie_info, in spreadsheet column order.
    # Columns are filled by key, so they never depend on dict ordering.
    HEADER = ('电影名称', '导演', '电影类型', '评分', '评价人数', '简介')
    PAGE_SIZE = 25  # step of the ``start`` query parameter between pages
    TIMEOUT = 10  # seconds before a request is abandoned
    EXCEL_PATH = 'DoubanMovie.xlsx'
    WORD_PATH = 'DoubanMovie.docx'
    SHEET_TITLE = 'DoubanMovie'

    def __init__(self):
        """Set the listing URL, the request headers and the result list."""
        self.url = 'https://movie.douban.com/top250'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        self.movies = []

    @staticmethod
    def _text(node):
        """Return the stripped text of a parsed tag, or '' if the tag is missing."""
        return node.text.strip() if node is not None else ''

    def _page_url(self, page_index):
        """Return the URL of the zero-based ``page_index``-th listing page."""
        return self.url + '?start=' + str(page_index * self.PAGE_SIZE) + '&filter='

    def _fetch_soup(self, url):
        """Download ``url`` and return its HTML parsed with BeautifulSoup.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-2xx response (via ``raise_for_status``).
        """
        # A timeout keeps a stalled connection from hanging the whole crawl.
        res = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        # Fail loudly on an error response instead of parsing an error page.
        res.raise_for_status()
        return BeautifulSoup(res.text, 'html.parser')

    def get_movie_info(self, url):
        """Fetch one movie detail page and return its fields as a dict.

        Args:
            url: address of the movie's detail page.

        Returns:
            A dict keyed by the entries of ``HEADER``. A field whose tag is
            absent from the page is returned as an empty string.
        """
        soup = self._fetch_soup(url)
        directors = soup.find_all('a', rel="v:directedBy")
        genres = soup.find_all('span', property="v:genre")
        return {
            '电影名称': self._text(soup.find('span', property="v:itemreviewed")),
            '导演': ' / '.join(tag.text for tag in directors),
            '电影类型': ' / '.join(tag.text for tag in genres),
            '评分': self._text(soup.find('strong', property="v:average")),
            '评价人数': self._text(soup.find('span', property="v:votes")),
            '简介': self._text(soup.find('span', property="v:summary")),
        }

    def get_movies(self, pages=10):
        """Walk the listing pages and append every movie's info to ``self.movies``.

        Args:
            pages: number of listing pages to visit, starting from the first.
        """
        for page_index in range(pages):
            soup = self._fetch_soup(self._page_url(page_index))
            for entry in soup.find_all('div', class_='info'):
                link = entry.find('a')
                # Skip entries without a usable detail link rather than crash.
                if link is None or not link.get('href'):
                    continue
                self.movies.append(self.get_movie_info(link['href']))

    def save_excel(self):
        """Write ``self.movies`` to the 'DoubanMovie' sheet of DoubanMovie.xlsx.

        If the workbook already exists it is loaded so its other sheets are
        kept. An existing 'DoubanMovie' sheet is emptied and rewritten, so no
        stale rows from an earlier run survive. Otherwise a new sheet with
        that title is added.
        """
        if os.path.exists(self.EXCEL_PATH):
            wb = openpyxl.load_workbook(self.EXCEL_PATH)
            if self.SHEET_TITLE in wb.sheetnames:
                ws = wb[self.SHEET_TITLE]
                # Drop everything left over from a previous, possibly longer, run.
                ws.delete_rows(1, ws.max_row)
            else:
                ws = wb.create_sheet(title=self.SHEET_TITLE)
        else:
            wb = Workbook()
            ws = wb.active
            ws.title = self.SHEET_TITLE

        # The header is always written, including when reusing an existing file.
        for col_num, column_title in enumerate(self.HEADER, 1):
            ws.cell(row=1, column=col_num, value=column_title)
        for row_num, movie in enumerate(self.movies, 2):
            for col_num, key in enumerate(self.HEADER, 1):
                ws.cell(row=row_num, column=col_num, value=movie.get(key, ''))
        wb.save(self.EXCEL_PATH)

    def save_word(self):
        """Write every movie in ``self.movies`` to DoubanMovie.docx.

        Each movie gets a title heading, one paragraph per field, and a
        trailing page break. The document is rebuilt from scratch on every
        call, so an existing file is overwritten rather than silently skipped.
        """
        document = Document()
        for movie in self.movies:
            document.add_heading(movie['电影名称'], 0)
            document.add_paragraph('导演:' + movie['导演'])
            document.add_paragraph('类型:' + movie['电影类型'])
            document.add_paragraph('评分:' + movie['评分'])
            document.add_paragraph('评价人数:' + movie['评价人数'])
            document.add_paragraph('简介:' + movie['简介'])
            document.add_page_break()
        document.save(self.WORD_PATH)

    def run(self):
        """Crawl all pages, then export the results to Excel and Word."""
        self.get_movies()
        self.save_excel()
        self.save_word()
        print('电影信息已成功存储到Excel和Word中!')
# Instantiate the spider and run it only when this file is executed as a
# script, so that merely importing the module does not start a full crawl.
if __name__ == '__main__':
    spider = DoubanMovieSpider()
    spider.run()
```
运行后,程序会在当前目录下生成一个名为"DoubanMovie.xlsx"的Excel表格和一个名为"DoubanMovie.docx"的Word文档,其中Excel表格保存了豆瓣电影Top250的电影信息,包括电影名称、导演、电影类型、评分、评价人数、简介等,而Word文档则将每部电影的具体详情以标题/段落的形式呈现。
阅读全文