```
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pymysql

url = "https://movie.douban.com/subject/30391186/comments?sort=new_score&status=P"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

comments = []
for comment in soup.select(".comment-item"):
    content = comment.select_one(".comment-content").get_text().strip()
    author = comment.select_one(".comment-info a").get_text().strip()
    likes = comment.select_one(".votes").get_text().strip()
    time = comment.select_one(".comment-time").get_text().strip()
    location = comment.select_one(".comment-location").contents[-1].strip()
    comments.append({
        "content": content,
        "author": author,
        "likes": likes,
        "time": time,
        "location": location
    })
pprint(comments)

db = pymysql.connect(host="localhost", user="root", password="password", database="test")
cursor = db.cursor()
for comment in comments:
    sql = "INSERT INTO comments (content, author, likes, time, location) VALUES (%s, %s, %s, %s, %s)"
    values = (comment["content"], comment["author"], comment["likes"], comment["time"], comment["location"])
    cursor.execute(sql, values)
db.commit()
```
I want to crawl the first five pages of comments.
Sure. Douban paginates comments with a `start` query parameter, 20 comments per page, so you can wrap your scraping logic in a loop over the first five pages:
```
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pymysql

# Open one database connection and reuse it for all pages
db = pymysql.connect(host="localhost", user="root", password="password", database="test")
cursor = db.cursor()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
sql = "INSERT INTO comments (content, author, likes, time, location) VALUES (%s, %s, %s, %s, %s)"

for page in range(5):
    # Douban serves 20 comments per page: start = 0, 20, 40, 60, 80
    url = f"https://movie.douban.com/subject/30391186/comments?start={page*20}&limit=20&sort=new_score&status=P"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    comments = []
    for comment in soup.select(".comment-item"):
        content = comment.select_one(".comment-content").get_text().strip()
        author = comment.select_one(".comment-info a").get_text().strip()
        likes = comment.select_one(".votes").get_text().strip()
        time = comment.select_one(".comment-time").get_text().strip()
        # The location span can be missing on some comments, so guard against None
        location_el = comment.select_one(".comment-location")
        location = location_el.contents[-1].strip() if location_el and location_el.contents else ""
        comments.append({
            "content": content,
            "author": author,
            "likes": likes,
            "time": time,
            "location": location
        })
    pprint(comments)

    # Insert every comment from this page, then commit before fetching the next page
    for comment in comments:
        values = (comment["content"], comment["author"], comment["likes"],
                  comment["time"], comment["location"])
        cursor.execute(sql, values)
    db.commit()

db.close()
```
This code loops five times, fetching 20 comments per page, and inserts each page's comments into the database before moving on; the connection is opened once at the start and closed at the end.
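Note that the `INSERT` assumes a `comments` table already exists in the `test` database. If you haven't created it yet, here is a minimal sketch; the column types are assumptions based on the scraped fields (everything is stored as the scraped string), not a schema from the original post:
```
# Sketch only: create the comments table the INSERT statement expects.
# Column names come from the code above; the types are guesses.
create_sql = """
CREATE TABLE IF NOT EXISTS comments (
    id       INT AUTO_INCREMENT PRIMARY KEY,
    content  TEXT,
    author   VARCHAR(255),
    likes    VARCHAR(32),
    time     VARCHAR(64),
    location VARCHAR(64)
) CHARACTER SET utf8mb4
"""
cursor.execute(create_sql)
db.commit()
```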
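If you want fewer round-trips to MySQL, pymysql's `cursor.executemany` can insert a whole page of comments in one call. A sketch that would replace the per-row insert loop above, assuming the same `comments` list and `sql` statement:
```
# Sketch: batch-insert one page of comments with a single executemany call
rows = [(c["content"], c["author"], c["likes"], c["time"], c["location"])
        for c in comments]
cursor.executemany(sql, rows)
db.commit()
```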