```python
import requests
from bs4 import BeautifulSoup
from pprint import pprint

url = "https://movie.douban.com/subject/30391186/comments?sort=new_score&status=P"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
comments = []
for comment in soup.select(".comment-item"):
    content = comment.select_one(".comment-content").get_text().strip()
    author = comment.select_one(".comment-info a").get_text().strip()
    likes = comment.select_one(".votes").get_text().strip()
    time = comment.select_one(".comment-time").get_text().strip()
    location = comment.select_one(".comment-location").contents[-1].strip()
    comments.append({
        "content": content,
        "author": author,
        "likes": likes,
        "time": time,
        "location": location
    })
pprint(comments)
```
I want to scrape the first five pages of data.
Time: 2024-04-03 16:36:13  Views: 18
Sure, you can use the following code to scrape the first five pages:
```python
import requests
from bs4 import BeautifulSoup
from pprint import pprint

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Create the list once, outside the loop, so results from earlier
# pages are kept rather than discarded on each iteration.
comments = []
for page in range(5):
    # Douban paginates with a `start` offset: 0, 20, 40, 60, 80 for the first five pages.
    url = "https://movie.douban.com/subject/30391186/comments?start={}&limit=20&sort=new_score&status=P".format(page * 20)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    for comment in soup.select(".comment-item"):
        content = comment.select_one(".comment-content").get_text().strip()
        author = comment.select_one(".comment-info a").get_text().strip()
        likes = comment.select_one(".votes").get_text().strip()
        # Named comment_time to avoid shadowing the stdlib `time` module.
        comment_time = comment.select_one(".comment-time").get_text().strip()
        location = comment.select_one(".comment-location").contents[-1].strip()
        comments.append({
            "content": content,
            "author": author,
            "likes": likes,
            "time": comment_time,
            "location": location
        })

pprint(comments)  # print the comments collected from all five pages
```
In this code, we use a loop to scrape the first five pages. On each iteration we build a new URL, passing the page offset (`page * 20`) as the `start` query parameter, then send a GET request with the requests library. We then parse the HTML response with BeautifulSoup, store each comment's fields in a dict, and finally print the collected results with pprint.
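One caveat: `select_one()` returns `None` when a selector matches nothing (for example, some comments may lack a `.comment-location` node), and calling `.get_text()` on `None` raises `AttributeError`. A minimal, hypothetical `safe_text` helper (not part of the original code) can guard against that:

```python
# A None-safe extraction helper for BeautifulSoup lookups.
# select_one() returns None when a selector matches nothing, so
# chaining .get_text() directly can crash on missing nodes.
def safe_text(node, default=""):
    """Return the stripped text of a node, or `default` if the node is missing."""
    return node.get_text().strip() if node is not None else default
```

With this helper, a field lookup becomes e.g. `likes = safe_text(comment.select_one(".votes"))`, which simply yields an empty string for comments missing that node. Adding a short `time.sleep(1)` between page requests is also advisable, since Douban may rate-limit rapid scraping.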