from bs4 import BeautifulSoup import requests import time import pymysql conn = pymysql.connect(host='localhost', user='root', password='123456', charset='utf8') cursor = conn.cursor() cursor.execute('create database wyya;') cursor.execute('use wyya;') create_Tb = 'create table sj(地址 varchar(100),标题 varchar(100),播放量 varchar(50),作者 varchar(50));' cursor.execute(create_Tb) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } for i in range(0, 1505, 35): print(i) time.sleep(2) url = 'https://music.163.com/discover/playlist/?cat=华语&order=hot&limit=35&offset=' + str(i)#修改这里即可 response = requests.get(url=url, headers=headers) html = response.text soup = BeautifulSoup(html, 'html.parser') # 获取包含歌单详情页网址的标签 ids = soup.select('.dec a') # 获取包含歌单索引页信息的标签 lis = soup.select('#m-pl-container li') print(len(lis)) for j in range(len(lis)): # 获取歌单详情页地址 url = ids[j]['href'] # 获取歌单标题 title = ids[j]['title'] # 获取歌单播放量 play = lis[j].select('.nb')[0].get_text() # 获取歌单贡献者名字 user = lis[j].select('p')[1].select('a')[0].get_text() # 输出歌单索引页信息 print(url, title, play, user) insert_Tb = 'insert into sj(地址,标题,播放量,作者) values(%s,%s,%s,%s);' val = (url, title, play, user) cursor.execute(insert_Tb, val) cursor.execute("select *from sj;") conn.commit(); data = cursor.fetchall() for bases in data: print(bases) conn.close()写出优化后的这段代码,使爬取到的所有数据全部存入数据库
时间: 2024-04-28 21:25:56 浏览: 140
from bs4 import BeautifulSoup
import requests
import time
import pymysql
# --- configuration ----------------------------------------------------------
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
BASE_URL = 'https://music.163.com/discover/playlist/?cat=华语&order=hot&limit=35&offset='
PAGE_SIZE = 35   # playlists per index page (matches limit=35 in the URL)
MAX_OFFSET = 1505


def open_database():
    """Connect to MySQL and ensure the wyya database and sj table exist.

    Returns:
        (conn, cursor): an open pymysql connection and a cursor bound to wyya.
    """
    # utf8mb4 instead of utf8: MySQL's 'utf8' is 3-byte and can corrupt some
    # Chinese/emoji characters scraped from playlist titles.
    conn = pymysql.connect(host='localhost', user='root',
                           password='123456', charset='utf8mb4')
    cursor = conn.cursor()
    cursor.execute('create database if not exists wyya;')
    cursor.execute('use wyya;')
    cursor.execute(
        'create table if not exists sj('
        '地址 varchar(255),标题 varchar(255),播放量 varchar(50),作者 varchar(50));'
    )
    return conn, cursor


def parse_playlists(html):
    """Parse one playlist index page.

    Args:
        html: the raw HTML text of a discover/playlist page.
    Returns:
        A list of (url, title, play_count, author) tuples, one per playlist.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select('.dec a')            # anchors with href + title
    items = soup.select('#m-pl-container li')  # list items with play count/author
    rows = []
    # zip() instead of parallel indexing: if the two selects ever return
    # different lengths, the original ids[j] lookup raised IndexError and
    # aborted the whole crawl.
    for link, item in zip(links, items):
        try:
            rows.append((
                link['href'],
                link['title'],
                item.select('.nb')[0].get_text(),
                item.select('p')[1].select('a')[0].get_text(),
            ))
        except (KeyError, IndexError):
            # Malformed entry (missing attribute or sub-tag) — skip just it.
            continue
    return rows


def crawl():
    """Scrape every index page and persist each page's rows immediately.

    Commits after every page (instead of once at the very end) so a failure
    partway through the run never discards the rows already scraped.
    """
    conn, cursor = open_database()
    insert_sql = 'insert into sj(地址,标题,播放量,作者) values(%s,%s,%s,%s);'
    try:
        for offset in range(0, MAX_OFFSET, PAGE_SIZE):
            print(offset)
            time.sleep(2)  # be polite to the server between page fetches
            try:
                response = requests.get(BASE_URL + str(offset),
                                        headers=HEADERS, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                # One bad page must not kill the whole crawl — log and move on.
                print('page offset %d failed: %s' % (offset, exc))
                continue
            rows = parse_playlists(response.text)
            print(len(rows))
            for row in rows:
                print(*row)
            if rows:
                # Batch insert + per-page commit: all scraped data is durable
                # even if a later iteration raises.
                cursor.executemany(insert_sql, rows)
                conn.commit()
        # Dump everything stored so far for verification.
        cursor.execute('select * from sj;')
        for record in cursor.fetchall():
            print(record)
    finally:
        # Always release the connection, even on an unexpected error.
        conn.close()


if __name__ == '__main__':
    crawl()
阅读全文