import re import json import time import requests import datetime import pymysql import selenium from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime from sqlalchemy.orm import sessionmaker from sqlalchemy.ext.declarative import declarative_base from selenium.webdriver import Edge, EdgeOptions # 创建浏览器对象 options = EdgeOptions() options.use_chromium = True options.binary_location = r'C:\Users\邓枫林\PycharmProjects\pythonProject\edgedriver_win64\msedgedriver.exe' browser = Edge(options=options) wait = WebDriverWait(browser, 10) # 打开微博话题页面 url = 'https://weibo.com/n/%E4%B8%AD%E5%9B%BD%E9%A3%9F%E5%93%81%E5%8D%AB%E7%94%9F?from=feed&loc=at&nick=%E4%B8%AD%E5%9B%BD%E9%A3%9F%E5%93%81%E5%8D%AB%E7%94%9F&order=hot' browser.get(url) # 等待页面加载完成 wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete") browser = selenium.webdriver.Edge(executable_path='C:/Users/邓枫林/PycharmProjects/pythonProject/edgedriver_win64/msedgedriver.exe') wait = selenium.webdriver.support.ui.WebDriverWait(browser, 10) # 监测页面是否包含“高校类”敏感词汇 if '高校类' in browser.page_source: # 获取原始微博 weibo = browser.find_element_by_css_selector('.WB_feed_detail .WB_text.W_f14').text # 获取转发该微博的用户昵称和转发内容 reposts = [] repost_items = browser.find_elements_by_css_selector('.list_ul .list_li') for item in repost_items: nickname = item.find_element_by_css_selector('.WB_text.W_f14').text content = item.find_element_by_css_selector('.WB_text.W_f14 + .comment_txt').text reposts.append({'nickname': nickname, 'content': content}) # 关闭浏览器 browser.quit() # 将微博和转发内容存入MySQL数据库中 Base = declarative_base() class Weibo(Base): __tablename__ = 'weibo_user' id = Column(Integer, primary_key=True) content = Column(Text) create_time = Column(DateTime) class Repost(Base): __tablename__ = 'weibo_repost' id = Column(Integer, primary_key=True) weibo_id = Column(Integer) nickname = Column(String(50)) content = Column(Text) engine = create_engine('mysql+pymysql://root:root@hostname:port/weibo?charset=utf8mb4') Session = sessionmaker(bind=engine) session = Session() now = datetime.datetime.now() weibo_obj = Weibo(content=weibo, create_time=now) session.add(weibo_obj) session.commit() for repost in reposts: repost_obj = Repost(weibo_id=weibo_obj.id, nickname=repost['nickname'], content=repost['content']) session.add(repost_obj) session.commit() session.close() else: # 关闭浏览器 browser.quit()
时间: 2024-03-26 21:37:18 浏览: 117
这是一个使用 Selenium 和 Python 进行微博爬虫的代码,通过打开指定的微博话题页面,判断页面是否包含“高校类”敏感词汇,如果包含敏感词汇,则获取原始微博和转发该微博的用户昵称和转发内容,并将这些信息存入 MySQL 数据库中。如果不包含敏感词汇,则直接关闭浏览器。在使用代码前,需要安装相关的 Python 库以及浏览器驱动程序,并且要将代码中的数据库连接信息修改为自己的信息。需要注意的是,爬虫行为需要遵守相关法律法规和网站的规定,不得用于非法用途。
相关问题
import requests from bs4 import BeautifulSoup import pymysql import random from selenium import webdriver from selenium.webdriver import ChromeOptions import re import time import requests
这段代码有一个语法错误,应该在 import 语句之间加上换行符。正确的写法如下:
```
import requests
from bs4 import BeautifulSoup
import pymysql
import random
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import re
import time
import requests
```
import re import json import time import requests import datetime import pymysql import selenium from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime from sqlalchemy.orm import sessionmaker from sqlalchemy.ext.declarative import declarative_base from selenium.webdriver import Edge, EdgeOptions # 创建浏览器对象 options = EdgeOptions() options.use_chromium = True options.binary_location = r'C:\Users\邓枫林\PycharmProjects\pythonProject\edgedriver_win64\msedgedriver.exe' browser = Edge(options=options) wait = WebDriverWait(browser, 10) # 打开微博话题页面 url = 'https://weibo.com/n/%E4%B8%AD%E5%9B%BD%E9%A3%9F%E5%93%81%E5%8D%AB%E7%94%9F?from=feed&loc=at&nick=%E4%B8%AD%E5%9B%BD%E9%A3%9F%E5%93%81%E5%8D%AB%E7%94%9F&order=hot' browser.get(url) # 等待页面加载完成 wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete") browser = selenium.webdriver.Edge(executable_path='C:/Users/邓枫林/PycharmProjects/pythonProject/edgedriver_win64/msedgedriver.exe') wait = selenium.webdriver.support.ui.WebDriverWait(browser, 10) # 监测页面是否包含“高校类”敏感词汇 if '高校类' in browser.page_source: # 获取原始微博 weibo = browser.find_element_by_css_selector('.WB_feed_detail .WB_text.W_f14').text # 获取转发该微博的用户昵称和转发内容 reposts = [] repost_items = browser.find_elements_by_css_selector('.list_ul .list_li') for item in repost_items: nickname = item.find_element_by_css_selector('.WB_text.W_f14').text content = item.find_element_by_css_selector('.WB_text.W_f14 + .comment_txt').text reposts.append({'nickname': nickname, 'content': content}) # 关闭浏览器 browser.quit() # 将微博和转发内容存入MySQL数据库中 Base = declarative_base() class Weibo(Base): tablename = 'weibo_user' id = Column(Integer, primary_key=True) content = Column(Text) create_time = Column(DateTime) class Repost(Base): tablename = 'weibo_repost' id = Column(Integer, primary_key=True) weibo_id = Column(Integer) nickname = Column(String(50)) content = Column(Text) engine = create_engine('mysql+pymysql://root:root@hostname:port/weibo?charset=utf8mb4') Session = sessionmaker(bind=engine) session = Session() now = datetime.datetime.now() weibo_obj = Weibo(content=weibo, create_time=now) session.add(weibo_obj) session.commit() for repost in reposts: repost_obj = Repost(weibo_id=weibo_obj.id, nickname=repost['nickname'], content=repost['content']) session.add(repost_obj) session.commit() session.close() else: # 关闭浏览器 browser.quit()
这段代码是一个 Python 程序,使用了 Selenium 库模拟浏览器行为,从微博话题页面中获取微博和转发内容,并将它们存入 MySQL 数据库中。其中,程序打开了一个 Microsoft Edge 浏览器对象,等待页面加载完成后进行操作。如果页面包含“高校类”敏感词汇,则获取微博和转发内容,并将它们存入数据库。如果页面不包含该敏感词汇,则直接关闭浏览器。
阅读全文