生成一个带有GUI前端界面和数据可视化功能的爬虫源码
时间: 2024-03-01 19:54:02 浏览: 156
好的,以下是一个简单的带有GUI前端界面和数据可视化的爬虫源码示例,供您参考:
```python
import time
from tkinter import *
from tkinter import messagebox  # submodule: not pulled in by "from tkinter import *"

import matplotlib.pyplot as plt
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
# 定义GUI界面
class App:
    """Tkinter GUI for a small breadth-first web crawler.

    The form takes a start URL, a crawl depth, and a per-request interval.
    Results (page title + URL) are saved to data.csv and mirrored into a
    local MySQL table; a second button plots title frequencies.
    """

    def __init__(self, master):
        """Build the input form and the two action buttons on *master*."""
        self.master = master
        master.title("爬虫程序")
        # Row 0: start URL
        self.label_url = Label(master, text="网站链接:")
        self.label_url.grid(row=0, column=0)
        self.entry_url = Entry(master)
        self.entry_url.grid(row=0, column=1)
        # Row 1: crawl depth (number of BFS levels)
        self.label_depth = Label(master, text="爬取深度:")
        self.label_depth.grid(row=1, column=0)
        self.entry_depth = Entry(master)
        self.entry_depth.grid(row=1, column=1)
        # Row 2: pause between HTTP requests, in seconds
        self.label_interval = Label(master, text="爬取时间间隔:")
        self.label_interval.grid(row=2, column=0)
        self.entry_interval = Entry(master)
        self.entry_interval.grid(row=2, column=1)
        # Row 3: action buttons
        self.button_start = Button(master, text="开始爬取", command=self.start_crawler)
        self.button_start.grid(row=3, column=0)
        self.button_visualize = Button(master, text="数据可视化", command=self.visualize_data)
        self.button_visualize.grid(row=3, column=1)

    def crawl(self, url, depth, interval=0):
        """Breadth-first crawl starting at *url*, for *depth* levels.

        Args:
            url: starting page URL.
            depth: number of link-following levels to visit.
            interval: seconds to sleep between requests (default 0 keeps the
                previous behaviour for existing callers).

        Returns:
            list of {"title": ..., "url": ...} dicts, one per fetched page.
        """
        visited = set()      # URLs already fetched
        unvisited = {url}    # frontier for the current level
        data = []
        for _ in range(depth):
            next_level = set()
            for link in unvisited:
                try:
                    # Timeout prevents one dead host from hanging the GUI forever.
                    response = requests.get(link, timeout=10)
                    soup = BeautifulSoup(response.content, "html.parser")
                    # Pages without a <title> tag yield soup.title is None.
                    title = soup.title.string if soup.title else ""
                    data.append({"title": title, "url": link})
                    visited.add(link)
                    # Queue absolute outgoing links for the next level.
                    for anchor in soup.find_all("a"):
                        href = anchor.get("href")
                        # href is None for <a> tags without an href attribute.
                        if href and href.startswith("http") and href not in visited:
                            next_level.add(href)
                except requests.RequestException:
                    # Best-effort: skip unreachable pages, keep crawling.
                    pass
                if interval:
                    time.sleep(interval)  # be polite to the target server
            unvisited = next_level
        return data

    def start_crawler(self):
        """Read the form, run the crawl, persist results to CSV and MySQL."""
        url = self.entry_url.get()
        depth = int(self.entry_depth.get())
        interval = int(self.entry_interval.get())
        # Fix: interval was previously read but never used.
        data = self.crawl(url, depth, interval)
        # Local CSV copy — also the input for visualize_data().
        df = pd.DataFrame(data)
        df.to_csv("data.csv", index=False)
        # Mirror into MySQL; ensure the connection is closed even on failure.
        conn = pymysql.connect(host="localhost", user="root", password="123456",
                               db="test", charset="utf8")
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    "CREATE TABLE IF NOT EXISTS data "
                    "(id INT AUTO_INCREMENT PRIMARY KEY, title VARCHAR(255), url VARCHAR(255))"
                )
                for item in data:
                    # Parameterized query: page titles are untrusted input.
                    cursor.execute("INSERT INTO data (title, url) VALUES (%s, %s)",
                                   (item["title"], item["url"]))
            conn.commit()
        finally:
            conn.close()
        messagebox.showinfo("爬虫程序", "爬虫完成!")

    def visualize_data(self):
        """Plot a bar chart of page-title frequencies read from data.csv."""
        df = pd.read_csv("data.csv")
        df["title"].value_counts().plot(kind="bar")
        plt.title("网页标题统计")
        plt.xlabel("网页标题")
        plt.ylabel("出现次数")
        plt.show()  # opens a separate Matplotlib window
# Launch the GUI only when run as a script, so the module can be imported
# (e.g. for testing) without opening a window.
if __name__ == "__main__":
    root = Tk()
    app = App(root)
    root.mainloop()
```
在这个示例中,我们使用Python的Tkinter模块创建了一个简单的GUI界面,用户可以在界面上输入要爬取的网站链接、爬取深度和爬取时间间隔。当用户点击“开始爬取”按钮时,程序会使用requests和BeautifulSoup库爬取网页数据,并将数据保存到本地文件和数据库中。当用户点击“数据可视化”按钮时,程序会使用pandas和Matplotlib库对已保存的数据进行统计,并弹出图表窗口展示结果。
阅读全文