python爬虫京东小米手机
时间: 2023-12-22 10:29:24 浏览: 34
以下是使用Python爬取京东小米手机数据的步骤:
1. 导入所需的库:
```python
import requests
from bs4 import BeautifulSoup
import re
import pymysql
import smtplib
from email.mime.text import MIMEText
```
2. 创建一个函数来获取京东手机的大类链接:
```python
def get_category_links():
    """Fetch the JD.com homepage and collect links to category listing pages.

    Returns:
        list[str]: hrefs matching the ``list.jd.com/list.html?cat=...`` pattern
        (presumably protocol-relative, e.g. ``//list.jd.com/...`` — callers
        prepend ``https:``; verify against the live page).
    """
    url = "https://www.jd.com/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    # timeout prevents the crawl from hanging forever on an unresponsive server
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # dots are escaped so "." matches a literal period, not any character
    pattern = re.compile(r"list\.jd\.com/list\.html\?cat=\d+")
    return [link["href"] for link in soup.find_all("a", href=pattern)]
```
3. 创建一个函数来获取某个大类下的所有机型链接:
```python
def get_model_links(category_link):
    """Collect product-page links from one category listing page.

    Args:
        category_link: protocol-relative category URL (``//list.jd.com/...``),
            as returned by ``get_category_links``.

    Returns:
        list[str]: hrefs matching ``item.jd.com/<id>.html`` (may contain
        duplicates; the caller is responsible for de-duplication).
    """
    url = "https:" + category_link
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    # timeout prevents the crawl from hanging forever on an unresponsive server
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # dots are escaped so "." matches a literal period, not any character
    pattern = re.compile(r"item\.jd\.com/\d+\.html")
    return [link["href"] for link in soup.find_all("a", href=pattern)]
```
4. 创建一个函数来爬取手机的价格和历史价格:
```python
def get_price(model_link):
    """Scrape current and historical price text from one product page.

    Args:
        model_link: protocol-relative product URL (``//item.jd.com/<id>.html``).

    Returns:
        tuple[str | None, str | None]: (price, history_price) text, or ``None``
        for either element whose tag is absent — JD renders prices with
        JavaScript, so the static HTML frequently lacks these nodes.
    """
    url = "https:" + model_link
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    # timeout prevents the crawl from hanging forever on an unresponsive server
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # guard against missing tags: .find() returns None when the node is absent,
    # and the original .get_text() call would raise AttributeError
    price_tag = soup.find("span", class_="p-price")
    history_tag = soup.find("div", class_="J-his-price")
    price = price_tag.get_text() if price_tag else None
    history_price = history_tag.get_text() if history_tag else None
    return price, history_price
```
5. 创建一个函数来保存数据到MySQL数据库:
```python
def save_to_mysql(data):
    """Bulk-insert scraped rows into the ``phone`` table.

    Args:
        data: iterable of ``(model, price, history_price)`` tuples.

    The connection is always closed, even when the insert fails; the commit
    only happens after every row was staged successfully.
    """
    if not data:
        # nothing to insert — avoid opening a connection for no work
        return
    conn = pymysql.connect(host="localhost", user="root", password="password", database="jd_phone")
    try:
        # the cursor context manager closes the cursor on exit
        with conn.cursor() as cursor:
            sql = "INSERT INTO phone (model, price, history_price) VALUES (%s, %s, %s)"
            cursor.executemany(sql, data)
        conn.commit()
    finally:
        # original leaked the connection when executemany raised
        conn.close()
```
6. 创建一个函数来发送邮件通知:
```python
def send_email(body="爬虫已完成", subject="爬虫通知"):
    """Send a plain-text notification email about the crawl.

    Args:
        body: message body (defaults preserve the original behavior).
        subject: message subject.

    Uses ``smtplib.SMTP`` as a context manager so the connection is closed
    even if login or sending raises — the original only reached ``quit()``
    on the success path.
    """
    msg = MIMEText(body, "plain", "utf-8")
    msg["From"] = "sender@example.com"
    msg["To"] = "receiver@example.com"
    msg["Subject"] = subject
    with smtplib.SMTP("smtp.example.com", 25) as server:
        server.login("username", "password")
        server.sendmail("sender@example.com", "receiver@example.com", msg.as_string())
```
7. 调用以上函数来完成爬取和保存数据的过程:
```python
# Driver: discover categories, fan out to product pages, persist, then notify.
category_links = get_category_links()

# Flatten all category pages into one list of product links, de-duplicated
# while preserving discovery order (dict.fromkeys keeps first occurrence) —
# the same product often appears on several listing pages.
model_links = list(dict.fromkeys(
    link
    for category_link in category_links
    for link in get_model_links(category_link)
))

data = []
for model_link in model_links:
    try:
        price, history_price = get_price(model_link)
    except requests.RequestException:
        # skip pages that fail to download instead of aborting the whole run
        continue
    data.append((model_link, price, history_price))

save_to_mysql(data)
send_email()
```