爬取微博数据的 Python 代码
时间: 2023-10-01 12:08:13 浏览: 24
爬取微博数据需要登录微博账号,因此需要使用第三方库模拟登录。以下是一个爬取微博用户发布的所有微博数据的 Python 代码示例:
```python
# 导入必要的库
import requests
from lxml import etree
import time
import json
# Log in to Weibo so that `session` carries the cookies set by the login
# response; the rest of the script reuses this session for its requests.
# NOTE: the credentials below are placeholders. Fill them in locally and do
# not commit real account data to version control.
username = 'your_username'
password = 'your_password'
login_url = 'https://passport.weibo.cn/sso/login'

# requests never times out on its own, so a stalled connection would hang the
# script forever without an explicit limit (seconds).
LOGIN_TIMEOUT = 10

# `requests.Session()` is the supported constructor; `requests.session()` is
# only kept as a backwards-compatibility alias.
session = requests.Session()

# Form fields posted to the SSO endpoint.
login_data = {
    'username': username,
    'password': password,
    'savestate': 1,
    'ec': 0,
    'pagerefer': '',
    'entry': 'mweibo',
    'wentry': '',
    'loginfrom': '',
    'client_id': '',
    'code': '',
    'qq': '',
    'mainpageflag': 1,
    'hff': '',
    'hfp': '',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Referer': 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fm.weibo.cn%2F',
}

try:
    login_response = session.post(
        login_url,
        data=login_data,
        headers=headers,
        timeout=LOGIN_TIMEOUT,
    )
except requests.RequestException:
    # Connection error, DNS failure, timeout, ...: report it as a failed login
    # instead of aborting with a traceback.
    login_response = None

login_ok = login_response is not None and login_response.status_code == 200
if login_ok:
    # HTTP 200 only proves the request was answered, not that the credentials
    # were accepted.
    # NOTE(review): this endpoint is commonly observed to reply with a JSON
    # body whose `retcode` is 20000000 on success and another code on
    # rejection -- confirm against a live response. The check is deliberately
    # conservative: only an explicit, different retcode turns the result into
    # a failure.
    try:
        login_payload = login_response.json()
    except ValueError:
        login_payload = None
    if isinstance(login_payload, dict) and 'retcode' in login_payload:
        login_ok = str(login_payload['retcode']) == '20000000'

if login_ok:
    print('登录成功')
else:
    print('登录失败')
# Crawl every post published by one Weibo user.
# Relies on `session`, the requests session created by the login step earlier
# in this script.
user_id = '123456789'  # ID of the Weibo user to crawl
weibo_data_url = 'https://m.weibo.cn/profile/info?uid=' + user_id
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Referer': 'https://m.weibo.cn/',
}

# requests never times out on its own; give every call an explicit limit
# (seconds) so a stalled connection cannot hang the crawl.
CRAWL_TIMEOUT = 10
# Page size the pagination arithmetic is based on.
# TODO(review): confirm the real page size of the statuses endpoint.
POSTS_PER_PAGE = 10

# Step 1: fetch the profile summary (nickname and total number of posts).
try:
    response = session.get(weibo_data_url, headers=headers, timeout=CRAWL_TIMEOUT)
except requests.RequestException:
    response = None

data = None
if response is not None and response.status_code == 200:
    try:
        data = json.loads(response.text)['data']
    except (ValueError, KeyError, TypeError):
        # Not JSON, or JSON without a "data" member (for example an error or
        # login page): handled below as a failed crawl.
        data = None

profile_ok = (
    isinstance(data, dict)
    and 'screen_name' in data
    and 'statuses_count' in data
)

if not profile_ok:
    print('爬取失败')
else:
    screen_name = data['screen_name']  # nickname of the user
    statuses_count = data['statuses_count']  # total number of posts
    print('微博用户:', screen_name)
    print('微博数:', statuses_count)

    # Ceiling division: the previous `int(count / 10) + 2` bound requested one
    # page too many whenever the count was a multiple of the page size, and
    # still requested page 1 for an account with zero posts.
    page_count = -(-int(statuses_count) // POSTS_PER_PAGE)

    # Step 2: walk the post pages one by one.
    for i in range(1, page_count + 1):
        weibo_url = 'https://m.weibo.cn/profile/statuses?uid=' + user_id + '&page=' + str(i)
        # A separate name keeps the profile response above from being
        # overwritten by each page response.
        try:
            page_response = session.get(weibo_url, headers=headers, timeout=CRAWL_TIMEOUT)
        except requests.RequestException:
            page_response = None

        if page_response is not None and page_response.status_code == 200:
            body = page_response.text
            # Skip parsing for an empty body; also guard against the parser
            # not producing a document tree.
            html = etree.HTML(body) if body.strip() else None
            cards = html.xpath('//div[@class="card m-panel card9"]') if html is not None else []
            for element in cards:
                mids = element.xpath('./@data-mid')
                if not mids:
                    continue  # card without the attribute: nothing to decode
                # NOTE(review): the code expects `data-mid` to hold a JSON
                # object describing the post. If it is only a plain id, every
                # card is skipped here and the extraction must be reworked
                # against the live page.
                try:
                    mblog = json.loads(mids[0])
                except ValueError:
                    continue
                if not isinstance(mblog, dict):
                    continue
                # `.get` keeps one incomplete post from aborting the crawl;
                # a missing field is printed as None.
                created_at = mblog.get('created_at')  # publication time
                text = mblog.get('text')  # post body
                reposts_count = mblog.get('reposts_count')  # number of reposts
                comments_count = mblog.get('comments_count')  # number of comments
                attitudes_count = mblog.get('attitudes_count')  # number of likes
                print(created_at, text, reposts_count, comments_count, attitudes_count)
        else:
            # Report a page that could not be fetched instead of dropping it
            # silently.
            print('爬取失败')

        time.sleep(2)  # throttle requests so the account is not banned for crawling too fast
```
注:以上代码仅供参考,具体实现需要根据实际情况进行调整。同时,需要注意爬取微博数据涉及到个人隐私和法律问题,请勿将爬取结果用于非法用途。
阅读全文