Complete Python implementation with exception handling, Weibo account login, and proxy support for more stable and efficient crawling: crawl information on 1,000 Weibo users (their posts, followings, and followers), clean the crawled content and extract user features, cluster the 1,000 users on those features, build a user profile for each cluster (visualized as word clouds), and recommend topics based on recent Weibo hot topics.
Sina Weibo User Data Crawling (Python Implementation)
Below is a complete Python example that adds exception handling, Weibo account login, and proxy support:
```python
import time
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Log in to a Weibo account.
# NOTE: this simplified POST against the mobile SSO endpoint is a sketch;
# a 200 status code alone does not guarantee a successful login.
def login_weibo(username, password):
    login_url = 'https://passport.weibo.cn/sso/login'
    data = {
        'username': username,
        'password': password,
        'savestate': 1,
        'r': '',
        'ec': 0,
        'pagerefer': '',
        'entry': 'mweibo',
        'wentry': '',
        'loginfrom': '',
        'client_id': '',
        'code': '',
        'qq': '',
        'hff': '',
        'hfp': ''
    }
    session = requests.Session()
    response = session.post(login_url, data=data)
    if response.status_code == 200:
        print('Login succeeded!')
        return session
    print('Login failed!')
    return None
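
# In practice Weibo's SSO flow adds parameter encryption and captcha checks,
# so the simple POST above may not authenticate. A common hedge is to log in
# once in a browser and reuse its cookie (the value below is hypothetical):
#   session.headers['Cookie'] = 'SUB=...; SUBP=...'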

# Crawl a single user's information.
def crawl_user_info(session, user_id):
    url = f'https://weibo.com/u/{user_id}'
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Failed to crawl user {user_id}: {e}')
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    # Parse the user's posts, followings, and followers.
    # NOTE: these CSS selectors are placeholders; weibo.com renders profiles
    # via JavaScript and keeps follow lists on separate pages, so adapt the
    # selectors to the HTML you actually receive.
    weibo_content = [weibo.text for weibo in soup.select('.WB_text')]
    followings = [following.text for following in soup.select('.follow_item_name')]
    followers = [follower.text for follower in soup.select('.follow_item_name')]
    return {
        'user_id': user_id,
        'weibo_content': weibo_content,
        'followings': followings,
        'followers': followers
    }

# Crawl through a proxy.
def crawl_with_proxy(session, user_id, proxy):
    proxies = {
        'http': proxy,
        'https': proxy
    }
    session.proxies.update(proxies)
    user_info = crawl_user_info(session, user_id)
    # Clear the proxy settings afterwards.
    session.proxies = {}
    return user_info

# Crawl information for 1,000 users.
def crawl_user_profiles(username, password):
    session = login_weibo(username, password)
    if session is None:
        return []
    user_info_list = []
    proxy_list = ['proxy1', 'proxy2', 'proxy3']  # placeholder proxies, e.g. 'http://host:port'
    # NOTE: real Weibo user IDs are not sequential integers; replace this
    # range with a list of actual user IDs.
    for user_id in range(1, 1001):
        try:
            user_info = crawl_with_proxy(session, user_id, proxy_list[user_id % len(proxy_list)])
            if user_info is not None:
                user_info_list.append(user_info)
        except Exception as e:
            print(f'Failed to crawl user {user_id}: {e}')
        time.sleep(1)  # throttle requests to reduce the risk of being blocked
    return user_info_list
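
# Optional convenience sketch: persist the raw crawl results so the cleaning
# and clustering steps below can be re-run without re-crawling.
def save_user_info(user_info_list, path='user_info.json'):
    import json
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(user_info_list, f, ensure_ascii=False)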

# Clean the data and extract user features.
# NOTE: TfidfVectorizer's default tokenizer does not segment Chinese text;
# see the jieba-based sketch after this code block.
def clean_and_extract_features(user_info_list):
    corpus = []
    for user_info in user_info_list:
        weibo_content = ' '.join(user_info['weibo_content'])
        corpus.append(weibo_content)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    return X
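
# A hedged cleaning helper: weibo text is full of URLs, @mentions, and
# hashtag markers that pollute TF-IDF features; this strips the obvious noise.
import re

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)  # drop URLs
    text = re.sub(r'@\S+', '', text)          # drop @mentions
    text = re.sub(r'#([^#]*)#', r'\1', text)  # keep hashtag text, drop the ## markers
    return text.strip()
# To use it, wrap each post inside clean_and_extract_features:
#   weibo_content = ' '.join(clean_text(w) for w in user_info['weibo_content'])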

# Cluster analysis with K-means (fixed seed for reproducibility).
def perform_clustering(X, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(X)
    return kmeans
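
# The cluster count is a free parameter; one hedged way to pick it is the
# silhouette score (closer to 1 means tighter, better-separated clusters).
def choose_n_clusters(X, candidates=(2, 3, 4, 5, 6, 8, 10)):
    from sklearn.metrics import silhouette_score
    best_k, best_score = candidates[0], -1.0
    for k in candidates:
        labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k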

# Group users by cluster to build per-cluster profiles.
def build_user_profiles(user_info_list, kmeans, n_clusters):
    user_profiles = [[] for _ in range(n_clusters)]
    for i, user_info in enumerate(user_info_list):
        cluster_label = kmeans.labels_[i]
        user_profiles[cluster_label].append(user_info)
    return user_profiles

# Generate a word cloud for each cluster.
def generate_wordclouds(user_profiles):
    for i, user_profile in enumerate(user_profiles):
        text = ' '.join([weibo_content for user_info in user_profile for weibo_content in user_info['weibo_content']])
        # NOTE: for Chinese text, pass font_path pointing at a CJK font,
        # e.g. WordCloud(font_path='simhei.ttf', ...), or the characters
        # will render as empty boxes.
        wordcloud = WordCloud(background_color='white').generate(text)
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Cluster {i+1} User Profile')
        plt.show()

# Topic recommendation: rank the users in each cluster by how often
# their posts mention the current hot topics.
def recommend_topics(user_profiles, hot_topics):
    for i, user_profile in enumerate(user_profiles):
        topic_scores = []
        for user_info in user_profile:
            weibo_content = ' '.join(user_info['weibo_content'])
            score = sum(weibo_content.count(topic) for topic in hot_topics)
            topic_scores.append(score)
        top_user_indices = sorted(range(len(topic_scores)), key=lambda k: topic_scores[k], reverse=True)[:5]
        top_users = [user_profile[index] for index in top_user_indices]
        print(f'Top users in Cluster {i+1}:')
        for user_info in top_users:
            print(f"User ID: {user_info['user_id']}, Weibo Content: {user_info['weibo_content'][:10]}")

# Main entry point.
def main():
    # Weibo username and password
    username = 'your_username'
    password = 'your_password'
    # Recent hot topics (placeholder list)
    hot_topics = ['topic1', 'topic2', 'topic3']
    # Crawl user information
    user_info_list = crawl_user_profiles(username, password)
    # Clean the data and extract user features
    X = clean_and_extract_features(user_info_list)
    # Cluster analysis
    n_clusters = 5  # number of clusters
    kmeans = perform_clustering(X, n_clusters)
    # Build per-cluster user profiles
    user_profiles = build_user_profiles(user_info_list, kmeans, n_clusters)
    # Generate word clouds
    generate_wordclouds(user_profiles)
    # Topic recommendation
    recommend_topics(user_profiles, hot_topics)

# Run the script
if __name__ == '__main__':
    main()
```
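One caveat worth flagging: TfidfVectorizer's default token pattern splits on word boundaries and does not segment Chinese, so feature extraction over raw weibo text will be poor. A minimal fix, assuming the jieba segmentation library is installed (any Chinese segmenter works):
```python
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

def chinese_tokenizer(text):
    # Segment Chinese text into words and drop single-character tokens,
    # which are mostly stop-word noise.
    return [w for w in jieba.lcut(text) if len(w) > 1]

# Drop-in replacement for the vectorizer in clean_and_extract_features:
vectorizer = TfidfVectorizer(tokenizer=chinese_tokenizer, max_features=5000)
```
Since jieba.lcut returns a plain list of strings, it plugs straight into scikit-learn's tokenizer hook with no further adaptation.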
Note: replace the following part of the code with your own Weibo account credentials:
```python
# Weibo username and password
username = 'your_username'
password = 'your_password'
```
You also need to replace the proxy list and the hot-topic list with real values.
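If you want to populate hot_topics programmatically, one hedged option is scraping Weibo's hot-search summary page. The page requires a logged-in cookie, and the 'td.td-02 a' selector below is an assumption based on its historical markup, so verify it against the live HTML:
```python
import requests
from bs4 import BeautifulSoup

def fetch_hot_topics(cookie, limit=10):
    # Fetch the hot-search summary page with a browser cookie and pull
    # the topic titles out of the ranking table.
    headers = {'User-Agent': 'Mozilla/5.0', 'Cookie': cookie}
    resp = requests.get('https://s.weibo.com/top/summary', headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    return [a.get_text(strip=True) for a in soup.select('td.td-02 a')][:limit]
```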