写一个Python爬虫爬取喜马拉雅
时间: 2023-12-31 10:22:37 浏览: 155
Python 制作爬取喜马拉雅程序源码
以下是一个简单的Python爬虫示例,用于爬取喜马拉雅音频数据:
```python
import requests
from bs4 import BeautifulSoup
import json
import csv
# Request headers passed to requests.get() by get_html; the User-Agent
# string presents the crawler as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The response text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # The exception class is reached through the already-imported
        # ``requests`` module; the bare name ``RequestException`` is not
        # imported in this file and would raise NameError here.
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Extract the audio entries found in one page of markup.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) is accepted.

    Returns:
        A list of dicts, one per audio entry, with the keys ``title``,
        ``play_count`` and ``duration``. The list is empty when ``html``
        is falsy or no entry matches. Entries missing any of the three
        fields are skipped instead of raising.
    """
    if not html:
        # Nothing to parse; avoids handing None to BeautifulSoup.
        return []

    soup = BeautifulSoup(html, 'lxml')
    data_list = []
    for audio in soup.find_all('div', class_='sound-list _is'):
        title_tag = audio.find('a', class_='title')
        play_tag = audio.find('span', class_='nb')
        duration_tag = audio.find('span', class_='duration')
        # find() returns None when the element is absent; compare with
        # ``is None`` explicitly rather than relying on tag truthiness.
        if title_tag is None or play_tag is None or duration_tag is None:
            continue
        title = title_tag.get('title')
        if title is None:
            # The anchor exists but carries no ``title`` attribute.
            continue
        data_list.append({
            'title': title,
            'play_count': play_tag.get_text(),
            'duration': duration_tag.get_text(),
        })
    return data_list
def crawl_data(pages=range(1, 11)):
    """Fetch and parse a series of numbered listing pages.

    Args:
        pages: Iterable of page numbers substituted into the listing URL.
            Defaults to pages 1 through 10.

    Returns:
        A single list with the parsed entries of every page that could be
        downloaded, in page order. Pages whose download failed are
        skipped.
    """
    data_list = []
    for i in pages:
        url = 'https://www.ximalaya.com/yinyue/{}/'.format(i)
        html = get_html(url)
        if html is None:
            # get_html signals a failed or non-200 request with None;
            # skip the page rather than passing None to the parser.
            continue
        data_list += parse_html(html)
    return data_list
def save_json(data_list):
    """Write ``data_list`` to ``data_json.json`` in the working directory.

    The file is UTF-8 encoded, indented by four spaces, and keeps
    non-ASCII characters readable instead of escaping them.

    Args:
        data_list: JSON-serialisable data, normally the list of entry
            dicts produced by the crawl.
    """
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    # Report success only after the file has been closed.
    print('json文件保存成功')
def save_csv(data_list):
    """Write ``data_list`` to ``data_csv.csv`` in the working directory.

    The column names are taken from the keys of the first row, so every
    row is expected to share those keys.

    Args:
        data_list: List of dicts, one per CSV row. When it is empty there
            is no row to derive a header from, so no file is written.
    """
    if not data_list:
        # Indexing data_list[0] on an empty result would raise IndexError.
        print('没有数据,未生成csv文件')
        return

    # Header row
    fieldnames = list(data_list[0].keys())
    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        # Write all rows in one call
        writer.writerows(data_list)
    print('csv文件保存成功')
if __name__ == '__main__':
    # Crawl once, then persist the same result set in both formats.
    data_list = crawl_data()
    save_json(data_list)
    save_csv(data_list)
```
阅读全文