from requests_html import HTMLSession import os class Spider: def init(self): self.base_url = 'https://s3-ap-northeast-1.amazonaws.com/data.binance.vision/data/spot/daily/klines' self.pair = '1INCHBTC' self.interval = '1d' self.session = HTMLSession() def get_urls(self): urls = [] # 首页 response = self.session.get(f'{self.base_url}/{self.pair}/{self.interval}/') if response.status_code == 200: for link in response.html.links: if link.endswith('.zip'): urls.append(link) # 分页 while True: response = self.session.get(response.html.links[-1]) if response.status_code != 200: # 请求失败 break for link in response.html.links: if link.endswith('.zip'): urls.append(link) if 'CHECKSUM' in response.html.links[-1]: break return urls def download_files(self): urls = self.get_urls() if not urls: print('下载失败') return if not os.path.exists('download_files'): os.mkdir('download_files') for url in urls: file_name = url.split('/')[-1] file_path = f'/Users/huyang/Desktop/2023/Grach/{file_name}' if os.path.exists(file_path): # 文件已存在 print(f'{file_name} 已存在') continue response = self.session.get(url) if response.status_code != 200: # 请求失败 print(f'{file_name} 下载失败') continue with open(file_path, 'wb') as f: f.write(response.content) print(f'{file_name} 下载成功') def run(self): self.download_files(),将对应的下载数据的网站地址改为'http://www.greenfinance.org.cn/'
时间: 2024-03-31 17:33:59 浏览: 90
requests-html:适用于人类的Pythonic HTML解析:trade_mark:
您需要更改 Spider 类中的以下属性和方法:
```python
class Spider:
def __init__(self):
self.base_url = 'http://www.greenfinance.org.cn/'
self.session = HTMLSession()
def get_urls(self):
urls = []
response = self.session.get(self.base_url)
if response.status_code == 200:
for link in response.html.links:
if link.endswith('.xls') or link.endswith('.xlsx'):
urls.append(link)
return urls
def download_files(self):
urls = self.get_urls()
if not urls:
print('下载失败')
return
if not os.path.exists('download_files'):
os.mkdir('download_files')
for url in urls:
file_name = url.split('/')[-1]
file_path = f'/Users/huyang/Desktop/2023/Grach/{file_name}'
if os.path.exists(file_path):
print(f'{file_name} 已存在')
continue
response = self.session.get(url)
if response.status_code != 200:
print(f'{file_name} 下载失败')
continue
with open(file_path, 'wb') as f:
f.write(response.content)
print(f'{file_name} 下载成功')
def run(self):
self.download_files()
```
这里的 `get_urls()` 方法会获取 http://www.greenfinance.org.cn/ 页面中所有以 `.xls` 或 `.xlsx` 结尾的链接,而 `download_files()` 方法会下载这些链接指向的文件。请注意更改 `file_path` 变量以指向您想要保存文件的位置。最后,您可以通过调用 `run()` 方法来运行该类并下载文件。
阅读全文