```python
from requests_html import HTMLSession
import os


class Spider:
    def __init__(self):
        self.base_url = 'https://s3-ap-northeast-1.amazonaws.com/data.binance.vision/data/spot/daily/klines'
        self.pair = '1INCHBTC'
        self.interval = '1d'
        self.session = HTMLSession()

    def get_urls(self):
        urls = []
        # First page of the listing
        response = self.session.get(f'{self.base_url}/{self.pair}/{self.interval}/')
        if response.status_code == 200:
            for link in response.html.links:
                if link.endswith('.zip'):
                    urls.append(link)
        # Pagination
        while True:
            response = self.session.get(response.html.links[-1])
            if response.status_code != 200:  # request failed
                break
            for link in response.html.links:
                if link.endswith('.zip'):
                    urls.append(link)
            if 'CHECKSUM' in response.html.links[-1]:
                break
        return urls

    def download_files(self):
        urls = self.get_urls()
        if not urls:
            print('Download failed')
            return
        if not os.path.exists('download_files'):
            os.mkdir('download_files')
        for url in urls:
            file_name = url.split('/')[-1]
            file_path = f'download_files/{file_name}'
            if os.path.exists(file_path):  # file already exists
                print(f'{file_name} already exists')
                continue
            response = self.session.get(url)
            if response.status_code != 200:  # request failed
                print(f'{file_name} download failed')
                continue
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f'{file_name} downloaded successfully')

    def run(self):
        self.download_files()
```
This is a Python scraper that downloads daily-interval (1d) kline data for the 1INCHBTC pair from Binance's public data site. It uses the requests_html library for HTTP requests and HTML parsing. In the Spider class, the constructor __init__ defines the basic parameters the program needs: the Binance data download base URL, the trading pair, the interval, and an HTTP session. The get_urls method collects the list of download links for the data files, and download_files downloads those files, skipping any file that already exists locally. Finally, the run method calls download_files to execute the whole program.
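One caveat worth flagging: in requests_html, `response.html.links` returns a set, so the `response.html.links[-1]` pagination probe in the code above would raise a `TypeError` at runtime. A minimal sketch of a safer probe, sorting the set into an indexable list first (illustrative, not part of the original code; the URL is the listing page used above):

```python
from requests_html import HTMLSession

session = HTMLSession()
response = session.get(
    'https://s3-ap-northeast-1.amazonaws.com/data.binance.vision'
    '/data/spot/daily/klines/1INCHBTC/1d/'
)
# html.links is a set in requests_html and cannot be indexed directly.
# Sorting it produces a deterministic list, whose last element can stand
# in for the `links[-1]` pagination probe.
links = sorted(response.html.links)
next_url = links[-1] if links else None
```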
Related question
Explain this code: the same `Spider` class as above, except that `download_files` writes each archive to `file_path = f'/Users/huyang/Desktop/2023/Grach/{file_name}'` instead of the `download_files` directory.
This code defines a class named Spider containing methods for collecting URLs and downloading files. The initializer sets the basic parameters such as base_url, pair, and interval. The get_urls() method gathers the file download links: it first requests the URL formed by joining base_url with the pair and interval parameters, extracts every link ending in ".zip" from the response HTML, and appends them to the urls list. If the listing is paginated, it keeps requesting the next page until it reaches the last page or a request fails. The download_files() method performs the downloads: it calls get_urls() to obtain the link list, creates a local download_files directory if one does not already exist, then iterates over the links, saving each file to the target path and printing the result of each download. Finally, run() calls download_files() to execute the task.
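Because each archive is written with a single `response.content` read, large files are buffered entirely in memory. A minimal streamed-download sketch using plain `requests` (the helper name, chunk size, and timeout are illustrative assumptions, not part of the original code):

```python
import os

import requests


def download_file(url: str, dest_dir: str = 'download_files', timeout: int = 30) -> str:
    """Stream `url` into `dest_dir`, skipping files that already exist."""
    os.makedirs(dest_dir, exist_ok=True)
    file_path = os.path.join(dest_dir, url.split('/')[-1])
    if os.path.exists(file_path):
        return file_path  # already downloaded; skip
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of failing silently
    with open(file_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    return file_path
```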
Take the same Spider class (the variant that saves to /Users/huyang/Desktop/2023/Grach/) and change the website address used for downloading the data to 'http://www.greenfinance.org.cn/'.
You need to change the following attributes and methods of the Spider class:
```python
from requests_html import HTMLSession
import os


class Spider:
    def __init__(self):
        self.base_url = 'http://www.greenfinance.org.cn/'
        self.session = HTMLSession()

    def get_urls(self):
        urls = []
        response = self.session.get(self.base_url)
        if response.status_code == 200:
            # absolute_links resolves relative hrefs against the page URL,
            # so the collected URLs can be fetched directly.
            for link in response.html.absolute_links:
                if link.endswith('.xls') or link.endswith('.xlsx'):
                    urls.append(link)
        return urls

    def download_files(self):
        urls = self.get_urls()
        if not urls:
            print('Download failed')
            return
        if not os.path.exists('download_files'):
            os.mkdir('download_files')
        for url in urls:
            file_name = url.split('/')[-1]
            # Save into the directory created above; adjust as needed.
            file_path = f'download_files/{file_name}'
            if os.path.exists(file_path):
                print(f'{file_name} already exists')
                continue
            response = self.session.get(url)
            if response.status_code != 200:
                print(f'{file_name} download failed')
                continue
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f'{file_name} downloaded successfully')

    def run(self):
        self.download_files()
```
Here, the `get_urls()` method collects every link on http://www.greenfinance.org.cn/ that ends in `.xls` or `.xlsx`, and `download_files()` downloads the files those links point to. Note that `file_path` (here pointing into the local `download_files` directory) should be changed if you want the files saved elsewhere. Finally, call the `run()` method to run the class and download the files.
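A minimal usage sketch, assuming the class above is defined in the current module and `requests_html` is installed:

```python
if __name__ == '__main__':
    spider = Spider()
    spider.run()  # collect the .xls/.xlsx links, then download each file
```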