将下列代码改为对上海证券交易所网站公告爬取:

```python
from concurrent.futures import ThreadPoolExecutor
import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.159 Safari/537.36"
}


def download_pdf(url, code, num, date):
    print(f'开始下载 data/{code}_{date}_{num}.pdf')
    resp = requests.get(url, headers=headers)
    with open(f'E:/深交所pdf/{code}_{date}_{num}.pdf', 'wb') as f:
        f.write(resp.content)
    resp.close()
    print(f'E:/深交所pdf/{code}_{date}_{num}.pdf 下载完毕!')


if __name__ == '__main__':
    domain = 'http://www.sse.cn'
    with ThreadPoolExecutor(30) as t:
        with open('target.csv', 'r') as f:
            lines = f.readlines()
        for line in lines:
            param = list(line.split())
            form = {
                'seDate': [param[3], param[3]],
                'stock': [param[0]],
                'channelCode': ['listedNotice_disc'],
                'pageSize': '50',
                'pageNum': '1'
            }
            # 获取文件列表的url
            get_file_list_url = 'http://www.sse.com.cn/disclosure/listedinfo/announcement/json/announce_type.json?v=0.9715488799747511'
            resp = requests.post(get_file_list_url, headers=headers, json=form)
            # resp.encoding = 'utf-8'
            # print(resp.json())
            js = resp.json()
            resp.close()
            tot = 0
            for data in js['data']:
                tot += 1
                download_url = domain + f'/api/disc/info/download?id={data["id"]}'
                t.submit(download_pdf, url=download_url, code=param[0], num=tot, date=param[3])
    print("下载完毕!!!")

# doc_id = ''
# download_url = domain + f'/api/disc/info/download?id={"c998875f-9097-403e-a682-cd0147ce10ae"}'
# resp = requests.get(download_url, headers=headers)
# with open(f'{"c998875f-9097-403e-a682-cd0147ce10ae"}.pdf', 'wb') as f:
#     f.write(resp.content)
# resp.close()
```
时间: 2024-02-15 08:27:41 浏览: 114
以下是修改后的代码:
```python
from concurrent.futures import ThreadPoolExecutor
import requests
import json
# Request headers shared by every HTTP call in this script; only a desktop
# Chrome user-agent string is sent.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.159 Safari/537.36"
    ),
}
def download_pdf(url, code, num, date):
    """Download one announcement PDF and store it under ``E:/上交所pdf/``.

    The file is saved as ``{code}_{date}_{num}.pdf``.

    Args:
        url: Download URL of the PDF.
        code: Stock code; first component of the file name.
        num: Sequence number of the announcement; last component of the
            file name.
        date: Date string; middle component of the file name.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the target directory or file cannot be written.
    """
    import os  # local import: the file's import block does not provide ``os``

    target = f'E:/上交所pdf/{code}_{date}_{num}.pdf'
    print(f'开始下载 data/{code}_{date}_{num}.pdf')
    # Using the response as a context manager releases the connection even
    # when the status check or the file write raises.  The timeout keeps a
    # stalled server from blocking a worker thread forever (30 s is an
    # arbitrary but generous bound).
    with requests.get(url, headers=headers, timeout=30) as resp:
        # Fail loudly instead of saving an HTML error page as a ".pdf".
        resp.raise_for_status()
        # Safe under concurrent calls thanks to exist_ok=True.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'wb') as f:
            f.write(resp.content)
    print(f'E:/上交所pdf/{code}_{date}_{num}.pdf 下载完毕!')
def parse_jsonp(text):
    """Decode the JSON document carried by a JSONP (or plain JSON) body.

    The list request below passes a ``jsonCallBack`` query parameter, and the
    body is expected to look like ``callbackName({...})``.  The wrapper is
    located by its parentheses rather than by a fixed character offset, so
    parsing keeps working if the callback name changes length or a trailing
    ``;`` is present.

    Args:
        text: Raw response body.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the body is neither JSON nor a ``name(...)`` wrapper,
            or the wrapped payload is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    body = text.strip()
    if body.startswith(('{', '[')):
        # Already plain JSON: nothing to unwrap.
        return json.loads(body)
    start = body.find('(')
    end = body.rfind(')')
    if start == -1 or end < start:
        raise ValueError('response is neither JSON nor JSONP')
    return json.loads(body[start + 1:end])


if __name__ == '__main__':
    domain = 'http://www.sse.com.cn'
    # URL of the announcement-list query.  Built from adjacent literals
    # instead of backslash continuations inside a single literal, so source
    # indentation can never leak into the URL.  It is loop-invariant, hence
    # defined once up front.
    get_file_list_url = (
        'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback93309&'
        'sqlId=COMMON_SSE_XXPL_YJPL_LB&isPagination=true&pageSize=25&pageHelp.pageSize=25&'
        'pageHelp.pageCount=50&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&'
        'pageHelp.endPage=5&_=1622048792127'
    )

    with open('target.csv', 'r') as f:
        lines = f.readlines()

    futures = []
    with ThreadPoolExecutor(30) as t:
        for line in lines:
            # Fields are whitespace-separated; column 0 is used as the stock
            # code and column 3 as the date.
            param = line.split()
            if len(param) < 4:
                # Blank or truncated line: there is nothing to query.
                continue
            code, date = param[0], param[3]
            form = {
                'productId': code,
                'startDate': date,
                'endDate': date,
                'reportType2': '',
                'reportType': ['DQGG', 'YJYG', 'YJYGL', 'GGLL', 'GGSA'],
                'pageHelp.pageSize': '25',
                'pageHelp.pageCount': '50',
                'pageHelp.pageNo': '1',
                'pageHelp.beginPage': '1',
                'pageHelp.cacheSize': '1',
                'random': '0.006159660767440475'
            }
            try:
                # The context manager closes the response on every path.
                with requests.get(get_file_list_url, headers=headers,
                                  params=form, timeout=30) as resp:
                    resp.raise_for_status()
                    resp.encoding = 'utf-8'
                    js = parse_jsonp(resp.text)
            except (requests.RequestException, ValueError) as exc:
                # One failed query must not abort the remaining lines.
                print(f'获取公告列表失败 {code} {date}: {exc}')
                continue

            # A missing or null "result" is treated as "no announcements".
            items = js.get('result') if isinstance(js, dict) else None
            # NOTE(review): assumes every item carries "URL" and "UUID" keys
            # and that this download path is valid -- confirm against the
            # live API.
            for tot, data in enumerate(items or [], start=1):
                download_url = domain + f'/query/download?filename={data["URL"]}&filetype=pdf&uuid={data["UUID"]}'
                futures.append(
                    t.submit(download_pdf, url=download_url, code=code, num=tot, date=date)
                )

    # Leaving the executor block waits for all downloads, so every future is
    # finished here.  Exceptions raised inside workers are otherwise dropped
    # silently, so surface them before announcing completion.
    for future in futures:
        exc = future.exception()
        if exc is not None:
            print(f'下载失败: {exc}')
    print("下载完毕!!!")
```
修改的主要内容如下:
1. 将爬取目标改为上海证券交易所网站的公告(域名改为 `http://www.sse.com.cn`);
2. 修改了获取文件列表的方式:改用 GET 请求访问 `query.sse.com.cn` 的 `commonSoaQuery.do` 接口,查询条件通过 `params` 传入,并在解析 JSON 之前去掉返回内容外层的 JSONP 回调包装;
3. 修改了下载文件的链接,使其与上海证券交易所网站公告的链接一致;
4. 将保存目录和下载完成提示中的"深交所pdf"改为"上交所pdf"。
阅读全文