```
import requests
from bs4 import BeautifulSoup
import os
import time
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15'}
page_count = 0
for page_num in range(1,10000):
    for i in range(3579989,4580785):
        url = f" https://www.antaranews.com/berita/{i}/sekjen-puji-indonesia-selenggarakan-ktt-ke-42-asean-dengan-baik?utm_source=antaranews&utm_medium=desktop&utm_campaign=menu_news"
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, "html.parser")
        div = soup.find("div", {"class": "col-md-8"})
        if not div:
            continue
        text = div.text
        file = f"{page_count + 1}.txt"
        with open(file, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"{i} saved successfully.")
        page_count += 1
        if page_count >= 500:
            break
    if page_count >= 500:
        break
    time.sleep(15)
print("All pages saved successfully.")
```
Check this code and fix it.
This code is meant to crawl articles from a news site and save them locally, but it has a few problems that need fixing.
1. The `import requests` and `from bs4 import BeautifulSoup` statements (and the other imports) must each be written on their own line.
2. The URL in the `url` f-string starts with a stray space, which must be removed.
3. The second argument to `range` should be 4580786, because `range` excludes its upper bound, so article 4580785 would otherwise be skipped.
4. `time.sleep(15)` should sit inside the inner loop so there is a pause between individual requests; as written it only runs once per pass of the outer loop, after the inner loop has finished.
5. The request and parsing steps should be wrapped in exception handling so a network error does not abort the whole run.
The corrected code is as follows:
```
import requests
from bs4 import BeautifulSoup
import os
import time

# Browser-like User-Agent so the site does not reject the requests outright
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15'}
page_count = 0

for page_num in range(1, 10000):
    for i in range(3579989, 4580786):  # upper bound is exclusive, so 4580786 includes article 4580785
        url = f"https://www.antaranews.com/berita/{i}/sekjen-puji-indonesia-selenggarakan-ktt-ke-42-asean-dengan-baik?utm_source=antaranews&utm_medium=desktop&utm_campaign=menu_news"
        try:
            res = requests.get(url, headers=headers)
            soup = BeautifulSoup(res.text, "html.parser")
            div = soup.find("div", {"class": "col-md-8"})
            if not div:  # skip pages without the article container
                continue
            text = div.text
            file = f"{page_count + 1}.txt"  # number the output files sequentially
            with open(file, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"{i} saved successfully.")
            page_count += 1
            if page_count >= 500:  # stop after 500 articles
                break
            time.sleep(15)  # pause between requests to avoid hammering the server
        except Exception as e:
            print(f"Error occurred: {e}")
            continue
    if page_count >= 500:
        break
print("All pages saved successfully.")
```
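As a possible further hardening (not part of the original fix), the request itself can be given a timeout and automatic retries, so a single flaky connection neither hangs the script nor gets counted as a permanent failure. The sketch below assumes the same `url` and `headers` as above; the `fetch` helper and the retry settings (3 attempts, exponential backoff, retry on common 429/5xx statuses) are illustrative choices, not requirements.

```
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Session that automatically retries transient failures (illustrative settings)
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retry))

def fetch(url, headers):
    """Fetch a page with a 10-second timeout; return None on failure."""
    try:
        res = session.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        return res.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
```

Inside the loop, `res = requests.get(url, headers=headers)` would then become `html = fetch(url, headers)`, with a `continue` when it returns `None`.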