import requests
from bs4 import BeautifulSoup
import os  # NOTE(review): unused in this script, kept in case other code relies on it

# Browser-like User-Agent so the site serves the normal HTML page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

total_pages = 20  # NOTE(review): unused — the loop below iterates by article id, not by page

# Crawl article pages by sequential numeric id.
for i in range(2195306, 3000000):
    url = f"https://www.bernama.com/bm/news.php?id={i}"
    try:
        # timeout keeps one dead connection from hanging the whole crawl;
        # raise_for_status turns 404/5xx into an exception we can skip past.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"id {i} request failed ({e}), skipped.")
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    # The article body lives in this column div; a page without it has no article.
    div = soup.find("div", {"class": "col-12 col-sm-12 col-md-12 col-lg-8"})
    if not div:
        print(f"id{i} has no data, skipped.")
        continue

    # Save the article text to <id>.txt in the current directory.
    with open(f"{i}.txt", "w", encoding="utf-8") as f:
        f.write(div.text)
    print(f"id {i} saved successfully.")

print("All pages saved successfully.")
修改这段代码,设置如果不存在div就跳过,不需要写进txt代码
时间: 2024-04-23 16:25:11 浏览: 109
import re
import requests
from bs4 import BeautifulSoup
import time
可以将代码中的以下部分:
```
if not div:
print(f"id{i} has no data, skipped.")
continue
```
修改为:
```
# Only write a file when the article div exists; otherwise report and move on.
div = soup.find("div", {"class": "col-12 col-sm-12 col-md-12 col-lg-8"})
if div:
    text = div.text
    # Save the article text to <id>.txt
    with open(f"{i}.txt", "w", encoding="utf-8") as f:
        f.write(text)
    print(f"id {i} saved successfully.")
else:
    print(f"id{i} has no data, skipped.")
    # NOTE(review): a trailing `continue` here (end of the loop body) would be redundant
```
这样就可以在不存在`div`的情况下直接跳过,不需要将空文件写入txt了。
阅读全文