```
import requests
from bs4 import BeautifulSoup
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
total_pages = 20

# Crawl each page by id
for i in range(2195306, 3000000):
    url = f"https://www.bernama.com/bm/news.php?id={i}"
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    # Extract the text content to write out
    div = soup.find("div", {"class": "col-12 col-sm-12 col-md-12 col-lg-8"})
    if not div:
        print(f"id {i} has no data, skipped.")
        continue
    text = div.text
    # Save to a txt file
    file = str(i) + ".txt"
    with open(file, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"id {i} saved successfully.")

print("All pages saved successfully.")
```
Modify this code so that if the `div` does not exist, the page is skipped and no txt file is written; the next page that does contain the `div` is written instead. Name the txt files 1 through 500, numbered consecutively with no gaps. Also add code so that after every 50 sites crawled, the script rests for 10 seconds.
You can modify the code as follows:
```
import requests
from bs4 import BeautifulSoup
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

saved_count = 0    # files written so far; also the next file's name
request_count = 0  # pages fetched since the script started

for i in range(2195306, 3000000):
    url = f"https://www.bernama.com/bm/news.php?id={i}"
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    # Rest for 10 seconds after every 50 sites crawled
    request_count += 1
    if request_count % 50 == 0:
        time.sleep(10)

    div = soup.find("div", {"class": "col-12 col-sm-12 col-md-12 col-lg-8"})
    if not div:
        # No article body on this page: skip the id without writing a file
        continue

    # Name files 1.txt, 2.txt, ... consecutively, with no gaps
    saved_count += 1
    with open(f"{saved_count}.txt", "w", encoding="utf-8") as f:
        f.write(div.text)
    print(f"id {i} saved as {saved_count}.txt")

    if saved_count >= 500:
        break

print("All pages saved successfully.")
```
Here a single loop walks through the ids. Whenever the target `div` exists, its text is written to a txt file named by the running counter `saved_count`, so the files are numbered 1 through 500 consecutively; pages without the `div` are skipped and do not consume a file name. A second counter, `request_count`, tracks how many pages have been fetched, and the script sleeps for 10 seconds after every 50 requests. Once 500 files have been written, the loop exits.
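As a side note, a sweep over this many ids tends to hit the occasional timeout or connection reset, which would crash the loop above. A minimal sketch of a more defensive fetch step, assuming a hypothetical `fetch_page` helper and an arbitrary 10-second timeout (neither comes from the original question):
```
import requests

def fetch_page(url, headers, timeout=10):
    # Return the page HTML, or None on any network/HTTP error.
    # The timeout value and the skip-on-failure policy are assumptions,
    # not part of the original question.
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        res.raise_for_status()  # treat 4xx/5xx status codes as failures
        return res.text
    except requests.exceptions.RequestException:
        return None
```
In the main loop, `res = requests.get(url, headers=headers)` would then become `html = fetch_page(url, headers)`, with a `continue` whenever it returns `None`.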