import requests from bs4 import BeautifulSoup import os headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'} total_pages = 20 # 遍历爬取页面 for i in range(2195306, 3000000): url = f"https://www.bernama.com/bm/news.php?id={i}" res = requests.get(url) soup = BeautifulSoup(res.text, "html.parser") # 获取需要写入的文本内容 div = soup.find("div", {"class": "col-12 mt-3 text-dark text-justify"}) text = soup.find("div",{"class": "col-12 col-sm-12 col-md-12 col-lg-8"},).text print(text) if not div: print(f"id{i} has no data, skipped.") else: file = f"{i}.txt" with open(file, "w", encoding="utf-8") as f: f.write(text) continue text = div.text # 保存到txt文件 print(text) # 保存到txt文件 print(f"id {i} saved successfully.") print("All pages saved successfully.") 这段代码在运行过程中出现错误:'NoneType' object has no attribute 'text'。请修改该代码,并单独写一段代码,把获取的数据分别保存到txt文件中,每个txt文件依次命名为1~250。
时间: 2024-04-23 10:26:41 浏览: 153
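出错原因:原代码在判断 div 是否为空之前,就对另一个 soup.find(...) 的结果直接调用了 .text;当页面中不存在该元素时,find 返回 None,于是抛出 'NoneType' object has no attribute 'text'。修改后先判断元素是否存在,再读取文本。修改代码如下: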
```
import requests
from bs4 import BeautifulSoup
import os
# Crawl Bernama news articles by numeric id and save each article body to
# "<id>.txt" in the current working directory. Ids whose page cannot be
# fetched, or whose page has no article container, are reported and skipped.

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
total_pages = 20  # NOTE(review): not referenced by the loop below

# Visit every article id in the range.
for i in range(2195306, 3000000):
    url = f"https://www.bernama.com/bm/news.php?id={i}"
    try:
        # The timeout keeps one stalled connection from hanging the whole
        # crawl; a failed request skips this id instead of aborting the run.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"id {i} request failed ({exc}), skipped.")
        continue
    soup = BeautifulSoup(res.text, "html.parser")

    # Find the container holding the article text. soup.find returns None
    # when it is absent, so check before touching .text.
    div = soup.find("div", {"class": "col-12 mt-3 text-dark text-justify"})
    if not div:
        print(f"id{i} has no data, skipped.")
        continue
    text = div.text

    # Save the article text to "<id>.txt".
    file = f"{i}.txt"
    with open(file, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"id {i} saved successfully.")

print("All pages saved successfully.")
```
下面是依次爬取第1~250页、并将每页获取的数据分别保存到txt文件中的代码:
```
import requests
from bs4 import BeautifulSoup
import os
# Crawl pages 1..250 of a placeholder site and save every matching <div> on
# each page to its own file named "<page>_<article>.txt" in the current
# working directory. Pages that cannot be fetched are reported and skipped.

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
total_pages = 20  # NOTE(review): not referenced by the loop below

# Visit every page number in the range.
for i in range(1, 251):
    url = f"https://www.example.com/page={i}"
    try:
        # The timeout keeps one stalled connection from hanging the whole
        # crawl; a failed request skips this page instead of aborting the run.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Page {i} request failed ({exc}), skipped.")
        continue
    soup = BeautifulSoup(res.text, "html.parser")

    # Collect every content container on the page. find_all returns an empty
    # list when nothing matches, so the inner loop simply does not run.
    divs = soup.find_all("div", {"class": "content"})
    for j, div in enumerate(divs, start=1):
        # Save each article to "<page>_<article>.txt" (articles count from 1).
        file = f"{i}_{j}.txt"
        with open(file, "w", encoding="utf-8") as f:
            f.write(div.text)
        print(f"Page {i}, article {j} saved successfully.")

print("All pages saved successfully.")
```
其中,我们假设要爬取的网站为 example.com,每页有多篇文章,每个txt文件的命名格式为“页码_文章序号.txt”,例如第1页第3篇文章的文件名为“1_3.txt”。代码中的 URL 和 class 名称 "content" 均为示例,实际使用时需替换为目标网站的真实地址和选择器。
阅读全文