```
import requests
from bs4 import BeautifulSoup

# Number of pages to crawl
num_pages = 5

# Open each page and fetch its HTML
for page_num in range(1, num_pages + 1):
    url = f"https://www.bernama.com/en/archive.php?page={page_num}"
    response = requests.get(url)
    html_content = response.text
    # Parse the HTML and extract the news titles and links
    soup = BeautifulSoup(html_content, "html.parser")
    news_list = soup.find_all("div", class_="row news-row")
    for news in news_list:
        title = news.find("a").text.strip()
        link = news.find("a")["href"]
        news_response = requests.get(link)
        news_html = news_response.text
        news_soup = BeautifulSoup(news_html, "html.parser")
        content_list = news_soup.find_all("div", class_="col-md-12 news-body")
        content = "\n".join([p.text for p in content_list[0].find_all("p")])
        # Save the title and content to a txt file
        with open(f"page{page_num}_{title}.txt", "w") as f:
            f.write(f"Title: {title}\n")
            f.write(f"Link: {link}\n")
            f.write(f"Content:\n{content}\n")
```
Modify this code.
This code is a Python crawler that scrapes news titles and article bodies from https://www.bernama.com/en/archive.php and saves them to txt files. Before modifying it, decide what you actually want to change, i.e. which features to add or adjust. Some possible modifications:
1. Add exception handling (try-except blocks) so that errors while fetching pages or parsing HTML do not crash the crawler (see the sketch after this list).
2. Send a User-Agent header to make it less likely that the site blocks your IP address.
3. Save the data in another format instead of txt, such as CSV or JSON.
4. Add multithreading or asynchronous requests to speed up crawling.
5. Change the number of pages crawled or the URL to collect different news data.
6. Change the output file naming scheme or directory to organize the saved data better.
7. Add other features, such as storing the news in a database or running text analysis on the article bodies.
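As an illustration of points 1 and 2, here is a minimal sketch of the same Bernama loop with a try-except block and a User-Agent header added. The header string and the ten-second timeout are arbitrary illustrative choices, not values required by the site:
```
import requests
from bs4 import BeautifulSoup

# Assumed example header; any common browser User-Agent string would do.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
num_pages = 5

for page_num in range(1, num_pages + 1):
    url = f"https://www.bernama.com/en/archive.php?page={page_num}"
    try:
        # The timeout keeps a hung connection from stalling the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    # ... parse news_list and save the files as in the original script ...
```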
Related questions
Modify this code and write out the result:
```
import requests
from bs4 import BeautifulSoup

# Number of pages to crawl
num_pages = 5

# Open each page and fetch its HTML
for page_num in range(1, num_pages + 1):
    url = f"https://www.bernama.com/bm/index.php/archive.php?page={page_num}"
    response = requests.get(url)
    html_content = response.text
    # Parse the HTML and extract the news titles and links
    soup = BeautifulSoup(html_content, "html.parser")
    news_list = soup.find_all("div", class_="row news-row")
    for news in news_list:
        title = news.find("a").text.strip()
        link = news.find("a")["href"]
        news_response = requests.get(link)
        news_html = news_response.text
        news_soup = BeautifulSoup(news_html, "html.parser")
        content_list = news_soup.find_all("div", class_="col-md-12 news-body")
        content = "\n".join([p.text for p in content_list[0].find_all("p")])
        print(content)
        # Save the title and content to a txt file
        with open(f"page{page_num}_{title}.txt", "w") as f:
            f.write(f"Title: {title}\n")
            f.write(f"Link: {link}\n")
            f.write(f"Content:\n{content}\n")
```
Here is the code written out with comments:
```
import requests
from bs4 import BeautifulSoup

# Number of pages to crawl
num_pages = 5

# Open each page and fetch its HTML
for page_num in range(1, num_pages + 1):
    url = f"https://www.bernama.com/bm/index.php/archive.php?page={page_num}"
    response = requests.get(url)
    html_content = response.text
    # Parse the HTML and extract the news titles and links
    soup = BeautifulSoup(html_content, "html.parser")
    news_list = soup.find_all("div", class_="row news-row")
    for news in news_list:
        title = news.find("a").text.strip()
        link = news.find("a")["href"]
        # Open each news link and fetch its HTML
        news_response = requests.get(link)
        news_html = news_response.text
        # Parse the news page and extract the article body
        news_soup = BeautifulSoup(news_html, "html.parser")
        content_list = news_soup.find_all("div", class_="col-md-12 news-body")
        content = "\n".join([p.text for p in content_list[0].find_all("p")])
        # Save the title and content to a txt file
        with open(f"page{page_num}_{title}.txt", "w") as f:
            f.write(f"Title: {title}\n")
            f.write(f"Link: {link}\n")
            f.write(f"Content:\n{content}\n")
```
```
import requests from bs4 import BeautifulSoup import os import time

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15'}
page_count = 0
for page_num in range(1,10000):
    for i in range(3579989,4580785):
        url = f" https://www.antaranews.com/berita/{i}/sekjen-puji-indonesia-selenggarakan-ktt-ke-42-asean-dengan-baik?utm_source=antaranews&utm_medium=desktop&utm_campaign=menu_news"
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, "html.parser")
        div = soup.find("div", {"class": "col-md-8"})
        if not div:
            continue
        text = div.text
        file = f"{page_count + 1}.txt"
        with open(file, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"{i} saved successfully.")
        page_count += 1
        if page_count >= 500:
            break
    if page_count >= 500:
        break
    time.sleep(15)
print("All pages saved successfully.")
```
Check this code and fix it.
This code is meant to crawl articles from a news site and save them locally, but it has a few problems that need fixing:
1. The requests and bs4 imports must be written on separate lines.
2. The URL string in the url variable starts with a space, which has to be removed.
3. The second argument of range should be 4580786, because range excludes its end value.
4. The output file should be named with the running counter, f"{page_count + 1}.txt", rather than the article ID f"{i}.txt", so saved articles are numbered consecutively.
5. Exception handling should be added around the requests so that a network problem does not abort the whole crawl.
The modified code is as follows:
```
import requests
from bs4 import BeautifulSoup
import os
import time

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15'}
page_count = 0
for page_num in range(1,10000):
    for i in range(3579989,4580786):
        url = f"https://www.antaranews.com/berita/{i}/sekjen-puji-indonesia-selenggarakan-ktt-ke-42-asean-dengan-baik?utm_source=antaranews&utm_medium=desktop&utm_campaign=menu_news"
        try:
            res = requests.get(url, headers=headers)
            soup = BeautifulSoup(res.text, "html.parser")
            div = soup.find("div", {"class": "col-md-8"})
            if not div:
                continue
            text = div.text
            file = f"{page_count + 1}.txt"
            with open(file, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"{i} saved successfully.")
            page_count += 1
            if page_count >= 500:
                break
            time.sleep(15)
        except Exception as e:
            print(f"Error occurred: {e}")
            continue
    if page_count >= 500:
        break
print("All pages saved successfully.")
```
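Two further refinements, offered as suggestions beyond the original answer: the outer `for page_num in range(1, 10000)` loop adds nothing, since the inner loop already scans every article ID and both loops stop once 500 articles are saved, so repeated outer iterations would only rescan the same IDs; and `requests.get` is called without a timeout, so one hung connection can stall the crawl. A minimal sketch of both adjustments (the ten-second timeout is an assumed value, not taken from the original code):
```
import time
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15'}
page_count = 0

# Single loop over article IDs; the redundant outer page_num loop is dropped.
for i in range(3579989, 4580786):
    url = f"https://www.antaranews.com/berita/{i}/sekjen-puji-indonesia-selenggarakan-ktt-ke-42-asean-dengan-baik?utm_source=antaranews&utm_medium=desktop&utm_campaign=menu_news"
    try:
        # The timeout bounds each request so a hung connection raises instead of blocking forever.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching article {i}: {e}")
        continue
    div = BeautifulSoup(res.text, "html.parser").find("div", {"class": "col-md-8"})
    if not div:
        continue
    page_count += 1
    with open(f"{page_count}.txt", "w", encoding="utf-8") as f:
        f.write(div.text)
    print(f"{i} saved successfully.")
    if page_count >= 500:
        break
    time.sleep(15)  # politeness delay between successful fetches

print("All pages saved successfully.")
```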