```
import requests
from bs4 import BeautifulSoup
import pprint
import json
import time

def download_all_htmls():
    """Download the HTML of every listing page for later analysis."""
    htmls = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    for idx in range(24):
        url = f"https://www.utusan.com.my/page/{idx+1}"
        print("craw html:", url)
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception("error")
        htmls.append(r.text)
        with open(f"page{idx+1}.txt", "w", encoding="utf-8") as f:
            f.write(r.text)
        if (idx+1) % 20 == 0:
            print("Sleep for 10 seconds...")
            time.sleep(10)
    return htmls

htmls = download_all_htmls()
for idx, html in enumerate(htmls):
    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find_all('article')
    for article in articles:
        title = article.find('h2').get_text().strip()
        content = article.find('div', {'class': 'field-item even'}).get_text().strip()
        with open(f"page{idx+1}_{title}.txt", "w", encoding="utf-8") as f:
            f.write(content)
```
This code only fetches the listing pages themselves and never follows the individual article links, so it effectively stays on the front pages. Please improve it so that it can crawl the pages behind 500 article links.
The code can be improved to loop over each listing page, collect all of the article links on that page, and then crawl the content behind each link in turn until 500 links have been crawled. A concrete implementation looks like this:
```
import requests
from bs4 import BeautifulSoup
import pprint
import time

def download_all_articles():
    """Download the content of all articles."""
    articles = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    for idx in range(25):
        # Fetch one listing page
        url = f"https://www.utusan.com.my/page/{idx+1}"
        print("crawl listing page:", url)
        r = requests.get(url, headers=headers)  # actually send the User-Agent header
        if r.status_code != 200:
            raise Exception(f"failed to fetch listing page: {url}")
        soup = BeautifulSoup(r.text, 'html.parser')
        # Collect every article link on this listing page
        article_links = soup.find_all('a', {'class': 'node-title'})
        for link in article_links:
            article_url = link.get('href')
            print("crawl article:", article_url)
            r = requests.get(article_url, headers=headers)
            if r.status_code != 200:
                raise Exception(f"failed to fetch article: {article_url}")
            soup = BeautifulSoup(r.text, 'html.parser')
            title = soup.find('h1', {'class': 'title'}).get_text().strip()
            content = soup.find('div', {'class': 'field-item even'}).get_text().strip()
            articles.append({'title': title, 'content': content})
            # Stop as soon as 500 articles have been collected
            if len(articles) >= 500:
                return articles
            time.sleep(1)  # pause between article requests to avoid hammering the site
    return articles

articles = download_all_articles()
pprint.pprint(articles)
```
This code walks through each listing page and collects all of the article links on it. It then crawls the content behind each link in turn and appends it to a list, stopping once 500 links have been crawled. Note that a one-second pause is added between requests to avoid hitting the site too frequently.
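The code in the question wrote each article to its own text file. If the 500 crawled articles should be persisted rather than only printed, a minimal sketch, assuming the `articles` list returned above (the filename `articles.json` is just an example), could dump everything to a single JSON file:

```
import json

# Assumes `articles` is the list of {'title': ..., 'content': ...} dicts
# returned by download_all_articles() above.
with open("articles.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text (e.g. Malay article titles) readable
    json.dump(articles, f, ensure_ascii=False, indent=2)
```

Writing one JSON file also sidesteps a problem in the original code, where a title containing characters such as `/` would produce an invalid filename.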