Write a Python web crawler that visits the site ranking list at https://top.chinaz.com/all/ and saves all content from the first 10 pages of the ranking to local files. Requirements: 1. Organize the code into functions. 2. Add appropriate comments to the code.
```python
# Import the required libraries
import csv
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch the HTML content of the given URL."""
    # A browser-like User-Agent makes it less likely the request is rejected
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        # Send a GET request with a timeout so the script cannot hang forever
        response = requests.get(url, headers=headers, timeout=10)
        # A status code of 200 means the request succeeded
        if response.status_code == 200:
            return response.text
        print(f"Failed to fetch URL {url}, status code: {response.status_code}")
        return None
    except requests.RequestException as e:
        print(f"Error occurred while fetching the URL: {e}")
        return None


def parse_content(html):
    """Parse the HTML and extract the ranking entries."""
    # "lxml" requires the lxml package; "html.parser" also works
    soup = BeautifulSoup(html, "lxml")
    # NOTE: the 'list-item' class is an assumption inherited from the original
    # answer; inspect the live page to confirm the actual container selector.
    data_divs = soup.find_all("div", class_="list-item")
    result = []
    for div in data_divs:  # keep every entry on the page, per the requirement
        link_tag = div.find("a")
        if link_tag is None:  # skip containers without a link
            continue
        title = link_tag.get_text(strip=True)
        link = link_tag.get("href", "")
        result.append((title, link))
    return result


def save_to_file(data, filename):
    """Save the extracted data to a local CSV file."""
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Link"])
        writer.writerows(data)


def main():
    base_url = "https://top.chinaz.com/all/"
    page_count = 10  # number of ranking pages to crawl
    for i in range(1, page_count + 1):
        # NOTE: the '?p=' query parameter follows the original answer;
        # the site may actually paginate differently (e.g. index_2.html),
        # so verify the pattern before running.
        url = f"{base_url}?p={i}"
        html_content = get_html(url)
        if html_content is not None:
            data = parse_content(html_content)
            file_name = f"rankings_page_{i}.csv"
            save_to_file(data, file_name)
            print(f"Page {i} processed, data saved to {file_name}")


if __name__ == "__main__":
    main()
```
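The parser above keeps only titles and links. Since the task asks to save *all* content of the first 10 pages, a simpler variant is to write the raw HTML of each page to disk. A minimal sketch, reusing the `get_html` function from the answer above and the same assumed `?p=` pagination scheme:

```python
def save_raw_pages(base_url="https://top.chinaz.com/all/", page_count=10):
    """Save the raw HTML of each ranking page to a local .html file."""
    for i in range(1, page_count + 1):
        # Pagination pattern assumed, as noted in main() above
        html = get_html(f"{base_url}?p={i}")
        if html is not None:
            with open(f"page_{i}.html", "w", encoding="utf-8") as f:
                f.write(html)
```

Saving the raw HTML also makes it easy to re-run `parse_content` offline once the correct CSS selector for the ranking entries has been confirmed.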