import csv

import requests
from lxml import etree

# Main program entry point.
if __name__ == "__main__":
    # Listing URL template; %d is filled in with the page number.
    url = 'https://jn.lianjia.com/zufang/pg%d'
    # Request headers: send a browser User-Agent with every request.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
    }
    # CSV columns: district, community name, layout, area (㎡), price (yuan/month).
    csv_head = ["区域", "小区名称", "户型", "面积(㎡)", "价格(元/月)"]
    path = "链家济南租房数据.csv"

    # Open the CSV once and keep it open for the whole crawl instead of
    # reopening it in append mode for every page. The encoding is explicit
    # (UTF-8 with BOM) so the Chinese text does not depend on the platform's
    # default encoding.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        csv_write = csv.writer(f)
        csv_write.writerow(csv_head)

        # range(85, 101) covers pages 85 through 100.
        for num in range(85, 101):
            # '%d' is a printf-style placeholder, so it has to be filled with
            # the % operator; str.format() would leave it untouched and every
            # iteration would request the same literal 'pg%d' URL.
            new_url = url % num
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=new_url, headers=header, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            tree = etree.HTML(response.text)

            # Parent elements of the individual rental listings.
            div_list = tree.xpath('//*[@id="content"]/div[1]/div[1]/div')
            for div in div_list:
                try:
                    info_texts = div.xpath('./div/p[2]/text()')
                    quyu = div.xpath('./div/p[2]/a[1]//text()')[0]
                    name = div.xpath('./div/p[2]/a[3]//text()')[0]
                    # The area text may be whitespace only; fall back to "".
                    area_parts = info_texts[4].split()
                    area = area_parts[0] if area_parts else ""
                    layout = info_texts[6].split()[0]
                    place = div.xpath('./div//span/em/text()')[0]
                except IndexError:
                    # This entry's markup lacks one of the expected nodes;
                    # skip it rather than abort the whole crawl.
                    continue
                csv_write.writerow([quyu, name, layout, area, place])
    print("完成")

# Request from the original post: change this script so that it first fetches
# the number of pages and then scrapes the data of each page.
时间: 2024-04-01 07:35:20 浏览: 148
# 第一个爬虫示例,爬取百度页面 import requests #导入爬虫的库,不然调用不了爬虫的函数 response =
要先获取网页数,可以先请求链家租房列表的第一页,从分页元素的 data-totalpage 属性中读取总页数,然后再根据总页数循环爬取每一页的租房信息。具体实现可以参考以下代码:
```python
import csv
import requests
from lxml import etree
# Listing URL template; {} is replaced by the 1-based page number.
url = 'https://jn.lianjia.com/zufang/pg{}'
# Request headers: send a browser User-Agent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
}
# CSV columns: district, community name, layout, area (㎡), price (yuan/month).
csv_head = ["区域", "小区名称", "户型", "面积(㎡)", "价格(元/月)"]
path = "链家济南租房数据.csv"
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 10


def fetch_tree(page):
    """Download listing page *page* and return its parsed HTML tree.

    Raises requests.HTTPError on a non-2xx response and the usual
    requests exceptions on timeout or connection failure.
    """
    response = requests.get(
        url=url.format(page), headers=header, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def read_total_pages(tree):
    """Return the page count stored in the ``data-totalpage`` attribute.

    Raises RuntimeError when the attribute is not present in *tree*, which
    means the downloaded page does not have the expected structure.
    """
    values = tree.xpath('//*[@id="content"]/div[1]/div[2]/div/@data-totalpage')
    if not values:
        raise RuntimeError("data-totalpage attribute not found on the first page")
    return int(values[0])


def node_at(nodes, index=0):
    """Return ``nodes[index]``, or "" when *nodes* is too short."""
    return nodes[index] if 0 <= index < len(nodes) else ""


def first_token(text):
    """Return the first whitespace-separated token of *text*, or ""."""
    parts = text.split()
    return parts[0] if parts else ""


def parse_listing(div):
    """Extract one CSV row (in ``csv_head`` order) from a listing element.

    Fields whose node is missing come back as "" instead of raising
    IndexError, so one oddly shaped listing cannot abort the crawl.
    """
    info_texts = div.xpath('./div/p[2]/text()')
    return [
        node_at(div.xpath('./div/p[2]/a[1]//text()')),
        node_at(div.xpath('./div/p[2]/a[3]//text()')),
        first_token(node_at(info_texts, 6)),
        first_token(node_at(info_texts, 4)),
        node_at(div.xpath('./div//span/em/text()')),
    ]


def main():
    """Read the page count from page 1, then scrape every page into the CSV."""
    first_tree = fetch_tree(1)
    total_page = read_total_pages(first_tree)

    # Open the CSV once for the whole crawl instead of reopening it per page.
    # The encoding is explicit (UTF-8 with BOM) so the Chinese text does not
    # depend on the platform's default encoding.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        csv_write = csv.writer(f)
        csv_write.writerow(csv_head)

        for num in range(1, total_page + 1):
            # Page 1 was already downloaded to read the page count; reuse it.
            tree = first_tree if num == 1 else fetch_tree(num)
            for div in tree.xpath('//*[@id="content"]/div[1]/div[1]/div'):
                data_row = parse_listing(div)
                # Skip elements from which no field at all could be extracted.
                if any(data_row):
                    csv_write.writerow(data_row)
            print("第{}页完成".format(num))
    print("全部完成")


if __name__ == "__main__":
    main()
```
这样就可以先获取总页数,再循环爬取每一页的租房信息,将数据写入到CSV文件中。
阅读全文