Python 爬取链家二手房信息 400 条,并存入 house.xlsx 表格中
时间: 2024-04-01 22:35:20 浏览: 52
好的,我可以为您提供以下代码来实现这个爬取任务。不过需要注意的是,网站的反爬虫机制比较严格,如果爬取过于频繁会被封IP,建议加上一些延时等措施。
```python
import requests
import openpyxl
from bs4 import BeautifulSoup
import time
# Scrape second-hand housing listings from Lianjia (Beijing) and store them
# in house.xlsx. Listing pages are addressed as url + page number.
url = "https://bj.lianjia.com/ershoufang/pg"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Header row of the spreadsheet, in column order A..H.
COLUMN_TITLES = (
    '小区名称',
    '户型',
    '面积',
    '朝向',
    '楼层',
    '年代',
    '总价(万)',
    '单价(元/平米)',
)
PAGE_COUNT = 20        # listing pages to visit (1..PAGE_COUNT)
MAX_RECORDS = 400      # stop once this many rows have been written
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled request hangs forever
PAGE_DELAY = 1         # seconds to pause between pages to stay polite
OUTPUT_FILE = 'house.xlsx'


def split_field(text, sep, index):
    """Return the stripped ``index``-th piece of ``text`` split on ``sep``.

    Returns an empty string instead of raising when ``text`` has fewer
    pieces than expected, so one malformed listing cannot abort the run.
    """
    parts = text.split(sep)
    if 0 <= index < len(parts):
        return parts[index].strip()
    return ''


def extract_number(text):
    """Return only the ASCII digits and dots of ``text``, in order.

    Used for the unit price so the result does not depend on the exact
    length of the label/unit text surrounding the number.
    """
    return ''.join(ch for ch in text if ch in '0123456789.')


def tag_text(house, css_class):
    """Return the text of the first ``div`` with ``css_class`` inside ``house``.

    Returns an empty string when no such ``div`` exists.
    """
    tag = house.find('div', {'class': css_class})
    return tag.get_text() if tag is not None else ''


def parse_house(house):
    """Convert one listing element into a row tuple matching COLUMN_TITLES.

    Returns ``None`` when the element has no title link (treated as not
    being a real listing). Any other missing field becomes an empty string.
    """
    title = house.find('div', {'class': 'title'})
    if title is None or title.a is None:
        return None
    name = title.a.get_text()

    # Look each info line up once and slice it several times.
    # NOTE(review): the field positions below are kept from the original
    # script; confirm them against the current page layout.
    house_info = tag_text(house, 'houseInfo')
    position_info = tag_text(house, 'positionInfo')

    total = house.find('div', {'class': 'totalPrice'})
    if total is not None and total.span is not None:
        total_price = total.span.get_text()
    else:
        total_price = ''

    return (
        name,
        split_field(house_info, '|', 1),     # room layout
        split_field(house_info, '|', 2),     # area
        split_field(house_info, '|', 3),     # orientation
        split_field(position_info, '-', 1),  # floor
        split_field(position_info, '-', 0),  # year
        total_price,
        extract_number(tag_text(house, 'unitPrice')),
    )


def fetch_listings(page):
    """Download listing page ``page`` and return its listing elements.

    Raises ``requests.RequestException`` on network errors, timeouts and
    non-2xx HTTP responses.
    """
    res = requests.get(url + str(page), headers=headers, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup.find_all('div', {'class': 'info clear'})


def main():
    """Crawl up to MAX_RECORDS listings and write them to OUTPUT_FILE."""
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "house"
    sheet.append(COLUMN_TITLES)

    saved = 0
    try:
        for page in range(1, PAGE_COUNT + 1):
            print("正在爬取第{}页数据".format(page))
            try:
                house_list = fetch_listings(page)
            except requests.RequestException as exc:
                # One bad page should not discard the pages already scraped.
                print("第{}页请求失败,已跳过: {}".format(page, exc))
                house_list = []

            for house in house_list:
                record = parse_house(house)
                if record is None:
                    continue
                sheet.append(record)
                saved += 1
                if saved >= MAX_RECORDS:
                    break

            if saved >= MAX_RECORDS:
                break
            time.sleep(PAGE_DELAY)
    finally:
        # Save whatever was collected, even if the crawl was interrupted.
        wb.save(OUTPUT_FILE)
    print("数据爬取完成!")


if __name__ == "__main__":
    main()
```
阅读全文