使用requests和xpath从搜房网上抓取上海的二手房房源信息(要求获取所有分页上的房源,且每套房源包含标题、楼盘、地点、经纬度、面积、房型、楼层、朝向、建筑年代、单价、总价、经纪人、联系电话等,缺数据的留空)请编写完整代码
时间: 2024-10-11 18:14:05 浏览: 28
首先,你需要安装必要的库:`requests`(发送HTTP请求)、`lxml`(HTML解析器,同时也是XPath的实现)、`beautifulsoup4`(下面的代码用它配合CSS选择器提取字段;注意BeautifulSoup本身并不支持XPath,如需使用XPath应直接调用`lxml.etree`)以及`selenium`(用于模拟点击“下一页”翻页)。然后按照以下步骤编写Python代码:
```python
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
# URL of the listing page that the scraper starts from.
# NOTE(review): assumed to be the Shanghai second-hand housing list on
# fang.com -- confirm that this address is still valid before running.
base_url = "https://sh.fang.com/touch/ershoufang/"
def get_html(url, timeout=10):
    """Download *url* with requests and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser, which would
    # silently yield zero listings.
    response.raise_for_status()
    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which garbles Chinese text; use the detected
    # encoding in that case.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return response.text
def parse_page(html):
    """Extract one dict per listing from a listing page's HTML.

    Every element matching ``.list-item`` becomes one record.  For each
    text field the first descendant matching the field's CSS selector
    supplies the value via ``get_text()``; a field whose selector matches
    nothing is stored as ``''``.  ``lat_long`` is always ``None`` because
    coordinates are not extracted here.

    NOTE(review): the selectors are used as given; whether they match the
    markup of the live site cannot be verified from this file.

    Args:
        html: HTML source of a listing page.

    Returns:
        A list of dicts, in document order, keyed by field name.
    """
    # (output key, CSS selector) in the order the keys appear in each record.
    # A selector of None marks a field that is not scraped from the page.
    field_selectors = (
        ('title', '.title'),
        ('district', '.district'),
        ('location', '.location'),
        ('lat_long', None),
        ('area', '.area'),
        ('house_type', '.house-type'),
        ('floor', '.floor'),
        ('orientation', '.orientation'),
        ('year_of_construction', '.year-of-construction'),
        ('price_per_square_meter', '.price-per-square-meter'),
        ('total_price', '.total-price'),
        ('agent_name', '.agent-name'),
        ('phone_number', '.phone-number'),
    )

    document = BeautifulSoup(html, 'lxml')
    houses = []
    for entry in document.select('.list-item'):
        record = {}
        for key, selector in field_selectors:
            if selector is None:
                # Coordinates would need a separate lookup (e.g. a map API).
                record[key] = None
                continue
            node = entry.select_one(selector)
            record[key] = '' if node is None else node.get_text()
        houses.append(record)
    return houses
def fetch_all_houses(max_pages=None):
    """Collect listings from the first page and every following page.

    The first page is downloaded with ``get_html``.  A single Chrome session
    is then opened on ``base_url`` and the "下一页" (next page) link is clicked
    repeatedly; after each click the rendered page source is parsed with
    ``parse_page``.  Paging stops when no visible next-page link is left,
    when the page does not change after a click, when ``max_pages`` pages
    have been collected, or when Selenium reports an error.  Browser errors
    are reported and the listings gathered so far are returned.

    Args:
        max_pages: Optional upper bound on the number of pages to collect
            (including the first one).  ``None`` means no limit.

    Returns:
        A list of listing dicts as produced by ``parse_page``.
    """
    # Local imports keep this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    houses = parse_page(get_html(base_url))
    next_button_xpath = '//a[contains(text(), "下一页")]'

    driver = None
    pages_fetched = 1
    try:
        # One browser session for the whole crawl: restarting the browser on
        # base_url for every page would only ever reach page 2.
        driver = Chrome(options=ChromeOptions())
        driver.get(base_url)
        while max_pages is None or pages_fetched < max_pages:
            # find_elements returns an empty list instead of raising when the
            # link is absent (find_element_by_xpath was removed in Selenium 4).
            candidates = driver.find_elements(By.XPATH, next_button_xpath)
            next_button = next(
                (link for link in candidates if link.is_displayed()), None
            )
            if next_button is None:
                break
            next_button.click()
            try:
                # Wait until the clicked link is detached from the DOM.
                # NOTE(review): assumes the click triggers a navigation or
                # re-render that replaces the link -- confirm on the live site.
                WebDriverWait(driver, 10).until(EC.staleness_of(next_button))
            except TimeoutException:
                # Page did not change; stop rather than re-parse the same page.
                break
            houses.extend(parse_page(driver.page_source))
            pages_fetched += 1
    except WebDriverException as e:
        print(f"Failed to load more pages: {e}")
    finally:
        # Always release the browser, even when paging fails midway.
        if driver is not None:
            driver.quit()
    return houses
# Script body: scrape every reachable page, then print one listing dict
# per line.
all_houses = fetch_all_houses()
for house in all_houses:
    print(house)
```
阅读全文