```python
import requests
from bs4 import BeautifulSoup

def get_total_count(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    total_count = soup.find('h2', class_='total fl').find('span').get_text()
    return int(total_count)

def crawl_lianjia():
    districts = ['wanzhou', 'yubei', 'jiangbei', 'shapingba', 'jiulongpo']
    base_url = 'https://cq.lianjia.com/ershoufang/{}/pg{}'
    total_counts = {}
    for district in districts:
        count = 0
        for page in range(1, 10):
            url = base_url.format(district, page)
            count += get_total_count(url)
        total_counts[district] = count
    return total_counts

result = crawl_lianjia()
print(result)
```

Modify this code so that it only crawls data from the past year.
I've updated the code so that it only counts second-hand home sales from the past year for the Wanzhou, Yubei, Jiangbei, Shapingba, and Jiulongpo districts of Chongqing on Lianjia. Note that Lianjia does not expose a direct "past year" query, so the code approximates it by capping the number of pages crawled and filtering each listing by its deal date. Here is the updated code:
```python
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/88.0.4324.150 Safari/537.36'
}

def crawl_lianjia():
    districts = ['wanzhou', 'yubei', 'jiangbei', 'shapingba', 'jiulongpo']
    # Deal dates are shown on the chengjiao (sold listings) pages,
    # not on the ershoufang (for-sale) listing pages, so crawl the former.
    base_url = 'https://cq.lianjia.com/chengjiao/{}/pg{}/'
    one_year_ago = datetime.now().date() - timedelta(days=365)
    total_counts = {}
    for district in districts:
        count = 0
        older_than_a_year = False
        for page in range(1, 10):  # crawl at most 9 pages per district
            url = base_url.format(district, page)
            response = requests.get(url, headers=HEADERS)
            soup = BeautifulSoup(response.text, 'html.parser')
            for house in soup.find_all('div', class_='info'):
                date_tag = house.find('div', class_='dealDate')
                if date_tag is None:
                    continue  # skip cards without a deal date
                date = datetime.strptime(date_tag.get_text().strip(),
                                         '%Y.%m.%d').date()
                if date < one_year_ago:
                    # Results are ordered newest first by default, so every
                    # remaining listing is also older: stop this district.
                    older_than_a_year = True
                    break
                count += 1
            if older_than_a_year:
                break
        total_counts[district] = count
    return total_counts

result = crawl_lianjia()
print(result)
```
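As a quick sanity check before running the full crawl, you can confirm that the `'%Y.%m.%d'` format string matches the dotted date style used on the listing cards; the sample string below is made up for illustration:

```python
from datetime import datetime, timedelta

# Hypothetical dealDate text as it would appear on a sold-listing card.
sample = '2023.11.05'
date = datetime.strptime(sample, '%Y.%m.%d').date()
one_year_ago = datetime.now().date() - timedelta(days=365)
print(date, date >= one_year_ago)  # parsed date and whether it passes the filter
```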
Make sure you have the requests and beautifulsoup4 libraries installed (datetime is part of Python's standard library, so no extra install is needed). Running the full script above returns a dictionary containing the number of second-hand homes sold in the past year in Chongqing's Wanzhou, Yubei, Jiangbei, Shapingba, and Jiulongpo districts. At most 9 pages are crawled per district, and only listings whose deal date falls within the past year are counted.
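Because of the Lianjia restrictions mentioned above, it also helps to space out requests and retry transient failures. Below is a minimal sketch: `fetch_page` is a hypothetical helper you could swap in for the `requests.get` call inside `crawl_lianjia`, and the one-second delay and three retries are arbitrary starting points, not values documented by Lianjia.

```python
import time
import requests

def fetch_page(url, headers, delay=1.0, retries=3):
    """Fetch a page politely: pause between requests and retry on failure.

    delay and retries are assumed defaults, not Lianjia-documented limits.
    """
    for attempt in range(retries):
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            time.sleep(delay)  # pause so successive requests are spaced out
            return response.text
        time.sleep(delay * (attempt + 1))  # back off before retrying
    response.raise_for_status()  # surface the last error if all retries fail
```

A fixed delay like this is the simplest option; if the site still rate-limits you, increasing the delay or adding jitter between requests is the usual next step.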