import scrapy class LianjiaSpider(scrapy.Spider): name = "lianjia" allowed_domains = ["lianjia.com"] start_urls = [f"https://bj.lianjia.com/zufang/pg{i}/#contentList" for i in range(1,3)] def parse(self, response): full_url = ['https://bj.lianjia.com' + url for url in response.xpath('//div[@class="content__list--item--main"]/p[1]/a/@href').extract()] print(full_url) all_url = [] for i in range(len(full_url)): if i % 2 == 0: all_url.append(full_url[i]) print(all_url) # # 把获得的url再次发送请求 for item in all_url: # self.parse_info是一个解析响应的方法 ''' 回调函数,Scrapy能够在接收到响应后自动调用该函数,并将响应传递给它进行处理。 这使得可以在请求不同页面时使用不同的解析方法来提取数据,实现爬虫的多页面爬取和数据处理功能 ''' yield scrapy.Request(url=item,callback=self.parse_info) # # 处理响应结果 def parse_info(self,response): # .replace(' ','').replace('\n ','') title = response.xpath("//div[@class='content clear w1150']/p/text()").get().replace(' ','').replace('\n ','') #房源标题 total_price = response.xpath('//div[@class="content__aside--title"]/span/text()|//div[@class="content__aside--title"]/text()').getall() price = ''.join(total_price).replace('\n ', '').replace(' \n ', '').replace('\n ','') print(title,price) 要跳过title为Nonetype的内容,怎样设置
时间: 2024-02-14 17:30:19 浏览: 124
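原代码报错的原因是:当 XPath 没有匹配到任何节点时,`.get()` 会返回 `None`,紧接着对 `None` 调用 `.replace()` 就会抛出 `AttributeError`。要跳过 `title` 为 `NoneType` 的页面,可以在对 `title` 做字符串处理之前先加一个条件判断,过滤掉空值。以下是修改后的代码示例: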
```python
import scrapy
class LianjiaSpider(scrapy.Spider):
    """Crawl Beijing rental listings on lianjia.com and print title and price.

    ``parse`` handles the paginated listing pages in ``start_urls`` and
    schedules one request per selected detail page; ``parse_info`` handles
    each detail page and skips those on which no title node is found.
    """

    name = "lianjia"
    allowed_domains = ["lianjia.com"]
    start_urls = [f"https://bj.lianjia.com/zufang/pg{i}/#contentList" for i in range(1, 3)]

    def parse(self, response):
        """Extract detail-page links from a listing page and request them.

        Args:
            response: The Scrapy response for one listing page.

        Yields:
            scrapy.Request: One request per selected detail URL, with
            ``parse_info`` as its callback.
        """
        hrefs = response.xpath('//div[@class="content__list--item--main"]/p[1]/a/@href').extract()
        full_url = ['https://bj.lianjia.com' + href for href in hrefs]
        print(full_url)

        # Keep only the links at even positions (0, 2, 4, ...).
        # NOTE(review): presumably every other match is unwanted (e.g. a
        # duplicate or promoted entry) - confirm against the page markup.
        all_url = full_url[::2]
        print(all_url)

        # Request each detail page; Scrapy calls parse_info with the response.
        for url in all_url:
            yield scrapy.Request(url=url, callback=self.parse_info)

    def parse_info(self, response):
        """Print the title and price of one detail page.

        Pages on which the title XPath matches nothing are skipped: ``.get()``
        returns ``None`` in that case, and calling string methods on it would
        raise ``AttributeError``.

        Args:
            response: The Scrapy response for one detail page.
        """
        raw_title = response.xpath("//div[@class='content clear w1150']/p/text()").get()
        if raw_title is None:
            # No title node on this page: nothing to report, skip it.
            return

        # Drop every whitespace character (spaces, newlines, tabs). A chain of
        # replace(' ', '') followed by replace('\n ', '') cannot do this: once
        # the spaces are gone the second pattern never matches.
        title = ''.join(raw_title.split())  # listing title

        total_price = response.xpath('//div[@class="content__aside--title"]/span/text()|//div[@class="content__aside--title"]/text()').getall()
        # Trim the newline/indent padding around each text fragment, then glue
        # the fragments (amount and unit) back together.
        price = ''.join(fragment.strip() for fragment in total_price)
        print(title, price)
```
在这个示例中,`parse_info` 会先判断标题的 XPath 是否匹配到了内容,只有匹配到时才继续做字符串清洗并输出标题和价格;如果没有匹配到(也就是原来得到 `None` 的情况),该页面会被直接跳过,不会再触发 `AttributeError`。
希望这样能解决你的问题。如果还有其他问题,请随时提问。
阅读全文