import scrapy class LianjiaSpider(scrapy.Spider): name = "lianjia" allowed_domains = ["lianjia.com"] start_urls = [f"https://bj.lianjia.com/zufang/pg{i}/#contentList" for i in range(1,3)] def parse(self, response): full_url = ['https://bj.lianjia.com' + url for url in response.xpath('//div[@class="content__list--item--main"]/p[1]/a/@href').extract()] print(full_url) all_url = [] for i in range(len(full_url)): if i % 2 == 0: all_url.append(full_url[i]) print(all_url) # # 把获得的url再次发送请求 for item in all_url: # self.parse_info是一个解析响应的方法 ''' 回调函数,Scrapy能够在接收到响应后自动调用该函数,并将响应传递给它进行处理。 这使得可以在请求不同页面时使用不同的解析方法来提取数据,实现爬虫的多页面爬取和数据处理功能 ''' yield scrapy.Request(url=item,callback=self.parse_info) # # 处理响应结果 def parse_info(self,response): # .replace(' ','').replace('\n ','') title = response.xpath("//div[@class='content clear w1150']/p/text()").get().replace(' ','').replace('\n ','') #房源标题 total_price = response.xpath('//div[@class="content__aside--title"]/span/text()|//div[@class="content__aside--title"]/text()').getall() price = ''.join(total_price).replace('\n ', '').replace(' \n ', '').replace('\n ','') print(title,price) 这段代码中怎样跳过空的Nonetype,把不为空的保留
时间: 2024-02-14 13:30:19 浏览: 156
scrapy_store.steampowered.com:一个在http上检索游戏信息的项目
要跳过空的 `NoneType`,你可以在 `for` 循环中使用条件判断来过滤掉空的值,只保留不为空的值。以下是示例代码:
```python
import scrapy
class LianjiaSpider(scrapy.Spider):
    """Spider for the listing pages under https://bj.lianjia.com/zufang/.

    ``parse`` handles the index pages in ``start_urls`` and schedules a
    request for the detail pages it finds; ``parse_info`` handles each
    detail page and prints the title and price text it extracts.
    """

    name = "lianjia"
    allowed_domains = ["lianjia.com"]
    start_urls = [f"https://bj.lianjia.com/zufang/pg{i}/#contentList" for i in range(1, 3)]

    def parse(self, response):
        """Schedule a detail-page request for every other link on an index page.

        Args:
            response: The response for one of the index pages.

        Yields:
            scrapy.Request: One request per kept URL, with ``parse_info``
            as its callback.
        """
        hrefs = response.xpath(
            '//div[@class="content__list--item--main"]/p[1]/a/@href'
        ).getall()
        full_url = ['https://bj.lianjia.com' + href for href in hrefs]
        print(full_url)

        # Keep only the even-indexed links (0, 2, 4, ...).
        all_url = full_url[::2]
        print(all_url)

        # Send a follow-up request for each collected URL.
        for item in all_url:
            # Every entry is the result of a string concatenation, so this
            # guard never actually filters anything; the None that needs
            # handling is the one returned by ``.get()`` in ``parse_info``.
            if item is not None:
                yield scrapy.Request(url=item, callback=self.parse_info)

    def parse_info(self, response):
        """Print the title and price of a detail page, skipping empty results.

        ``.get()`` returns ``None`` when the XPath matches no node, and
        calling ``.replace`` on ``None`` raises ``AttributeError``. Pages
        without a title are therefore skipped instead of crashing the
        callback.

        Args:
            response: The response for one detail page.
        """
        raw_title = response.xpath(
            "//div[@class='content clear w1150']/p/text()"
        ).get()
        if raw_title is None:
            # Nothing matched on this page: skip it and keep the non-empty ones.
            return
        title = raw_title.replace(' ', '').replace('\n ', '')  # listing title

        # ``.getall()`` returns an empty list when nothing matches, so the
        # join below is safe and simply produces an empty price string.
        total_price = response.xpath(
            '//div[@class="content__aside--title"]/span/text()|//div[@class="content__aside--title"]/text()'
        ).getall()
        price = ''.join(total_price).replace('\n ', '').replace(' \n ', '').replace('\n ', '')
        print(title, price)
```
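在这个示例中，我添加了一个 `if item is not None` 的条件判断来跳过空的 `NoneType` 值，只处理不为空的值。需要注意的是，`all_url` 中的元素都是由字符串拼接得到的，本身不会是 `None`；`NoneType` 报错更可能来自 `parse_info` 中的 `.get()`——当 XPath 没有匹配到任何节点时它会返回 `None`，因此在对其结果调用 `.replace()` 之前也应先判断是否为 `None`。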
希望这样能解决你的问题。如果还有其他问题,请随时提问。
阅读全文