import csv import requests from lxml import etree if name == "main": url = 'https://heze.lianjia.com/zufang/pg%d' header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57' } #爬取源码数据 headers = ["区域", "小区名称", "户型", "面积(㎡)", "价格(元/月)"] path = "链家菏泽租房数据.csv" with open(path, 'wb') as f: csv_write = csv.writer(f) csv_head = ["区域", "小区名称", "户型", "面积(㎡)", "价格(元/月)"] csv_write.writerow(csv_head) for num in range(85,101): new_url = format(url%num) page_num1 = requests.get(url=new_url, headers=header) page_num1.encoding = ('utf-8') page_num = page_num1.text tree = etree.HTML(page_num) # 存储的就是标签对象 div_list = tree.xpath('//*[@id="content"]/div[1]/div[1]/div') with open(path, 'a+') as f: for div in div_list: quyu = div.xpath('./div/p[2]/a[1]//text()')[0] name = div.xpath('./div/p[2]/a[3]//text()')[0] area = div.xpath('./div/p[2]/text()')[4].split()[0] type = div.xpath('./div/p[2]/text()')[6].split()[0] place = div.xpath('./div//span/em/text()')[0] csv_write = csv.writer(f) data_row = [quyu,name,type,area,place] csv_write.writerow(data_row) print("完成")改错
时间: 2024-04-01 18:35:47 浏览: 50
代码存在一些问题,我来帮你改正:
1. 首先，所有代码被挤在了同一行，Python 依靠换行和缩进来区分语句，因此每条 import 语句和每条代码都必须单独成行，并按照所属代码块正确缩进。
2. if name == "main" 这行代码应该是 if __name__ == "__main__",这是一个常用的Python代码块,表示该模块被直接运行时才会执行的代码块。
3. csv模块读写文件时要使用文本模式，即'w'或'r'，而不是'wb'。因此，需要将这行代码中的'wb'改为'w'，并建议同时传入 newline='' （避免Windows下写出空行）和 encoding='utf-8'。
4. 在for循环中,第5个参数(价格)的提取方式有问题。需要将div.xpath('./div//span/em/text()')[0]改为div.xpath('./div//span/text()')[1],因为价格所在的标签中包含了一个em标签。
改正后的代码如下:
```
import csv
import requests
from lxml import etree

if __name__ == "__main__":
    # Scrape Heze (菏泽) rental listings from lianjia.com, pages 85-100,
    # and dump them into a UTF-8 CSV file.
    # %d in the template is replaced by the page number below.
    url = 'https://heze.lianjia.com/zufang/pg%d'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
    }
    path = "链家菏泽租房数据.csv"

    # Write the header row once.  Text mode with newline='' is required by
    # the csv module (prevents blank lines on Windows); utf-8 keeps the
    # Chinese column names intact.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv_head = ["区域", "小区名称", "户型", "面积(㎡)", "价格(元/月)"]
        csv.writer(f).writerow(csv_head)

    for num in range(85, 101):
        # Plain %-formatting is enough; the extra format() wrapper was a no-op.
        new_url = url % num
        response = requests.get(url=new_url, headers=header)
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)  # parsed page as an element tree
        # One <div> per listing card on the results page.
        div_list = tree.xpath('//*[@id="content"]/div[1]/div[1]/div')

        # Append this page's rows; 'a' (not 'a+') since we never read back.
        with open(path, 'a', newline='', encoding='utf-8') as f:
            csv_write = csv.writer(f)  # one writer per page, not per row
            for div in div_list:
                quyu = div.xpath('./div/p[2]/a[1]//text()')[0]
                name = div.xpath('./div/p[2]/a[3]//text()')[0]
                area = div.xpath('./div/p[2]/text()')[4].split()[0]
                # renamed from `type`, which shadows the builtin
                house_type = div.xpath('./div/p[2]/text()')[6].split()[0]
                # NOTE(review): index [1] assumes the price is the second
                # text node under the span — confirm against the live page.
                place = div.xpath('./div//span/text()')[1]
                csv_write.writerow([quyu, name, house_type, area, place])

    print("完成")
```
阅读全文