def parse(url): try: res = requests.get(url, headers=headers) html = etree.HTML(res.text) books = html.xpath('//ul[@class="bang_list clearfix bang_list_mode"]/li') for b in books: name = b.xpath('./div[@class="name"]/a/text()')[0] link = b.xpath('./div[@class="name"]/a/@href')[0] comment = b.xpath('./div[@class="star"]/a/text()')[0] author = b.xpath('./div[@class="publisher_info"][1]/a[1]/@title')[0] publisher = b.xpath('./div[@class="publisher_info"][2]/a/text()')[0] price_n = b.xpath('./div[@class="price"]/p[1]/span[@class="price_n"]/text()')[0] price_r = b.xpath('./div[@class="price"]/p[1]/span[@class="price_r"]/text()')[0]这段代码的意思
时间: 2023-06-24 22:08:50 浏览: 81
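这段代码定义了一个名为 parse 的 Python 函数,它接收一个 URL 参数,并使用 requests 库发送 HTTP GET 请求,获取该 URL 对应的 HTML 网页。然后,使用 lxml 库的 etree 模块解析 HTML 文档,并通过 XPath 提取其中的书籍信息。需要注意的是,题目给出的代码片段在提取价格之后即被截断(try 语句没有对应的 except 分支),因此提取到的数据随后如何保存(例如存入一个字典)在片段中并未展示。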
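具体来说,该函数首先通过 XPath 表达式选取所有包含书籍信息的<li>元素。然后,对于每个<li>元素,使用相对 XPath 表达式提取书名、链接、评论、作者、出版社以及两个价格字段(price_n 与 price_r),并对每个结果取下标 [0],即第一个匹配项;如果某个字段没有匹配到任何节点,取下标的操作会抛出 IndexError。按照常见写法,这些字段随后会被存入一个字典,所有字典再汇总到一个列表中作为函数的返回值,但这部分代码并不包含在所给的片段中。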
相关问题
from urllib.parse import urljoin

from lxml import etree
import requests

# Listing page whose entries link to one detail page each.
url = "https://sph.pku.edu.cn/szdw/lxbywstjxx.htm"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.12022 SLBChan/25"
}

# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Let requests guess the charset from the body instead of trusting the headers.
response.encoding = response.apparent_encoding
html = etree.HTML(response.text)

# Every <a> in the listing; the absolute XPath is tied to the page's current layout.
anchors = html.xpath("/html/body/div[3]/div/div/div[3]/div[2]/ul/li/div[1]/a")
for a in anchors:
    link = a.xpath(".//@href")[0]
    # Listing hrefs may be relative, so resolve them against the listing URL.
    detail_url = urljoin(url, link)

    new_response = requests.get(detail_url, headers=headers, timeout=10)
    new_response.encoding = new_response.apparent_encoding
    new_html = etree.HTML(new_response.text)

    panels = new_html.xpath("/html/body/div/div[4]/div/div/div[3]/div[2]/form[1]/div[1]/div[2]")
    for panel in panels:
        # Join the text nodes directly instead of slicing the repr of the list,
        # which produced stray quotes/escapes for zero or several nodes.
        name = "".join(panel.xpath(".//h2/span/text()")).strip()
        email = "".join(panel.xpath(".//ul/li[2]/span/text()")).strip()
        print(name)
        print(email)
这段代码使用Python,导入urllib.parse和lxml.etree模块以及requests模块来爬取指定网站的信息。使用requests模块发起一个GET请求获取网页内容,并使用etree模块来解析HTML文档。其中,使用xpath方法从HTML文档中获取所需信息,再使用urljoin方法将相对链接转换为绝对链接。
帮我将以下代码写注释# coding=gbk # -- coding:uft-8 -- # 贝壳网小区 import requests from lxml import etree from time import sleep import hashlib from urllib import parse import pandas as pd def getPosi(tar): try: ak = 'C8rQZy1askzzMtdY3ChAZUer1P0PRjI0' sk = 'shShi1VLCkH1gGR4v75d2LTnrn2Vm5Mg' add = f'/geocoding/v3/?address={tar}&output=json&ak={ak}&city=大连市' add = parse.quote(add, safe="/:=&?#+!$,;'@()*[]") sn = hashlib.md5(parse.quote_plus(add + sk).encode('utf-8')).hexdigest() url = f'https://api.map.baidu.com{add}&sn={sn}' dic = requests.get(url).json() lat = dic['result']['location']['lat'] lng = dic['result']['location']['lng'] return lat, lng except: return None, None def collect(): items = { 'ganjingzi': 22, 'zhongshan': 19, 'shahekou': 14, 'xigang': 12 } resLs = [] for key in items: for page in range(items[key]): page += 1 url = f'https://dl.ke.com/xiaoqu/{key}/pg{page}/' headers = { 'User-Agent': ua, 'Referer': url } while True: try: res = requests.get(url=url, headers=headers, timeout=(5, 5)).content.decode('utf-8') break except: print('again') tree = etree.HTML(res) for li in tree.xpath('//ul[@class="listContent"]/li'): href = li.xpath('./a/@href')[0] while True: try: res = requests.get(url=href, headers=headers, timeout=(5, 5)).content.decode('utf-8') break except: print('again') tree = etree.HTML(res) dic = { 'href': href, 'key': key, 'name': tree.xpath('//h1/@title')[0], 'price': (tree.xpath('//span[@class="xiaoquUnitPrice"]/text()') + [''])[0], 'property': tree.xpath('//span[@class="xiaoquInfoContent"]/text()')[1].strip(), 'building': tree.xpath('//span[@class="xiaoquInfoContent"]/text()')[4].strip(), 'house': tree.xpath('//span[@class="xiaoquInfoContent"]/text()')[5].strip() } dic['lat'], dic['lng'] = getPosi(dic['name']) print(dic) resLs.append(dic) sleep(3) df = pd.DataFrame(resLs) df.to_excel('贝壳网小区.xlsx', encoding='utf-8', index=False) if name == 'main': ua = 
'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/91.0.4472.106Safari/537.36' collect()
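# coding=gbk # Source-encoding declaration (PEP 263); Python only honours it on line 1 or 2 of the file
# -- coding:uft-8 -- # NOTE: "uft-8" is a typo for "utf-8"; a second declaration is ignored anyway when the gbk line above comes first
# 贝壳网小区 # Scraper for residential-community (xiaoqu) listings on ke.com (Beike)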
import requests # 导入 requests 模块
from lxml import etree # 导入 lxml 中的 etree 模块
from time import sleep # 导入 time 模块中的 sleep 函数
import hashlib # 导入 hashlib 模块
from urllib import parse # 导入 urllib 中的 parse 模块
import pandas as pd # 导入 pandas 模块
# Look up the coordinates of a place name via the Baidu Maps geocoding API.
def getPosi(tar):
    """Return ``(lat, lng)`` for the address *tar*, or ``(None, None)`` on failure.

    The request is signed with an ``sn`` parameter: the MD5 hex digest of the
    URL-quoted query path concatenated with the secret key. The lookup is
    restricted to ``city=大连市``.

    This is a best-effort helper: network errors, a non-JSON response, or a
    response without ``result.location`` all yield ``(None, None)`` instead of
    raising.
    """
    # NOTE(review): API credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    ak = 'C8rQZy1askzzMtdY3ChAZUer1P0PRjI0'
    sk = 'shShi1VLCkH1gGR4v75d2LTnrn2Vm5Mg'
    try:
        query = f'/geocoding/v3/?address={tar}&output=json&ak={ak}&city=大连市'
        # Percent-encode the query but keep URL delimiters intact; the
        # signature is computed over this encoded form.
        query = parse.quote(query, safe="/:=&?#+!$,;'@()*[]")
        sn = hashlib.md5(parse.quote_plus(query + sk).encode('utf-8')).hexdigest()
        url = f'https://api.map.baidu.com{query}&sn={sn}'
        # A timeout keeps one stalled lookup from blocking the caller forever.
        location = requests.get(url, timeout=(5, 5)).json()['result']['location']
        return location['lat'], location['lng']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Only expected failures are swallowed; KeyboardInterrupt/SystemExit
        # and programming errors still propagate.
        return None, None
# Download a page, retrying until the request succeeds.
def _fetch_html(url, headers):
    """Return the body of *url* decoded as UTF-8, retrying on request errors.

    Retries indefinitely, as the original loops did, but only on request or
    decoding failures so that Ctrl+C can still stop the script. It pauses
    briefly between attempts instead of retrying in a tight loop.
    """
    while True:
        try:
            return requests.get(url=url, headers=headers, timeout=(5, 5)).content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError):
            print('again')
            sleep(1)


# Scrape the community listings and write them to an Excel file.
def collect():
    """Scrape dl.ke.com community (xiaoqu) pages into ``贝壳网小区.xlsx``.

    For every URL slug in ``items`` the function walks the configured number
    of listing pages, opens each linked detail page, extracts a few fields,
    geocodes the community name with ``getPosi`` and appends one record per
    community. All records are finally written to an Excel file without the
    index column.

    Reads the module-level ``ua`` for the User-Agent header.

    Raises:
        IndexError: if a listing item has no link, or a detail page lacks the
            title or the expected number of info fields.
    """
    # URL slug -> number of listing pages to walk (presumably one slug per
    # district - TODO confirm).
    items = {
        'ganjingzi': 22,
        'zhongshan': 19,
        'shahekou': 14,
        'xigang': 12
    }
    resLs = []
    for key, page_count in items.items():
        # Listing pages are numbered from 1.
        for page in range(1, page_count + 1):
            url = f'https://dl.ke.com/xiaoqu/{key}/pg{page}/'
            headers = {
                'User-Agent': ua,
                'Referer': url
            }
            listing = etree.HTML(_fetch_html(url, headers))
            for li in listing.xpath('//ul[@class="listContent"]/li'):
                href = li.xpath('./a/@href')[0]
                detail = etree.HTML(_fetch_html(href, headers))
                # Evaluate the info-field XPath once; fields are picked by position.
                info = detail.xpath('//span[@class="xiaoquInfoContent"]/text()')
                dic = {
                    'href': href,
                    'key': key,
                    'name': detail.xpath('//h1/@title')[0],
                    # The price may be absent; fall back to an empty string.
                    'price': (detail.xpath('//span[@class="xiaoquUnitPrice"]/text()') + [''])[0],
                    # Per the original author's notes these are property type,
                    # construction era and number of homes - verify against
                    # the live page layout.
                    'property': info[1].strip(),
                    'building': info[4].strip(),
                    'house': info[5].strip()
                }
                dic['lat'], dic['lng'] = getPosi(dic['name'])
                print(dic)
                resLs.append(dic)
                # Throttle requests. NOTE(review): the source lost its
                # indentation; the pause is assumed to follow each detail
                # page - confirm against the original script.
                sleep(3)
    df = pd.DataFrame(resLs)
    # The `encoding` keyword was removed from DataFrame.to_excel in pandas 2.0
    # (passing it raises TypeError), so it is no longer supplied.
    df.to_excel('贝壳网小区.xlsx', index=False)
# Run the scraper only when the file is executed as a script. The dunder names
# had lost their underscores, so the guard raised NameError.
if __name__ == '__main__':
    # collect() reads this module-level User-Agent string when building its
    # request headers, so it must be set before the call.
    ua = 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/91.0.4472.106Safari/537.36'
    collect()
阅读全文