def parse_html(html): try: soup = BeautifulSoup(html, 'html.parser') title = soup.select_one('h3.tb-main-title[data-title]') return title except Exception as e: print("解析异常:", e) return None # 测试接口: url = 'https://item.taobao.com/item.htm?id=699089474285&ali_refid=a3_430673_1006:1102522391:N:%2BAkz14GC%2BXi%2Bma9dw7y9M4p5Yhq5iT8I:f9e826228fbc823934848d20e28dff81&ali_trackid=162_f9e826228fbc823934848d20e28dff81&spm=a2e0b.20350158.31919782.5' resp_text = get_with_cookie(url) if resp_text is not None: title = parse_html(resp_text) if title is not None: # 打印商品信息 print(title.text.strip()) else: print("解析失败") else: print("请求失败")解析失败。代码写的没有问题
时间: 2024-01-01 11:06:13 浏览: 34
可能是因为您使用的是 BeautifulSoup 库的 'html.parser' 解析器,在某些情况下可能无法正确解析特定的 HTML 标记或结构。您可以尝试使用其他解析器,例如 lxml 或 html5lib,看看是否能够解决问题。另外,建议您在捕获异常时打印出完整的错误信息,以便更好地排查问题。
相关问题
import requests from bs4 import BeautifulSoup session = requests.Session() cookie = 'your_cookie_here' session.headers.update({'cookie': cookie}) def get_with_cookie(url): try: resp = session.get(url) return resp.text except requests.exceptions.RequestException as e: print("请求异常:", e) return None # 解析HTML内容,提取商品名称、价格和销量信息 def parse_html(html): try: soup = BeautifulSoup(html, 'html.parser') title = soup.select_one('h3.tb-main-title[data-title]') return title except Exception as e: print("解析异常:", e) return None # 测试接口: url = 'https://item.taobao.com/item.htm?id=699089474285&ali_refid=a3_430673_1006:1102522391:N:%2BAkz14GC%2BXi%2Bma9dw7y9M4p5Yhq5iT8I:f9e826228fbc823934848d20e28dff81&ali_trackid=162_f9e826228fbc823934848d20e28dff81&spm=a2e0b.20350158.31919782.5' resp_text = get_with_cookie(url) if resp_text is not None: title = parse_html(resp_text) if title is not None: # 打印商品信息 print(title.text.strip()) else: print("解析失败") else: print("请求失败")用其他方式帮我解析对应位置的数据
可以使用正则表达式或者XPath来解析HTML内容,以下是两种方法的示例:
1. 使用正则表达式解析商品名称
```python
import re
def parse_title(html):
try:
pattern = r'"title":"(.*?)"' # 正则表达式模式
title = re.search(pattern, html).group(1) # 查找匹配的字符串并提取
return title
except Exception as e:
print("解析异常:", e)
return None
# 测试接口:
url = 'https://item.taobao.com/item.htm?id=699089474285&ali_refid=a3_430673_1006:1102522391:N:%2BAkz14GC%2BXi%2Bma9dw7y9M4p5Yhq5iT8I:f9e826228fbc823934848d20e28dff81&ali_trackid=162_f9e826228fbc823934848d20e28dff81&spm=a2e0b.20350158.31919782.5'
resp_text = get_with_cookie(url)
if resp_text is not None:
title = parse_title(resp_text)
if title is not None:
# 打印商品信息
print(title)
else:
print("解析失败")
else:
print("请求失败")
```
2. 使用XPath解析商品名称
```python
from lxml import etree
def parse_title(html):
try:
root = etree.HTML(html)
title = root.xpath('//h3[@class="tb-main-title"]/text()')[0] # 使用XPath定位节点并提取文本
return title
except Exception as e:
print("解析异常:", e)
return None
# 测试接口:
url = 'https://item.taobao.com/item.htm?id=699089474285&ali_refid=a3_430673_1006:1102522391:N:%2BAkz14GC%2BXi%2Bma9dw7y9M4p5Yhq5iT8I:f9e826228fbc823934848d20e28dff81&ali_trackid=162_f9e826228fbc823934848d20e28dff81&spm=a2e0b.20350158.31919782.5'
resp_text = get_with_cookie(url)
if resp_text is not None:
title = parse_title(resp_text)
if title is not None:
# 打印商品信息
print(title.strip())
else:
print("解析失败")
else:
print("请求失败")
```
import requests from bs4 import BeautifulSoup def getHTMLText(url): try: r = requests.get(url,timeout=30) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except: return "产生异常" def jiexi(html_doc): soup = BeautifulSoup(html_doc,"html.parser") links = soup.find_all('a') for link in links: print(link.name,link["nref"],link.get.next()) if __name__ == 'main': url = "https://wallhaven.cc/toplist?page=7" print(getHTMLText(url))改进代码
这段代码可以改进的地方有:
1. 函数名jiexi不太直观,可以改为更明确的parse_links。
2. 在解析链接时,可以加上判断,如果链接是以http或https开头的,才输出链接信息,否则忽略。
3. 在主程序中,应该调用parse_links函数,而不是直接打印HTML页面内容。
改进后的代码如下:
```
import requests
from bs4 import BeautifulSoup
def get_html_text(url):
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return "产生异常"
def parse_links(html_doc):
soup = BeautifulSoup(html_doc, "html.parser")
links = soup.find_all('a')
for link in links:
if link.has_attr('href') and (link['href'].startswith('http') or link['href'].startswith('https')):
print(link.name, link['href'], link.get_text())
if __name__ == '__main__':
url = "https://wallhaven.cc/toplist?page=7"
html_text = get_html_text(url)
parse_links(html_text)
```
这样修改后,代码更加清晰易懂,功能也更加完善。