import requests import os from bs4 import BeautifulSoup cookie = os.getenv('MY_COOKIE') # 保存到环境变量中 def get_with_cookie(url): headers = { 'cookie': cookie, 'pragma': 'no-cache', 'referer': 'https://uland.taobao.com/sem/tbsearch?refpid=mm_26632258_3504122_32538762&keyword=%E7%88%B1%E6%B7%98%E5%AE%9D%E4%B9%B0%E4%B8%9C%E8%A5%BF&clk1=066356e5e429004e6730976351cc1afc&upsId=066356e5e429004e6730976351cc1afc', 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': "Windows", 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'same-origin', 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50' } try: resp = requests.get(url=url, headers=headers) return resp.text except requests.exceptions.RequestException as e: print("请求异常:", e) return None # 解析HTML内容,提取商品名称、价格和销量信息 def parse_html(html): try: soup = BeautifulSoup(html, 'html.parser') title = soup.select_one('h3.tb-main-title[data-title]') return title except Exception as e: print("解析异常:", e) return None # 测试接口: url = 'https://item.taobao.com/item.htm?id=699089474285&ali_refid=a3_430673_1006:1102522391:N:%2BAkz14GC%2BXi%2Bma9dw7y9M4p5Yhq5iT8I:f9e826228fbc823934848d20e28dff81&ali_trackid=162_f9e826228fbc823934848d20e28dff81&spm=a2e0b.20350158.31919782.5' resp_text = get_with_cookie(url) if resp_text is not None: title = parse_html(resp_text) if title is not None: # 打印商品信息 print(title.text.strip()) else: print("解析失败") else: print("请求失败")代码有什么问题?
时间: 2023-11-29 12:05:37 浏览: 47
代码在第一行的 import 缺少了空格,应该写成:
```
import requests
import os
from bs4 import BeautifulSoup
```
此外,代码中使用了环境变量 `MY_COOKIE`,需要先在系统中设置这个环境变量。另外,代码中只提取了商品名称,如果还需要提取价格、销量等信息,需要在 `parse_html` 函数中添加相应的代码。
相关问题
from bs4 import BeautifulSoup # 创建一个BeautifulSoup对象 soup = BeautifulSoup(html_doc, 'html.parser')中html_doc怎么获得
在使用BeautifulSoup库之前,你需要先获取到HTML文档的内容。HTML文档可以通过多种方式获得,比如从网络上下载、从本地文件读取或者通过API获取。
以下是几种常见的获取HTML文档的方式:
1. 从网络上下载:你可以使用Python的requests库发送HTTP请求,获取网页的HTML内容。例如:
```python
import requests
response = requests.get(url)
html_doc = response.text
```
其中,`url`是你要获取的网页的URL地址。
2. 从本地文件读取:如果你已经将HTML文档保存在本地文件中,可以使用Python的文件操作来读取文件内容。例如:
```python
with open('file.html', 'r') as file:
html_doc = file.read()
```
其中,`file.html`是你保存的HTML文件的路径。
3. 通过API获取:如果你使用的是某个网站的API,可以直接调用API接口获取HTML内容。具体的方法和参数取决于API的设计和文档。
无论你使用哪种方式获取HTML文档,一旦获得了`html_doc`变量,就可以将其传递给BeautifulSoup对象进行解析和操作了。
帮我把一下代码设置一个合理请求头,并加入一个延时import requests import os from bs4 import BeautifulSoup class NovelDownloader: def __init__(self, root_url): self.root_url = root_url self.book_list = [] self.chapter_list = [] def get_url(self, url): while True: try: res = requests.get(url) if res.status_code == 200: print("页面获取成功!") return res.text else: print("页面返回异常!", res.status_code) except: print("页面获取错误!") def get_book_list(self): res = self.get_url(self.root_url) html = BeautifulSoup(res, "html.parser") a_list = html.find_all("a", {"class": "name"}) for a in a_list: self.book_list.append(a["href"]) self.book_list = [self.root_url + i for i in self.book_list] self.book_list.remove('http://www.biquge5200.cc/') def get_chapter_list(self, url): res = self.get_url(url) html = BeautifulSoup(res, "html.parser") a_list = html.find_all("a", {"class": "chapter"}) for a in a_list: self.chapter_list.append((a["href"], a.text.replace("\n", ""))) def get_content(self, chapter): url = self.root_url + chapter[0] print(url) book_name = chapter[0].split("/")[1] print(book_name) if not os.path.exists(book_name): os.mkdir(book_name) res = self.get_url(url) html = BeautifulSoup(res, "html.parser") content = html.find("div", {"id": "content"}).text print(content) path = os.path.join(book_name, chapter[1]) with open(path, "w", encoding="utf8") as f: f.write(content) def main(self): self.get_book_list() for book in self.book_list: self.get_chapter_list(book) for chapter in self.chapter_list: self.get_content(chapter) if __name__ == '__main__': root_url = "http://www.biquge5200.cc/" nd = NovelDownloader(root_url) nd.main()
好的,我可以帮你添加一个合理的请求头和延时。在 `get_url` 方法中,你可以添加一个请求头字典,来模拟浏览器发送请求。同时,在每次请求后添加一个时间延时,避免频繁请求被网站屏蔽。
修改后的代码如下所示:
```python
import requests
import os
import time
from bs4 import BeautifulSoup
class NovelDownloader:
def __init__(self, root_url):
self.root_url = root_url
self.book_list = []
self.chapter_list = []
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
} # 添加请求头
def get_url(self, url):
while True:
try:
res = requests.get(url, headers=self.headers)
if res.status_code == 200:
print("页面获取成功!")
return res.text
else:
print("页面返回异常!", res.status_code)
except:
print("页面获取错误!")
time.sleep(1) # 添加延时
def get_book_list(self):
res = self.get_url(self.root_url)
html = BeautifulSoup(res, "html.parser")
a_list = html.find_all("a", {"class": "name"})
for a in a_list:
self.book_list.append(a["href"])
self.book_list = [self.root_url + i for i in self.book_list]
self.book_list.remove('http://www.biquge5200.cc/')
def get_chapter_list(self, url):
res = self.get_url(url)
html = BeautifulSoup(res, "html.parser")
a_list = html.find_all("a", {"class": "chapter"})
for a in a_list:
self.chapter_list.append((a["href"], a.text.replace("\n", "")))
def get_content(self, chapter):
url = self.root_url + chapter[0]
print(url)
book_name = chapter[0].split("/")[1]
print(book_name)
if not os.path.exists(book_name):
os.mkdir(book_name)
res = self.get_url(url)
html = BeautifulSoup(res, "html.parser")
content = html.find("div", {"id": "content"}).text
print(content)
path = os.path.join(book_name, chapter[1])
with open(path, "w", encoding="utf8") as f:
f.write(content)
def main(self):
self.get_book_list()
for book in self.book_list:
self.get_chapter_list(book)
for chapter in self.chapter_list:
self.get_content(chapter)
if __name__ == '__main__':
root_url = "http://www.biquge5200.cc/"
nd = NovelDownloader(root_url)
nd.main()
```
这样就可以更好地模拟浏览器发送请求,避免被网站屏蔽。另外,可以根据需要调整延时时间,以达到更好的效果。
相关推荐
![pdf](https://img-home.csdnimg.cn/images/20210720083512.png)
![py](https://img-home.csdnimg.cn/images/20210720083646.png)
![pdf](https://img-home.csdnimg.cn/images/20210720083512.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)