import requests from lxml import etree # headers = { # 'User-Agent': # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36' # } urls = ['https://baike.pcbaby.com.cn/qzbd/'] path = r'F:\资料' def get_text(url): r = requests.get(url) r.encoding = 'utf-8' selector = etree.HTML(r.text) title = selector.xpath('/html/body/div[7]/div[1]/div[1]/div[1]/p/text()') # title2 = selector.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/div/p') print(path + title[0]) # with open(path + title[0] + title2[0],'w',encoding='utf-8')as f: # time.sleep(10) if __name__ == '__main__': for url in urls: get_text(url) 优化
时间: 2024-04-13 13:25:05 浏览: 151
from lxml import etree
你想对这段代码进行优化是吗?那我来给你一些建议。
首先,你可以将请求和解析的代码封装成一个函数,这样可以提高代码的复用性和可读性。例如:
```python
def get_page(url, timeout=10):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a server that never answers would block the
            crawl indefinitely.

    Returns:
        The object returned by ``etree.HTML`` for the response text; callers
        run XPath queries against it.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # Decode the body as UTF-8 explicitly rather than using whatever
    # encoding requests would otherwise pick for this response.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)
def get_text(selector, xpath):
    """Return the first result of evaluating ``xpath`` on ``selector``.

    Args:
        selector: Any object exposing an ``xpath(expression)`` method, such
            as the tree returned by ``get_page``.
        xpath: XPath expression to evaluate.

    Returns:
        The first item of the XPath result, or ``None`` when the result is
        empty (so callers can test the return value instead of catching
        ``IndexError``).
    """
    texts = selector.xpath(xpath)
    if texts:
        return texts[0]
    return None
if __name__ == '__main__':
    import os

    # Fetch every configured page and print where its title would be saved.
    for url in urls:
        selector = get_page(url)
        title = get_text(selector, '/html/body/div[7]/div[1]/div[1]/div[1]/p/text()')
        if title:
            # NOTE(review): `path` is assumed to be an output directory, so it
            # is joined with a separator; plain `path + title` would glue the
            # title onto the directory name itself.
            print(os.path.join(path, title))
```
其次,你可以将要请求的 URL 和对应的 xpath 放到一个字典中,这样可以方便管理和扩展。例如:
```python
# Maps each page URL to the XPath expression that selects its title text,
# so supporting a new page only requires adding an entry here.
urls = {
    'https://baike.pcbaby.com.cn/qzbd/': '/html/body/div[7]/div[1]/div[1]/div[1]/p/text()',
}

if __name__ == '__main__':
    import os

    for url, xpath in urls.items():
        selector = get_page(url)
        title = get_text(selector, xpath)
        if title:
            # NOTE(review): `path` is assumed to be an output directory, so it
            # is joined with a separator; plain `path + title` would glue the
            # title onto the directory name itself.
            print(os.path.join(path, title))
```
另外,当需要爬取的 URL 较多时,可以考虑使用多线程(例如标准库的 `concurrent.futures.ThreadPoolExecutor`)或异步请求来提高爬取的效率;如果只有一个 URL,则没有必要。
以上是一些优化的建议,希望对你有所帮助。如果还有其他问题,请随时提问。
阅读全文