Optimize this Python code:
```python
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import os

obj = {}
response = requests.get('http://hbjd.com.cn/article-view-id-20554.html')
dom = BeautifulSoup(response.text, "html.parser")
article = dom.find_all("article")[0]
subdom = BeautifulSoup(str(article), "html.parser")
title = subdom.find_all("h2")[0].txt
obj["title"] = title
p1 = subdom.find_all("p")[0].text
obj["p1"] = p1
p2 = subdom.find_all("p")[1]
subdom2 = BeautifulSoup(str(p2), "html.parser")
p2 = subdom2.find_all("span")[1].text + subdom2.find_all("span")[2].text
obj["p2"] = p2
p3 = subdom.find_all("p")[3].text
obj["p3"] = p3
out = "".join(obj["p1"].split())
out1 = "".join(obj["p2"].split())
out2 = "".join(obj["p3"].split())
print(title)
content = out + "\n" + out1 + "\n" + out2
print(content)
with open("article.txt", "w") as fp:
    if content != None and len(content) > 0:
        fp.write(str(content))
        fp.flush()
subdom3 = BeautifulSoup(str(subdom.find_all("p")[2]), "html.parser")
img = subdom3.find_all("img")[0]
url = "http://http://www.hbjd.com.cn/" + img.attrs['src']
responseImg = requests.get(url)
with open("img1.jpg", "wb") as fp:
    if len(responseImg.content) > 0:
        fp.write(responseImg.content)
        fp.flush()
subdom4 = BeautifulSoup(str(subdom.find_all("p")[4]), "html.parser")
img = subdom4.find_all("img")[0]
url = "http://www.hbjd.com.cn/" + img.attrs['src']
responseImg = requests.get(url)
with open("img2.jpg", "wb") as fp:
    if len(responseImg.content) > 0:
        fp.write(responseImg.content)
        fp.flush()
```
You can consider optimizing the code in the following areas:
1. Reduce network overhead: the current code opens a new connection for the page and for each image; reusing a `requests.Session` keeps the TCP connection alive and avoids the cost of repeatedly establishing connections.
2. Parse the HTML only once: the current code re-parses stringified fragments with BeautifulSoup several times; everything needed can be extracted from a single parsed tree.
3. Simplify the string handling: the repeated split/join used to strip whitespace can be replaced with a regular expression or Python's built-in string methods.
4. Add exception handling: the current code has none, so any network or file error will crash the script; adding it improves robustness (a minimal sketch of guarding the request follows this list).
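The optimized version below only wraps the file write in a try/except; the network request itself can also fail. Here is a minimal sketch of guarding the request with a timeout and a status check (the 10-second timeout is an arbitrary choice, not something from the original code):
```python
import requests

url = 'http://hbjd.com.cn/article-view-id-20554.html'
session = requests.Session()
try:
    # A timeout keeps the script from hanging on a dead connection
    response = session.get(url, timeout=10)
    # raise_for_status() turns HTTP 4xx/5xx responses into exceptions
    response.raise_for_status()
except requests.RequestException as e:
    print('Request failed:', e)
    raise SystemExit(1)
```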
Here is the optimized code:
```python
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
url = 'http://hbjd.com.cn/article-view-id-20554.html'
# Reuse the TCP connection across requests with a session
session = requests.Session()
response = session.get(url)
response.encoding = 'utf-8'
dom = BeautifulSoup(response.text, 'html.parser')
article = dom.find('article')
# Extract all the needed information from a single parsed tree
title = article.find('h2').text
paragraphs = article.find_all('p')
p1 = paragraphs[0].text
# Keep the original logic for the second paragraph: concatenate the
# text of its second and third <span> elements
spans = paragraphs[1].find_all('span')
p2 = spans[1].text + spans[2].text
p3 = paragraphs[3].text
# Strip all whitespace with a regular expression
out = re.sub(r'\s+', '', p1)
out1 = re.sub(r'\s+', '', p2)
out2 = re.sub(r'\s+', '', p3)
print(title)
content = out + '\n' + out1 + '\n' + out2
print(content)
# Wrap the file write in exception handling
try:
    with open('article.txt', 'w', encoding='utf-8') as fp:
        if content:
            fp.write(content)
except Exception as e:
    print('Failed to write to file:', e)

# Download the images referenced in the article
for i, img in enumerate(article.find_all('img'), 1):
    img_url = 'http://www.hbjd.com.cn/' + img.attrs['src']
    responseImg = session.get(img_url)
    if responseImg.ok:
        with open(f'img{i}.jpg', 'wb') as fp:
            fp.write(responseImg.content)
```
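One further refinement: building the image URLs by string concatenation assumes every `src` attribute is a site-relative path, and the original code even produced a malformed `http://http://...` prefix for the first image. If the page might mix relative and absolute `src` values, `urllib.parse.urljoin` handles both. The sketch below reuses the `article` and `session` objects from the code above and assumes a timeout is desirable:
```python
from urllib.parse import urljoin

base = 'http://www.hbjd.com.cn/'
for i, img in enumerate(article.find_all('img'), 1):
    # urljoin resolves relative paths against the base and leaves
    # absolute URLs untouched, so both kinds of src values work
    img_url = urljoin(base, img.attrs['src'])
    responseImg = session.get(img_url, timeout=10)
    if responseImg.ok:
        with open(f'img{i}.jpg', 'wb') as fp:
            fp.write(responseImg.content)
```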