用 Python 实现深度优先网页爬取
时间: 2023-10-11 21:14:33 浏览: 62
使用深度优先算法爬取网页的Python代码如下:
```python
from bs4 import BeautifulSoup
import urllib.request
class Stack:
    """Minimal LIFO stack backed by a Python list.

    The crawler uses it to hold URLs that still have to be visited; taking
    the most recently pushed URL first is what makes the traversal
    depth-first.
    """

    def __init__(self):
        # The top of the stack is the end of the list, so both push and pop
        # are plain list.append / list.pop calls.
        self.st = []

    def push(self, obj):
        """Place *obj* on top of the stack."""
        self.st.append(obj)

    def pop(self):
        """Remove and return the item on top of the stack.

        Raises:
            IndexError: If the stack is empty.
        """
        return self.st.pop()

    def empty(self):
        """Return True when the stack holds no items, False otherwise."""
        return not self.st
def visit(url):
    """Fetch *url* once, print its first ``<h3>`` heading, return its links.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The ``<a>`` elements BeautifulSoup selects from the page, or an
        empty list when the URL was already visited or the page could not
        be fetched or parsed. The result is always iterable.
    """
    # `urls` is the module-level list of visited URLs. It is only mutated
    # here (never rebound), so no `global` declaration is needed.
    if url in urls:
        return []
    # Record the URL before fetching so a failing page is not retried.
    urls.append(url)
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            raw = response.read()
            # Honour the charset declared in the Content-Type header and
            # fall back to UTF-8 when the server does not declare one.
            charset = response.headers.get_content_charset() or 'utf-8'
        soup = BeautifulSoup(raw.decode(charset), 'lxml')
        heading = soup.find('h3')
        # A page without an <h3> must still contribute its links.
        if heading is not None:
            print(heading.text)
        return soup.select('a')
    except Exception as err:
        # Best-effort crawl: report the failure and hand back an empty list
        # so the caller's loop over the result keeps working.
        print(err)
        return []
# URLs already handed to visit(); visit() checks membership here and appends
# each new URL so that no page is fetched twice.
urls = list()
# Address of the page the crawl starts from (passed to spider() at the
# bottom of the script).
start_url = "http://127.0.0.1:5000/"
def spider(url):
    """Crawl pages depth-first starting at *url*, staying on its host.

    Each page is handed to ``visit``; every link found on it is resolved
    against the page's own URL and pushed on a stack, so the most recently
    discovered link is followed next.

    Args:
        url: Absolute URL of the page the crawl starts from.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin, urlparse

    root_host = urlparse(url).netloc
    pending = Stack()
    # Start from the argument, not from a module-level constant.
    pending.push(url)
    while not pending.empty():
        current = pending.pop()
        # `or []` keeps the loop safe if visit() reports a failure as None.
        for link in visit(current) or []:
            href = link.get('href')
            if not href:
                # <a> element without a target: nothing to follow.
                continue
            # Resolve against the page the link appeared on so relative,
            # root-relative and absolute hrefs all produce a valid URL.
            target = urljoin(current, href)
            # Do not leave the site the crawl started on; this also drops
            # mailto:/javascript: links, whose netloc is empty.
            if urlparse(target).netloc != root_host:
                continue
            pending.push(target)
# Script entry: crawl from the configured start page, then report completion.
# These statements sit at module level, so they also run when the file is
# imported.
spider(start_url)
print('the end')
```
阅读全文