python爬虫实现获取下一页代码
我们首先来看下实例代码:
from time import sleep
import faker
import requests
from lxml import etree
# Faker instance used to generate a User-Agent string for outgoing requests.
fake = faker.Faker()
# Root of the target site; scraped "next page" hrefs are resolved against it.
base_url = "http://angelimg.spbeen.com"
def get_next_link(url):
    """Return the absolute URL of the page that follows ``url``.

    Downloads ``url``, looks for the ``<a class="ch next">`` link and resolves
    its ``href`` against the module-level ``base_url``.

    Args:
        url: Address of the page currently being crawled.

    Returns:
        The next page's URL as a string, or ``False`` when the page contains
        no "next" link.
    """
    # Function-scope import, the same way this file imports webbrowser.
    from urllib.parse import urljoin

    content = downloadHtml(url)
    html = etree.HTML(content)
    next_url = html.xpath("//a[@class='ch next']/@href")
    if not next_url:
        return False
    # For root-relative hrefs this equals base_url + href; unlike plain
    # concatenation it also handles absolute and slash-less hrefs.
    return urljoin(base_url, next_url[0])
def downloadHtml(ur):
    """Fetch a page and return its body as text.

    Args:
        ur: URL of the page to download.

    Returns:
        The response body as ``str`` (``response.text``).
    """
    # Each request carries a freshly generated User-Agent and the site's own
    # address as Referer.
    headers = {
        'User-Agent': fake.user_agent(),
        "Referer": "http://angelimg.spbeen.com/",
    }
    # Request the URL that was passed in rather than any module-level ``url``.
    response = requests.get(ur, headers=headers)
    return response.text
def getImgUrl(content):
    """Extract the image URL and the title from a page's HTML.

    Args:
        content: HTML source of the page.

    Returns:
        A ``(img_url, title)`` tuple. An element is ``None`` when its XPath
        query matches nothing, so callers can test for ``None`` instead of
        catching ``IndexError``.
    """
    html = etree.HTML(content)
    img_url = html.xpath('//*[@id="content"]/a/img/@src')
    # NOTE(review): the predicate ['@class=article'] is a quoted string, which
    # XPath evaluates as always true, so this selects the <h2> text of *any*
    # <div>. It was presumably meant to be [@class='article'] -- confirm
    # against the live page before tightening it.
    title = html.xpath(".//div['@class=article']/h2/text()")
    return (
        img_url[0] if img_url else None,
        title[0] if title else None,
    )
def saveImg(title,img_url):
    """Download ``img_url`` and store it as ``txt/<title>.jpg``.

    Args:
        title: Name for the saved file (without extension); converted with
            ``str()``. Path separators in it are replaced by ``_``.
        img_url: Address of the image to download.

    Returns:
        ``None``. Nothing is downloaded or written when either argument is
        ``None``.
    """
    if img_url is None or title is None:
        return

    # Function-scope import, the same way this file imports webbrowser.
    import os

    headers = {
        'User-Agent': fake.user_agent(),
        "Referer": "http://angelimg.spbeen.com/",
    }
    # Download before opening the file so that a failed request does not
    # leave an empty .jpg behind.
    response = requests.get(img_url, headers=headers)
    # request_view(response)  # debugging aid: show the response in a browser

    # The title is scraped from a web page; keep path separators out of the
    # file name so it cannot point outside the output directory.
    safe_title = str(title).replace("/", "_").replace("\\", "_")
    os.makedirs("txt", exist_ok=True)
    with open("txt/" + safe_title + ".jpg", 'wb') as f:
        f.write(response.content)
def request_view(response):
    """Open a downloaded response in the web browser (debugging aid).

    Inserts a ``<base>`` tag carrying the response's URL directly after every
    ``<head>`` in the body, writes the result to ``tmp.html`` in the current
    working directory and opens that file in a new browser tab.

    Args:
        response: Object exposing ``url`` (str) and ``content`` (bytes), such
            as the value returned by ``requests.get``.
    """
    import webbrowser

    # Named base_tag so it is not confused with the module-level base_url.
    base_tag = '<head><base href="%s" rel="external nofollow" >' %(response.url)
    content = response.content.replace(b"<head>", base_tag.encode())
    # The context manager closes the file even if the write fails.
    with open('tmp.html', 'wb') as tmp_html:
        tmp_html.write(content)
    webbrowser.open_new_tab('tmp.html')
def crawl_img(url):
    """Download the page at ``url`` and save the image it refers to.

    Args:
        url: Address of the page to process.
    """
    content = downloadHtml(url)
    # getImgUrl returns the pair in (img_url, title) order.
    img_url, title = getImgUrl(content)
    saveImg(title, img_url)
if __name__ == "__main__":
    # Page the crawl starts from. ``url`` is bound at module level here.
    url = "http://angelimg.spbeen.com/ang/4968/1"
    # Keep going until get_next_link returns a falsy value (no "next" link).
    while url:
        print(url)
        crawl_img(url)
        url = get_next_link(url)
评论0