python请写一个复杂的爬虫用于获取图片和视频
时间: 2024-10-14 07:17:57 浏览: 51
Python 是一种非常适合编写爬虫的高级编程语言，因为它拥有 `requests`、`BeautifulSoup`、`Scrapy` 等强大的库，可以处理网络请求并解析 HTML。下面给出一个基本框架，展示如何使用 Python 编写一个简单的爬虫来抓取网页上的图片和视频链接。需要注意：该示例仅用于学习和测试目的，使用时请尊重目标网站的 robots.txt 协议并遵守相关法律法规。
```python
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
# Target site and the directory that downloaded media is stored under.
url = 'https://example.com'  # Replace with the site you want to crawl
output_dir = 'images_and_videos'
# exist_ok=True avoids the check-then-create race of
# "if not os.path.exists(...): os.makedirs(...)".
os.makedirs(output_dir, exist_ok=True)
def download_media(url, path):
    """Download the resource at ``url`` into the directory ``path``.

    The file name is taken from the URL path (query string and fragment are
    ignored). When that name has no extension, one is appended based on the
    response's Content-Type header, if the type is recognised.

    Failures are reported on stdout rather than raised, so one bad link does
    not abort a whole crawl.

    Args:
        url: Absolute URL of the image or video to fetch.
        path: Directory to save into; created if it does not exist.
    """
    from urllib.parse import urlsplit

    try:
        # The caller passes sub-directories that nothing else creates.
        os.makedirs(path, exist_ok=True)

        # The context manager releases the streamed connection; the timeout
        # keeps a stalled server from hanging the crawl forever.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}, got status code {response.status_code}")
                return

            # .get() tolerates responses that carry no Content-Type header.
            media_type = guess_content_type(response.headers.get('content-type', ''))

            # Use only the URL path so "?w=100" etc. never reaches the file name.
            filename = os.path.basename(urlsplit(url).path) or 'download'
            has_extension = bool(os.path.splitext(filename)[1])
            if not has_extension and media_type != 'unknown':
                filename = f"{filename}.{media_type}"
            save_path = os.path.join(path, filename)

            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)

        print(f"Media saved at {save_path}")
    except Exception as e:
        # Deliberate best-effort boundary: report and move on to the next item.
        print(f"Error downloading {url}: {e}")
def guess_content_type(header):
    """Map a Content-Type header value to a file extension.

    Parameters after ``;`` (such as ``charset``) are ignored and the MIME type
    is matched case-insensitively.

    Args:
        header: Raw Content-Type header value; may be ``None`` or empty.

    Returns:
        The extension without a leading dot (e.g. ``'jpg'``), or ``'unknown'``
        when the header is missing or the type is not in the table.
    """
    content_types = {
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/gif': 'gif',
        'image/webp': 'webp',
        'image/bmp': 'bmp',
        'image/svg+xml': 'svg',
        'video/mp4': 'mp4',
        'video/webm': 'webm',
        'video/ogg': 'ogv',
        'video/quicktime': 'mov',
        # Add more formats here as needed.
    }
    if not header:
        return 'unknown'
    mime_type = header.split(';', 1)[0].strip().lower()
    return content_types.get(mime_type, 'unknown')
def main():
    """Fetch the page at the module-level ``url`` and download its media.

    Every ``<img>`` with a ``src`` is saved under ``<output_dir>/images``.
    Every ``<video>`` element, and every ``<iframe>`` whose ``src`` contains
    "video", is saved under ``<output_dir>/videos``. For a ``<video>`` without
    its own ``src``, the first nested ``<source src=...>`` is used. Relative
    links are resolved against ``url`` before downloading.
    """
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Images: skip tags without a src (e.g. lazy-loaded placeholders) instead
    # of raising KeyError, and resolve relative paths against the page URL.
    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if not img_src:
            continue
        download_media(urljoin(url, img_src), output_dir + '/images')

    # Videos: the selector may need adjusting to the target site's structure.
    video_tags = soup.select('video, iframe[src*="video"]')
    for video_tag in video_tags:
        # NOTE: "'src' in tag" tests the tag's children, not its attributes,
        # so attributes are read with .get() here.
        video_src = video_tag.get('src')
        if not video_src:
            for source_tag in video_tag.find_all('source'):
                video_src = source_tag.get('src')
                if video_src:
                    break
        if not video_src:
            continue
        download_media(urljoin(url, video_src), output_dir + '/videos')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
```
阅读全文