Scrapy 更换代理 IP
时间: 2023-06-20 18:04:47 浏览: 117
Scrapy 可以通过自定义下载器中间件(downloader middleware)来实现代理 IP 的更换。以下是一个示例:
1. 创建一个代理中间件
```python
import random
from scrapy import signals
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
from scrapy.exceptions import NotConfigured
from urllib.parse import urlparse
class ProxyMiddleware:
    """Downloader middleware that sends each request through a random proxy.

    The proxy pool is read from the ``PROXIES`` setting. A proxy that yields
    a non-200 response or a download exception is removed from the pool and
    the request is re-issued through one of the remaining proxies.
    """

    def __init__(self, settings):
        """Load the proxy pool from the crawler settings.

        Args:
            settings: Settings object; ``PROXIES`` is read with ``getlist``.

        Raises:
            NotConfigured: If ``PROXIES`` is missing or empty.
        """
        self.proxies = settings.getlist('PROXIES')
        if not self.proxies:
            raise NotConfigured

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def process_request(self, request, spider):
        """Attach a random proxy, a 30 s timeout and a fixed User-Agent.

        Returns:
            None, so the request continues through the middleware chain.

        Raises:
            IgnoreRequest: If every proxy has already been discarded.
        """
        if not self.proxies:
            # _retry() may have emptied the pool; random.choice() on an empty
            # list would raise IndexError, so drop the request explicitly.
            from scrapy.exceptions import IgnoreRequest
            spider.logger.warning(
                'No proxies left, dropping request <%s>', request.url)
            raise IgnoreRequest('proxy pool exhausted')
        request.meta['proxy'] = random.choice(self.proxies)
        request.meta['download_timeout'] = 30
        request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'

    def process_response(self, request, response, spider):
        """Retry through another proxy when the status is not 200.

        Returns:
            The response itself for status 200, or when no proxy is left to
            retry with; otherwise a new request to be rescheduled.
        """
        if response.status == 200:
            return response
        reason = response_status_message(response.status)
        retry_request = self._retry(request, reason, spider)
        return response if retry_request is None else retry_request

    def process_exception(self, request, exception, spider):
        """Retry through another proxy after a download exception.

        Returns:
            A new request to reschedule, or None when no proxy is left.
        """
        return self._retry(request, str(exception), spider)

    def _retry(self, request, reason, spider):
        """Discard the proxy used by ``request`` and build a retry request.

        Args:
            request: The request that failed.
            reason: Failure description; accepted for the callers' sake but
                not used here.
            spider: Spider whose logger receives the debug message.

        Returns:
            A copy of ``request`` bound to another proxy, or None when the
            pool is empty.
        """
        failed_proxy = request.meta.get('proxy')
        # The proxy may already have been removed by another failed request
        # (or may never have come from the pool); an unconditional remove()
        # would raise ValueError in that case.
        if failed_proxy in self.proxies:
            self.proxies.remove(failed_proxy)
        if not self.proxies:
            return None
        proxy = random.choice(self.proxies)
        spider.logger.debug(
            'Using proxy <%s>, %d proxies left', proxy, len(self.proxies))
        # dont_filter=True keeps the duplicate filter from discarding the
        # retry, since this URL has already been seen by the scheduler.
        return request.replace(
            meta=dict(request.meta, proxy=proxy), dont_filter=True)
```
2. 在settings.py中启用代理中间件
```python
# Enable the proxy middleware; the integer is its order value among the
# downloader middlewares.
DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.ProxyMiddleware": 543}
```
3. 在settings.py中设置代理IP列表
```python
# Proxy pool read by ProxyMiddleware: three local proxies on consecutive ports.
PROXIES = ["http://127.0.0.1:%d" % port for port in (8888, 8889, 8890)]
```
其中，`PROXIES` 是一个代理 IP 列表，中间件在每次请求时会从中随机选择一个代理。如果请求失败（下载出现异常，或响应状态码不是 200），中间件会把当前代理从列表中移除，并换用其余代理重试。
阅读全文