编写一个高级python爬虫代码
时间: 2024-04-21 21:16:39 浏览: 12
下面是一个基于Python的多线程爬虫示例代码,可以爬取目标网站的数据并保存到本地文件中。这个爬虫可以从响应头自动识别网页编码,通过设置请求头(User-Agent)和随机延时降低被封禁的概率,并支持多线程并发操作,提高爬取效率。注意:它不包含JavaScript渲染,无法抓取纯动态加载的页面内容。
```python
import requests
from bs4 import BeautifulSoup
import re
import threading
import queue
import random
import time
class Spider:
    """A simple multi-threaded breadth-first web crawler.

    Seeds a work queue with ``start_url``, fetches each page in a worker
    thread, and delegates data extraction to :meth:`parse` (to be
    implemented by the user). Collected items are stored in ``self.datas``
    and can be persisted with :meth:`save`.
    """

    def __init__(self, start_url, headers):
        """Initialize crawler state.

        Args:
            start_url: The seed URL where crawling begins.
            headers: HTTP request headers dict (e.g. a ``User-Agent``)
                sent with every request.
        """
        self.start_url = start_url
        self.headers = headers
        self.urls = set()          # URLs already dispatched (dedup set)
        self.failed_urls = set()   # URLs whose fetch raised an exception
        self.datas = []            # parsed results, filled by parse()
        self.encoding = None
        self.crawled = 0           # count of successfully fetched pages
        self.lock = threading.Lock()   # guards crawled counter + log output
        self.queue = queue.Queue()     # pending URLs to fetch
        self._threads = []             # worker threads, joined in start()

    def start(self):
        """Drain the URL queue, spawning one worker thread per new URL.

        Blocks until every spawned worker has finished, so that callers
        (e.g. ``save()``) see the complete result set.
        """
        self.queue.put(self.start_url)
        while not self.queue.empty():
            url = self.queue.get()
            if url not in self.urls:
                self.urls.add(url)
                t = threading.Thread(target=self.crawl, args=(url,))
                t.start()
                self._threads.append(t)
                # Random delay between dispatches to be polite to the
                # target server and reduce the chance of being blocked.
                time.sleep(random.uniform(0.5, 1.0))
        # FIX: wait for all workers; previously start() could return while
        # crawls were still in flight and save() would miss their data.
        for t in self._threads:
            t.join()

    def crawl(self, url):
        """Fetch one URL, decode it, and hand the soup to parse().

        Failures of any kind are recorded in ``self.failed_urls`` rather
        than propagated, so one bad page does not kill the crawl.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=5)
            # Use the charset declared in the response headers when
            # present; None lets requests fall back to its own guess.
            response.encoding = self.get_encoding(response)
            soup = BeautifulSoup(response.text, 'html.parser')
            self.parse(soup)
            # FIX: `with` is exception-safe, unlike manual acquire/release
            # which would leave the lock held if print() raised.
            with self.lock:
                self.crawled += 1
                print('[INFO] Success: %s (%d/%d)' % (url, self.crawled, len(self.urls)))
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; best-effort handling is kept.
            self.failed_urls.add(url)
            with self.lock:
                print('[INFO] Failed: %s (%d/%d)' % (url, self.crawled, len(self.urls)))
        finally:
            self.queue.task_done()

    def get_encoding(self, response):
        """Return the charset declared in the Content-Type header, or None.

        Args:
            response: An object with a ``headers`` mapping (e.g. a
                ``requests.Response``).

        Returns:
            The charset string (e.g. ``'utf-8'``) if the header declares
            one, otherwise ``None``.
        """
        encoding = None
        content_type = response.headers.get('content-type')
        if content_type:
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        return encoding

    def parse(self, soup):
        """Extract data from a parsed page.

        Override/implement this: append extracted items to ``self.datas``.
        Base implementation is intentionally a no-op.
        """
        # TODO: implement the parsing logic and append results to self.datas
        pass

    def save(self, file_path):
        """Write every collected item to ``file_path``, one per line."""
        with open(file_path, 'w', encoding='utf-8') as f:
            for data in self.datas:
                f.write(str(data) + '\n')
if __name__ == '__main__':
    # Seed URL and a browser-like User-Agent so the target server treats
    # the crawler as an ordinary browser request.
    seed_url = 'https://www.example.com'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    # Build the crawler, run it, then persist whatever parse() collected.
    crawler = Spider(seed_url, request_headers)
    crawler.start()
    crawler.save('data.txt')
```
使用时,只需要将起始URL和请求头参数传递给Spider类的构造函数,然后实现parse方法,解析网页数据并将结果保存到self.datas列表中,最后调用save方法将数据保存到本地文件中。可以通过多线程并发操作,提高爬取效率。