transformer+cv
时间: 2023-11-21 14:58:43 浏览: 96
Transformer是一种强大的神经网络结构，最初被用于自然语言处理任务，但现在也被广泛应用于计算机视觉任务中。Transformer的一个主要优点是其自注意力机制可以直接建模序列中任意两个位置之间的依赖关系，并且可以处理变长的序列数据，这使得它在处理可以被切分为序列的图像、视频等视觉数据时非常有用。
在计算机视觉中，Transformer通常被用于图像分类、目标检测、图像分割等任务。其中，最常见的应用是在图像分类中使用Transformer来替代传统的卷积神经网络。这种方法被称为Vision Transformer（ViT）：它将图像切分成一系列固定大小的图块（patch），把每个图块线性映射为一个向量并加上位置编码，然后将这些向量作为输入序列传递给Transformer模型。
以下是使用PyTorch实现Vision Transformer进行图像分类的示例代码:
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# Define the ViT model
class ViT(nn.Module):
    """Minimal Vision Transformer (ViT) image classifier.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches by a strided convolution, a learnable class token is prepended,
    learnable positional embeddings are added, and the resulting token
    sequence is run through a Transformer encoder. The encoder output at the
    class-token position is projected to ``num_classes`` logits.

    Args:
        image_size: Height and width of the (square) input images.
        patch_size: Side length of each square patch; must divide
            ``image_size``.
        num_classes: Number of output logits.
        dim: Token embedding width (``d_model`` of the encoder).
        depth: Number of encoder layers.
        heads: Number of attention heads per layer.
        mlp_dim: Hidden width of each layer's feed-forward sub-network.
        channels: Number of input image channels (defaults to 3).

    Raises:
        ValueError: If ``image_size`` is not divisible by ``patch_size``.
    """

    def __init__(self, image_size, patch_size, num_classes, dim, depth, heads,
                 mlp_dim, channels=3):
        super().__init__()
        # Raise instead of assert: asserts are stripped under ``python -O``.
        if image_size % patch_size != 0:
            raise ValueError("image size must be divisible by patch size")
        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size

        # One extra learnable token whose encoder output summarizes the image.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        # One positional vector per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))

        # A conv with kernel == stride == patch_size embeds each patch
        # independently; Flatten merges the spatial grid into one axis.
        self.patch_embedding = nn.Sequential(
            nn.Conv2d(channels, dim, kernel_size=patch_size, stride=patch_size),
            nn.Flatten(start_dim=2),
        )
        # batch_first=True is required because forward() feeds tensors laid
        # out as (batch, tokens, dim); with the default (False) the encoder
        # would attend across the batch axis instead of across tokens.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=dim,
                nhead=heads,
                dim_feedforward=mlp_dim,
                batch_first=True,
            ),
            num_layers=depth,
        )
        self.fc = nn.Linear(dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` is a batch of images shaped
        ``(batch, channels, image_size, image_size)``.
        """
        # (batch, dim, grid*grid) -> (batch, num_patches, dim)
        x = self.patch_embedding(x).permute(0, 2, 1)
        # Prepend the class token; expand() broadcasts without copying.
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)
        # Positional information is added to the tokens, not appended as
        # extra tokens.
        x = x + self.pos_embedding
        x = self.transformer(x)
        # Classify from the class-token position (index 0).
        return self.fc(x[:, 0])
# Load the dataset
# Preprocessing applied to every image: resize, centre-crop to 224 (the
# image_size the model is built with), convert to a tensor, then normalize
# each channel.
# NOTE(review): the mean/std values look like the commonly used ImageNet
# channel statistics -- confirm they are intended for CIFAR-10.
preprocessing = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing)

# Build the training split first, then the test split; both are downloaded
# into ./data if missing and share the same preprocessing.
train_dataset, test_dataset = (
    datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training loader shuffles; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
# Train the model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViT(
    image_size=224,
    patch_size=32,
    num_classes=10,
    dim=256,
    depth=6,
    heads=8,
    mlp_dim=512,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 10
steps_per_epoch = len(train_loader)
for epoch in range(1, num_epochs + 1):
    for step, (images, labels) in enumerate(train_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Standard optimization step: clear stale gradients, forward,
        # backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the current batch loss once every 100 steps.
        if step % 100 != 0:
            continue
        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
              .format(epoch, num_epochs, step, steps_per_epoch, loss.item()))
# Evaluate the model
# eval() switches layers such as dropout to inference behaviour.
model.eval()
correct = 0
total = 0
# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # The predicted class is the index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += torch.eq(predicted, labels).sum().item()
print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))
```
阅读全文