pytorch音频分类
时间: 2023-06-30 19:17:15 浏览: 110
以下是使用PyTorch实现音频分类的示例代码:
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
# 定义音频数据集
class AudioDataset(Dataset):
    """Dataset yielding fixed-size MFCC feature matrices from audio files.

    Each item is a ``(features, label)`` pair where ``features`` is a float32
    tensor of shape ``(n_mfcc, max_frames)``, zero-padded or truncated along
    the time axis so every sample batches cleanly.
    """

    def __init__(self, file_list, label_list, n_mfcc=40, max_frames=260):
        # file_list[i] is an audio file path; label_list[i] its class id.
        # n_mfcc / max_frames were hard-coded (40 / 260) in the original;
        # defaults keep the old behavior while allowing other feature sizes.
        self.file_list = file_list
        self.label_list = label_list
        self.n_mfcc = n_mfcc
        self.max_frames = max_frames

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Load the waveform and compute its MFCC matrix (n_mfcc, n_frames).
        audio_file, label = self.file_list[idx], self.label_list[idx]
        y, sr = librosa.load(audio_file)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
        # Pad short clips with zeros, truncate long ones.  The original code
        # always padded to 260 and raised (negative pad width) for any clip
        # with more than 260 frames.
        n_frames = mfccs.shape[1]
        if n_frames < self.max_frames:
            mfccs = np.pad(mfccs, ((0, 0), (0, self.max_frames - n_frames)),
                           mode='constant')
        else:
            mfccs = mfccs[:, :self.max_frames]
        return torch.from_numpy(mfccs).float(), label
# 定义音频分类模型
class AudioClassifier(nn.Module):
    """CNN for 10-class audio classification over (40, 260) MFCC inputs.

    Three conv+pool stages halve each spatial dimension:
    (40, 260) -> (20, 130) -> (10, 65) -> (5, 32), so the flattened feature
    count is 128 * 5 * 32 = 20480.  The original code wrote ``128 * 10 * 16``,
    which happens to equal the same value but misstates the actual shape.
    """

    # Flattened size after conv3/pool3 for a (40, 260) input.
    _FLAT_FEATURES = 128 * 5 * 32

    def __init__(self):
        super(AudioClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1))
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1))
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), padding=(1, 1))
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 2))
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 512)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        # x: (batch, 40, 260) MFCCs; add a channel axis for Conv2d.
        x = x.unsqueeze(1)
        x = self.pool1(nn.functional.relu(self.conv1(x)))
        x = self.pool2(nn.functional.relu(self.conv2(x)))
        x = self.pool3(nn.functional.relu(self.conv3(x)))
        # Keep the batch dim explicit: view(-1, N) would silently merge or
        # split samples if the input shape ever deviated from (40, 260).
        x = x.view(x.size(0), -1)
        x = nn.functional.relu(self.fc1(x))
        return self.fc2(x)
# 训练模型
def train(model, train_loader, criterion, optimizer, device):
    """Run one training epoch; return (mean loss, accuracy) over the dataset."""
    model.train()
    total_loss = 0.0
    correct = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the final average is
        # per-sample even when the last batch is smaller.
        total_loss += loss.item() * inputs.size(0)
        predictions = logits.argmax(dim=1, keepdim=True)
        correct += predictions.eq(labels.view_as(predictions)).sum().item()
    n_samples = len(train_loader.dataset)
    return total_loss / n_samples, correct / n_samples
# 测试模型
def test(model, test_loader, criterion, device):
model.eval()
test_loss = 0.0
test_acc = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += criterion(output, target).item() * data.size(0)
pred = output.argmax(dim=1, keepdim=True)
test_acc += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
test_acc /= len(test_loader.dataset)
return test_loss, test_acc
# 主函数
if __name__ == '__main__':
    # Collect dataset file paths and labels.
    # TODO: populate these with audio file paths and integer class labels.
    train_files, train_labels = [], []
    test_files, test_labels = [], []
    # Fail fast with a clear message instead of the opaque ZeroDivisionError
    # that train()/test() raise when iterating an empty DataLoader.
    if not train_files or not test_files:
        raise RuntimeError(
            'No audio files configured: fill in train_files/test_files '
            'and their labels before running.')
    train_dataset = AudioDataset(train_files, train_labels)
    test_dataset = AudioDataset(test_files, test_labels)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    # Prefer GPU when available; all tensors/model are moved to this device.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Model, loss and optimizer.
    model = AudioClassifier().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Train for 10 epochs, reporting per-epoch metrics.
    for epoch in range(10):
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test(model, test_loader, criterion, device)
        print('Epoch: {} Train Loss: {:.6f} Train Acc: {:.6f} Test Loss: {:.6f} Test Acc: {:.6f}'.format(
            epoch + 1, train_loss, train_acc, test_loss, test_acc))
```
上述代码中,我们定义了一个AudioDataset类来加载音频数据集,并使用librosa库来提取音频文件的MFCC特征。我们还定义了一个AudioClassifier类来实现音频分类模型,其中包含了三个卷积层和两个全连接层。在主函数中,我们使用DataLoader来加载训练集和测试集,并使用Adam优化器来训练模型。最后,我们在每个epoch结束时输出训练集和测试集的损失和准确率。
阅读全文