将一个数据集按照一定的比例划分为训练集、验证集和测试集,请使用 Python 写出对应的代码
时间: 2024-03-02 19:52:06 浏览: 141
运行代码之前,请先将代码中的数据集路径和划分比例修改为你自己的数据集路径和所需的比例。
```python
import os
import random
import shutil
def split_dataset(dataset_path, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Randomly copy the files under ``dataset_path`` into train/val/test folders.

    Every file found recursively under ``dataset_path`` is copied (not moved)
    into ``<dataset_path>/train``, ``<dataset_path>/val`` or
    ``<dataset_path>/test``. The output folders themselves are skipped while
    collecting files, so the function can be re-run on the same directory.

    Args:
        dataset_path: Root directory of the dataset.
        train_ratio: Fraction of files copied to ``train``.
        val_ratio: Fraction of files copied to ``val``.
        test_ratio: Fraction of files copied to ``test`` (the remainder after
            the train and val boundaries are computed).

    Raises:
        AssertionError: If the three ratios do not sum to 1.0 (within
            floating-point tolerance).

    NOTE: files are copied under their base name only, so two files with the
    same name in different subfolders end up overwriting each other when they
    land in the same split.
    """
    # Local import keeps this function self-contained; the file's import block
    # does not provide ``math``.
    import math

    # Exact ``==`` on floats rejects valid inputs such as 0.7 + 0.2 + 0.1.
    # An explicit raise (rather than ``assert``) also survives ``python -O``
    # while keeping the exception type callers already see.
    if not math.isclose(train_ratio + val_ratio + test_ratio, 1.0, abs_tol=1e-9):
        raise AssertionError('train_ratio, val_ratio and test_ratio must sum up to 1.0')

    train_path = os.path.join(dataset_path, 'train')
    val_path = os.path.join(dataset_path, 'val')
    test_path = os.path.join(dataset_path, 'test')
    split_dirs = (train_path, val_path, test_path)

    # Collect all source files, pruning the output folders in place so that
    # copies made by a previous run are never treated as inputs (copying a
    # file onto itself would raise shutil.SameFileError).
    file_paths = []
    for root, dirs, files in os.walk(dataset_path):
        dirs[:] = [d for d in dirs if os.path.join(root, d) not in split_dirs]
        file_paths.extend(os.path.join(root, name) for name in files)

    # Sort first so the shuffle depends only on the random state, not on the
    # filesystem's listing order; this makes seeded runs reproducible.
    file_paths.sort()
    random.shuffle(file_paths)

    # Create the target folders.
    for path in split_dirs:
        os.makedirs(path, exist_ok=True)

    # Split boundaries. The small epsilon guards against float error such as
    # 10 * (0.7 + 0.2) evaluating just below 9 and being truncated to 8.
    total = len(file_paths)
    val_start = int(total * train_ratio + 1e-9)
    test_start = int(total * (train_ratio + val_ratio) + 1e-9)

    for i, path in enumerate(file_paths):
        if i < val_start:
            target_dir = train_path
        elif i < test_start:
            target_dir = val_path
        else:
            target_dir = test_path
        shutil.copy(path, os.path.join(target_dir, os.path.basename(path)))
if __name__ == '__main__':
    # Example invocation: replace the path and the ratios with your own
    # dataset location and desired split before running.
    dataset_path = '/path/to/dataset'
    train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1
    split_dataset(
        dataset_path,
        train_ratio=train_ratio,
        val_ratio=val_ratio,
        test_ratio=test_ratio,
    )
```
阅读全文