def logging_train_setup(args, cfg) -> None:
    """Set up logging for a training run and record run metadata.

    Creates the output directory (if one is configured), initializes the
    ``visual_prompt`` logger, then logs: process rank / world size,
    environment info, command-line arguments, the raw contents of the
    config file (when given), and the full config. Finally configures
    cudnn benchmarking for training runs.

    Args:
        args: parsed command-line namespace; ``config_file`` and
            ``eval_only`` are read if present.
        cfg: config node providing ``OUTPUT_DIR``, ``NUM_GPUS`` and
            ``CUDNN_BENCHMARK``.
    """
    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        PathManager.mkdirs(output_dir)

    logger = logging.setup_logging(
        cfg.NUM_GPUS, get_world_size(), output_dir, name="visual_prompt")

    # Log basic information about environment, cmdline arguments, and config
    rank = get_rank()
    logger.info(
        f"Rank of current process: {rank}. World size: {get_world_size()}")
    logger.info("Environment info:\n" + collect_env_info())
    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        # Fix: use a context manager so the config-file handle is closed;
        # the original called PathManager.open(...).read() and leaked it.
        with PathManager.open(args.config_file, "r") as f:
            logger.info(
                "Contents of args.config_file={}:\n{}".format(
                    args.config_file, f.read()
                )
            )

    # Show the config
    logger.info("Training with config:")
    logger.info(pprint.pformat(cfg))

    # cudnn benchmark has large overhead.
    # It shouldn't be used considering the small size of typical val set.
    # eval-only runs therefore leave the flag at its default.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
时间: 2023-06-24 17:05:44 浏览: 368
这段代码是用来设置训练日志的。首先,如果配置中指定了输出目录,它会创建该目录。然后,它会使用logging模块设置日志,记录当前进程的rank、环境信息、命令行参数等。如果指定了配置文件,它还会把配置文件的内容记录到日志中。接着,它会把完整的训练配置写入日志。最后,除非args中有eval_only属性且为True,否则会按照配置设置cudnn benchmark;仅评估模式下该开关保持默认值不变。
相关问题
def train(cfg, args):
    """Run the main training / evaluation sequence.

    Clears residual GPU cache, seeds all RNGs, sets up logging, builds
    data loaders / model / evaluator / trainer, then trains. If no train
    loader is available, only reports; with TOTAL_EPOCH == 0 it runs a
    single test-set evaluation instead.

    Args:
        cfg: config node providing SEED and SOLVER.TOTAL_EPOCH among others.
        args: parsed command-line namespace, forwarded to logging setup.
    """
    # clear up residual cache from previous runs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # main training / eval actions here

    # Fix the seed for reproducibility. Bug fix: the original called
    # random.seed(0) with a hard-coded 0 while torch and numpy used
    # cfg.SEED, so runs with different SEED values shared the same
    # Python-level randomness. Seed all three consistently.
    if cfg.SEED is not None:
        torch.manual_seed(cfg.SEED)
        np.random.seed(cfg.SEED)
        random.seed(cfg.SEED)

    # setup training env including loggers
    logging_train_setup(args, cfg)
    logger = logging.get_logger("visual_prompt")

    train_loader, val_loader, test_loader = get_loaders(cfg, logger)
    logger.info("Constructing models...")
    model, cur_device = build_model(cfg)

    # (typo fixed: original log message said "Evalutator")
    logger.info("Setting up Evaluator...")
    evaluator = Evaluator()
    logger.info("Setting up Trainer...")
    trainer = Trainer(cfg, model, evaluator, cur_device)

    if train_loader:
        trainer.train_classifier(train_loader, val_loader, test_loader)
    else:
        print("No train loader presented. Exit")

    # TOTAL_EPOCH == 0 means "evaluate only": score the test split once.
    if cfg.SOLVER.TOTAL_EPOCH == 0:
        trainer.eval_classifier(test_loader, "test", 0)
这是一个训练模型的函数,其参数包括一个配置文件和一些参数。在该函数中,首先清除了之前运行留下的缓存,然后设置了随机种子以保证可重复性,接着获取了训练、验证和测试数据集的加载器,构建了模型,设置了评估器和训练器,并调用了训练器的 train_classifier 方法进行训练。如果没有提供训练数据集,则输出 "No train loader presented. Exit"。最后,如果设置了总共的训练轮数为 0,则调用训练器的 eval_classifier 方法进行模型的测试评估。
解释parser.add_argument( "-r", "--resume", default=None, help="weights path for resume") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument( "--enable_ce", type=bool, default=False, help="If set True, enable continuous evaluation job." "This flag is only used for internal test.") parser.add_argument( "--fp16", action='store_true', default=False, help="Enable mixed precision training.") parser.add_argument( "--fleet", action='store_true', default=False, help="Use fleet or not") parser.add_argument( "--use_vdl", type=bool, default=False, help="whether to record the data to VisualDL.") parser.add_argument( '--vdl_log_dir', type=str, default="vdl_log_dir/scalar", help='VisualDL logging directory for scalar.') parser.add_argument( '--save_prediction_only', action='store_true', default=False, help='Whether to save the evaluation results only') args = parser.parse_args() return args def run(FLAGS, cfg): # init fleet environment if cfg.fleet: init_fleet_env() else: # init parallel environment if nranks > 1 init_parallel_env() if FLAGS.enable_ce: set_random_seed(0) # build trainer trainer = Trainer(cfg, mode='train') # load weights if FLAGS.resume is not None: trainer.resume_weights(FLAGS.resume) elif 'pretrain_weights' in cfg and cfg.pretrain_weights: trainer.load_weights(cfg.pretrain_weights) # training trainer.train(FLAGS.eval) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) cfg['fp16'] = FLAGS.fp16 cfg['fleet'] = FLAGS.fleet cfg['use_vdl'] = FLAGS.use_vdl cfg['vdl_log_dir'] = FLAGS.vdl_log_dir cfg['save_prediction_only'] = FLAGS.save_prediction_only merge_config(FLAGS.opt) place = paddle.set_device('gpu' if cfg.use_gpu else 'cpu') if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn' and not cfg.use_gpu: cfg['norm_type'] = 'bn' if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config) check.check_config(cfg) check.check_gpu(cfg.use_gpu) check.check_version() run(FLAGS, 
cfg)
这段代码是一个训练脚本的主要部分,其中包含了许多用于配置训练的命令行参数,以及定义训练过程的函数。
`parse_args()`函数使用`cli.ArgsParser()`创建一个命令行解析器,并添加了多个用于配置训练的命令行参数,如`--resume`表示恢复训练时的权重路径,`--fp16`表示是否使用混合精度训练等。
`run()`函数用于配置训练环境,包括初始化分布式训练和并行训练环境,加载模型权重,进行训练等。
`main()`函数是整个脚本的入口函数,它先解析命令行参数,然后加载配置文件,最后调用`run()`函数开始训练。
阅读全文