# Training run configuration.
# NOTE(review): negative values and very large numbers (e.g. 99999999) look
# like "disabled" / "effectively never" sentinels -- confirm against the code
# that consumes this file.
training:
    random_seed: 123
    num_gpus_per_node: 1
    local_sgd_steps: 16
    # NOTE(review): presumably large enough that the warmup never ends -- confirm.
    local_sgd_warmup_steps: 99999999
    # YAML null. The plain scalar `None` is not null in YAML; it is loaded as
    # the string "None".
    # NOTE(review): confirm nothing compares this value to the literal "None".
    batch_size: null
    resume:
        # Booleans use the canonical lowercase spelling, matching
        # `checkpointing.async_save`.
        resume: false
        resume_model: true
        resume_optimizer: true
        resume_scheduler: true
        resume_rng_state: true
    checkpointing:
        async_save: false
        directory: "Checkpoints"
        # NOTE(review): presumably disabled when negative, as documented for
        # `logging.steps_interval` -- confirm.
        steps_interval: -1
        seconds_interval: 99999999
        num_checkpoints_to_keep: 1000
        keep_checkpoint_every_num_seconds: 86400  # 86400 s = 24 h
    logging:
        level: "INFO"
        steps_interval: -1  # disabled when negative
        seconds_interval: 2  # disabled when `steps_interval` is set
    optimization:
        fp16: true
        optimizer_name: Custom
        # Written with an explicit decimal point: YAML 1.1 loaders such as
        # PyYAML resolve `2e-4` as a string, whereas `2.0e-4` is a float under
        # both YAML 1.1 and YAML 1.2.
        learning_rate: 2.0e-4
        gradient_accumulation_steps: 1
        weight_decay: 0.01
        # NOTE(review): a negative norm presumably turns gradient clipping
        # off -- confirm.
        max_gradient_norm: -1.0
        warmup:
            scheduler_name: Custom
            warmup_steps: 2000
            cycle_steps: 16000
    validation:
        # -1 means "after every epoch", but validation is then disabled if
        # `total_num.epochs` is -1.
        steps_interval: 2000
        after_num_steps: 15000
    total_num:
        epochs: -1
        update_steps: 200000  # disabled when `total_num.epochs` > 0