{
    "train_micro_batch_size_per_gpu": 3,
    "gradient_accumulation_steps": 1,
    "gradient_clipping": 1.0,
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": 0.00005,
        "betas": [
          0.999,
          0.99 
        ],
        "eps": 1e-8
      }
    },
    "scheduler": {
      "type": "OneCycle",
      "params": {
        "cycle_first_step_size": 1000,
        "cycle_first_stair_count": 500,
        "cycle_second_step_size": 1000,
        "cycle_second_stair_count": 500,
        "decay_step_size": 10000,
        "cycle_min_lr": 0.00005,
        "cycle_max_lr": 0.0001,
        "decay_lr_rate": 0.001,
        "cycle_min_mom": 0.99,
        "cycle_max_mom": 0.98,
        "decay_mom_rate": 0.0
    }
    },
    "bf16": {
      "enabled": true
    },
    "zero_optimization": {
      "stage": 2,
      "reduce_bucket_size": 5e8,
      "zero_hpz_partition_size": 8,
      "contiguous_gradients": true,
      "overlap_comm": true,
      "reduce_scatter": true
    },
    "steps_per_print": 1,
    "flops_profiler": {
      "enabled": true,
      "profile_step": 1,
      "module_depth": -1,
      "top_modules": 5,
      "detailed": true
    },
    "tensorboard": {
      "enabled": true,
      "job_name": "tensorboard_log"
    }
  }