# DeepSpeed configuration: ZeRO stage 2 with the optimizer offloaded to CPU,
# training in bf16.
deepspeed_zero2_cpuoffload:
  # "auto" is a placeholder rather than a number; presumably the launching
  # framework substitutes the real per-GPU batch size at start-up.
  # TODO: confirm against whatever consumes this file.
  train_micro_batch_size_per_gpu: auto
  # Mixed precision: bf16 is on, fp16 is off.
  bf16:
    enabled: true
  fp16:
    enabled: false
    # Loss-scaling knobs for fp16. With fp16 disabled above they are
    # presumably inert and kept only so fp16 can be re-enabled by flipping
    # `enabled` -- verify before relying on them.
    # NOTE(review): per the DeepSpeed docs, loss_scale 0 selects dynamic loss
    # scaling, starting at 2^initial_scale_power.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  zero_optimization:
    stage: 2
    # Optimizer offload target: CPU, with pinned memory requested.
    offload_optimizer:
      device: cpu
      pin_memory: true
    allgather_partitions: true
    # Bucket sizes are element counts, so they are written as plain integers
    # (1e9 and 5e8). Scientific notation such as 1.0e+9 loads as a float and
    # relies on the consumer to coerce it back to an integer.
    allgather_bucket_size: 1000000000
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000
    contiguous_gradients: true