# Named configuration preset: ZeRO stage 3, optimizer offloaded to CPU,
# bf16 enabled and fp16 disabled.
# NOTE(review): key semantics are defined by the consuming library
# (presumably DeepSpeed, going by the preset name) — confirm against its
# configuration reference before changing any value.
deepspeed_zero3_cpuoffload:
  # "auto" is a plain string placeholder, not a number; presumably the
  # trainer/launcher integration substitutes the real value — TODO confirm.
  train_micro_batch_size_per_gpu: auto
  bf16:
    enabled: true
  # fp16 is switched off in this preset. The loss-scaling settings below are
  # still carried along; presumably they have no effect while `enabled` is
  # false — TODO confirm.
  fp16:
    enabled: false
    # NOTE(review): 0 presumably selects dynamic loss scaling — verify.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  zero_optimization:
    stage: 3
    # Only an optimizer offload target is configured here; no `offload_param`
    # section is visible in this preset.
    offload_optimizer:
      device: cpu
      pin_memory: true
    overlap_comm: true
    contiguous_gradients: true
    # Exponent notation: YAML loaders may hand this to the consumer as a float
    # (1e9) rather than an integer. Same applies to the other 1.0e+9 values
    # in this mapping.
    sub_group_size: 1.0e+9
    # The three "auto" values below are string placeholders, presumably
    # resolved by the integration layer at start-up — TODO confirm.
    reduce_bucket_size: auto
    stage3_prefetch_bucket_size: auto
    stage3_param_persistence_threshold: auto
    stage3_max_live_parameters: 1.0e+9
    stage3_max_reuse_distance: 1.0e+9
    stage3_gather_16bit_weights_on_model_save: true