# @package deepspeed_config.train
# ZeRO settings. Stage 3 is selected here; the offload devices below
# default to "none" (no offloading).
zero_optimization:
  stage: 3
  offload_param:
    # DeepSpeed does not allow dynamic offloading of model parameters, so
    # set this to "cpu" if parameter offloading is needed to save memory;
    # otherwise leave it as "none".
    device: "none"
  offload_optimizer:
    # Set to "cpu" when using optimizer offload, otherwise "none".
    device: "none"
    pin_memory: true
  # NOTE(review): the "auto" values below are presumably resolved by the
  # training integration or fall back to DeepSpeed defaults - confirm.
  sub_group_size: "auto"
  reduce_bucket_size: "auto"
  stage3_param_persistence_threshold: "auto"
  stage3_prefetch_bucket_size: "auto"
  stage3_max_live_parameters: "auto"
  stage3_max_reuse_distance: "auto"
  # NOTE(review): DeepSpeed documents round_robin_gradients as a ZeRO
  # stage 1/2 CPU-offload optimization; confirm it has any effect with
  # stage 3 as configured above.
  round_robin_gradients: true
  # ZeRO++
  # zero_hpz_partition_size: 1 is off; set it to num_gpus_per_node to
  # enable ZeRO++.
  zero_hpz_partition_size: 1
  zero_quantized_weights: false
  zero_quantized_gradients: false
# Mixed-precision mode switch.
#   enabled: true  -> mixed precision runs through torch.autocast
#   enabled: false -> DeepSpeed uses its own mixed-precision handling
torch_autocast:
  enabled: true
  dtype: "bfloat16"
# Set to true to save memory. When true, the stage3_prefetch_bucket_size,
# stage3_max_live_parameters and stage3_max_reuse_distance settings under
# zero_optimization are disabled.
disable_trace_cache: false
# Remaining top-level engine settings.
data_types:
  # Gradients are accumulated in fp32.
  grad_accum_dtype: "fp32"
# Gradient clipping value.
gradient_clipping: 1.0
# Wall-clock timing breakdown is switched off.
wall_clock_breakdown: false
# NOTE(review): DeepSpeed's documented key is `prescale_gradients` (plural).
# Confirm whether this singular spelling is read by project code or is
# silently ignored before renaming it.
prescale_gradient: false
