# This configuration is a subset of the deepspeed hyperparameters.
# Values written as ${...} are interpolations that are resolved against other
# nodes of the composed configuration (e.g. `train`, `impl`).
name: deepspeed
# Defaults list: `_default` is listed before `_self_` (this file).
# NOTE(review): with Hydra-style composition this order means keys set in this
# file override those coming from `_default` - confirm against the loader used.
defaults:
  - _default
  - _self_

# Dynamo
# Possible values: aot_autograd_speedup, nvfuser_global, aot_nvfuser.
# Left unset (null) here.
optimizer_context: null

# Batch sizes. The global batch size is interpolated from the `train` node,
# while the per-GPU micro batch size is fixed in this file.
# NOTE(review): DeepSpeed expects train_batch_size to equal
# train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of GPUs
# - confirm that 128 is consistent with the intended launch setup.
train_batch_size: ${train.batch_size} # can be "auto"
train_micro_batch_size_per_gpu: 128 # can be "auto"

# Optimizer settings and gradient clipping are both interpolated from the
# `train` node rather than being defined in this file.
# DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb
# optimizers (see the DeepSpeed configuration documentation for details) and
# will import other optimizers from torch.
optimizer: ${train.optim}
gradient_clipping: ${train.gradient_clipping}

# DeepSpeed's own learning-rate scheduler. It is not quite the same as the
# schedulers called via huggingface, so the resulting schedule may differ
# slightly. YMMV
scheduler:
  type: WarmupDecayLR
  params:
    # Learning rate at the first warmup step.
    warmup_min_lr: 0
    # Learning rate reached at the end of warmup; reuses the optimizer's lr.
    warmup_max_lr: ${train.optim.lr}
    # Number of warmup steps, interpolated from the `train` node.
    warmup_num_steps: ${train.warmup_steps}
    warmup_type: linear
    # Total number of training steps, interpolated from the `train` node.
    # NOTE(review): WarmupDecayLR is documented to decay over the steps that
    # remain after warmup - confirm against the installed DeepSpeed version.
    total_num_steps: ${train.steps}

# communication_data_type: # this should be good in the default setting
# prescale_gradients: False # this should be good in the default setting
# gradient_predivide_factor: 1.0

# DeepSpeed fp16 mixed-precision settings.
# Do not combine these with AMP:
fp16:
  enabled: false # can be "auto"
  # NOTE(review): per the DeepSpeed docs a loss_scale of 0 selects dynamic loss
  # scaling, and the remaining keys tune that dynamic scaler - confirm.
  loss_scale: 0
  initial_scale_power: 16
  loss_scale_window: 1000
  hysteresis: 2
  min_loss_scale: 1

# ZeRO memory-optimization settings.
# Size-like options below are written as plain integer literals on purpose:
# exponent notation such as `1e6` is loaded as a string by YAML 1.1 parsers
# (e.g. PyYAML) and as a float by others, while DeepSpeed documents these
# options as integers.
zero_optimization:
  # stage 0, 1, 2, and 3 refer to
  # 0) disabled
  # 1) optimizer state partitioning
  # 2) optimizer+gradient state partitioning
  # 3) optimizer+gradient+parameter partitioning
  stage: 3 # [0|1|2|3]
  # Attempts to overlap the reduction of the gradients with backward
  # computation.
  overlap_comm: true
  # Uses reduce or reduce scatter instead of allreduce to average gradients.
  reduce_scatter: true
  # Number of elements reduced/allreduced at a time. Limits the memory required
  # for the allgather for large model sizes.
  reduce_bucket_size: 1000000
  # Copies the gradients to a contiguous buffer as they are produced. Avoids
  # memory fragmentation during backward pass.
  contiguous_gradients: true

  # Enabling and configuring ZeRO optimization of parameter offloading to
  # CPU/NVMe. Available only with ZeRO stage 3.
  offload_param:
    device: cpu
    pin_memory: true

  # Enable offloading of optimizer state to CPU or NVMe, and optimizer
  # computation to CPU. This frees up GPU memory for larger models or batch
  # sizes. Only include these options if stage is 2 or higher.
  offload_optimizer:
    device: cpu
    pin_memory: true

  # The maximum number of parameters resident per GPU before releasing.
  # Smaller values use less memory, but perform more communication.
  stage3_max_live_parameters: 1000000000
  # Do not release a parameter if it will be reused within this threshold of
  # parameters. Smaller values use less memory, but perform more communication.
  stage3_max_reuse_distance: 1000000000
  # The size of the fixed buffer for prefetching parameters. Smaller values use
  # less memory, but can increase stalls due to communication. Can be "auto".
  stage3_prefetch_bucket_size: 940000
  # Do not partition parameters smaller than this threshold. Smaller values use
  # less memory, but can greatly increase communication (especially
  # latency-bound messages). Can be "auto".
  stage3_param_persistence_threshold: 10000

  # NOTE(review): documented by DeepSpeed as the granularity (in elements) at
  # which parameters are processed during optimizer steps - confirm.
  sub_group_size: 1000000000
  stage3_gather_16bit_weights_on_model_save: true # [true|false]

# Logging and debugging switches. The print interval is interpolated from the
# `impl` node; the two diagnostic flags are switched off.
steps_per_print: ${impl.print_loss_every_nth_step}
wall_clock_breakdown: false
dump_state: false

# DeepSpeed flops profiler settings (profiler is disabled here).
flops_profiler:
  enabled: false
  profile_step: 1
  module_depth: -1
  top_modules: 1
  detailed: true
  # If null, the profiler prints to stdout.
  output_file: null

# activation_checkpointing:
#   partition_activations: False
#   cpu_checkpointing: False
#   contiguous_memory_optimization: False
#   number_checkpoints:
#   synchronize_checkpoint_boundary: False
#   profile: False
