# Gradient Accumulation Settings
# Samples processed by each GPU in a single forward/backward pass
train_micro_batch_size_per_gpu: 2
# Micro-batches whose gradients are accumulated before each weight update
gradient_accumulation_steps: 4
# Gradient-norm threshold used for clipping (guards against exploding gradients)
gradient_clipping: 3.0

# Mixed Precision Training (FP16) Configuration
fp16:
  enabled: true  # Enable mixed-precision training using FP16
  # 0 selects dynamic loss scaling in DeepSpeed. Any non-zero value is used as
  # a fixed (static) loss scale, in which case the dynamic-scaling keys below
  # have no effect. The previous value (256) therefore pinned a static scale
  # even though the rest of this section configures dynamic scaling.
  loss_scale: 0
  # Window (in steps) over which the dynamic loss scale is raised/lowered
  loss_scale_window: 1000
  # Starting dynamic loss scale, expressed as a power of two (2^16 = 65536)
  initial_scale_power: 16
  # Delay shift for dynamic loss scaling: overflows tolerated before the
  # scale is lowered
  hysteresis: 2
  # Lower bound for the dynamic loss scale
  min_loss_scale: 1

# ZeRO (Zero Redundancy Optimizer) Configuration
zero_optimization:
  # Stage 1 partitions the optimizer state across the data-parallel GPUs.
  # The ZeRO paper reports, for this stage:
  # - up to ~4x lower model-state memory (mixed-precision Adam)
  # - the same communication volume as plain data parallelism
  stage: 1

# Optimizer Settings
optimizer:
  type: AdamW  # Adam with decoupled weight decay
  params:
    # Exponent-notation floats are written with a decimal point on purpose:
    # YAML 1.1 loaders (e.g. PyYAML) resolve `2e-5` / `1e-8` as strings, not
    # floats, which would hand the optimizer string hyperparameters.
    lr: 2.0e-5  # Base learning rate
    betas: [0.9, 0.999]  # Coefficients for the running averages of the gradient and its square
    eps: 1.0e-8  # Term added to the denominator for numerical stability
    weight_decay: 0.01  # Decoupled weight-decay coefficient
    # Note: AdamW applies weight decay directly to the weights, decoupled from
    # the gradient update, unlike vanilla Adam with L2 regularization, which
    # folds the penalty into the gradient.

# Additional Implementation Notes:
# - Effective batch size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * num_gpus
#   (with the values in this file: 2 * 4 * num_gpus = 8 * num_gpus)
# - FP16 training reduces memory usage and speeds up computation
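# - Dynamic loss scaling (selected in DeepSpeed by fp16.loss_scale = 0) adjusts the scale
#   automatically: it is lowered when gradients overflow and raised again after a run of
#   overflow-free steps, which keeps small gradients from underflowing in FP16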
# - ZeRO Stage 1 is memory-efficient while maintaining good performance
# - Gradient clipping stabilizes training by preventing gradient explosions
    