# Sweep configuration for SparseLoCo on the 178M model over different H, K, R combinations
# Sweep identity: the sweep's display name and the project its runs are logged to.
name: sparseloco_sweep
project: SparseLoCo-178M
# Grid search: one run per combination of the `values` lists under `parameters`.
method: grid
program: train.py
# Each run is started through the launcher script instead of invoking the
# program directly. The sweep agent expands the macros before execution:
#   ${program}               -> the `program` entry above (train.py)
#   ${args_no_boolean_flags} -> --key=value for every parameter, except that
#                               booleans become a bare --key flag when true
#                               and are omitted when false
command:
  - /usr/bin/env
  - bash
  - varying_R_sweep_launcher.sh
  - ${program}
  - ${args_no_boolean_flags}

parameters:
  # `value` pins a parameter to one constant for every run; `values` lists the
  # candidates the grid iterates over.
  # Run configuration
  strategy:
    value: sparseloco
  run_name:
    value: SparseLoCo

  # Data configuration
  # NOTE(review): YAML does not expand `$DATA_DIR`; the literal string is what
  # gets passed on the command line. Presumably the launcher script or train.py
  # resolves it -- confirm.
  shards_path:
    value: $DATA_DIR/dclm_tokenized
  token_budget:
    value: 3774873600
  sequence_length:
    value: 2048
  # Boolean: with ${args_no_boolean_flags} a true value is passed as a bare flag.
  data_in_gpu:
    value: true

  # Model configuration
  hparams_file:
    value: hparams/178M/178M_model_hparams.json
  # Boolean: with ${args_no_boolean_flags} a true value is passed as a bare flag.
  use_compile:
    value: true

  # Training configuration
  # The single-entry `values` lists in this section contribute exactly one
  # choice each, so they do not multiply the number of grid combinations.
  micro_batch_size:
    values:
    - 32
  batch_size:
    values:
    - 32
  outer_learning_rate:
    values:
    - 1
  inner_learning_rate:
    values:
    # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
    # resolve a bare `1e-3` to the string "1e-3" rather than a float.
    - 1.0e-3
  error_decay:
    value: 0.95
  beta1:
    value: 0.9
  beta2:
    value: 0.99
  warmup_steps:
    values:
    - 500
  weight_decay:
    value: 0.1

  # Compression configuration
  chunk_size:
    value: 64
  # NOTE(review): quantization is switched off below, so the bins/range values
  # are presumably unused in this sweep -- confirm against train.py.
  quantization_bins:
    value: 4
  quantization_range:
    value: 6
  # Canonical lowercase boolean; with ${args_no_boolean_flags} a false value
  # means the flag is simply omitted from the command line.
  use_quantization:
    value: false
  # NOTE(review): a fractional value for a `*_steps` parameter; presumably it is
  # interpreted as a fraction of the total steps -- confirm against train.py.
  ef_freeze_steps:
    values:
    - 0.05

  # Swept axes. With `method: grid` every combination is run:
  # 7 top_k x 3 inner_steps x 2 R = 42 runs, as long as every other parameter
  # stays single-valued.
  top_k:
    values: # densities are calculated as top_k / chunk_size**2
    - 32   # 0.78125%
    - 64   # 1.5625%
    - 128  # 3.125%
    - 256  # 6.25%
    - 512  # 12.5%
    - 1024 # 25%
    - 2048 # 50%
  inner_steps:
    values:
    - 15
    - 50
    - 100
  R:
    values:
    - 8
    - 32