# Toy training configuration for testing

# Target device and seed value (how each is consumed is up to the training script).
device: cuda
seed: 42

# Data configuration
data:
  # Training-set location; will be overridden by command line.
  train_path: 'data/offline_sawtooth_data'
  # Validation-set location; will be overridden by command line.
  val_path: 'data/val_offline_sawtooth_data'
  num_workers: 5

# Model configuration (smaller for testing)
model:
  dim_x: 1
  dim_y: 1
  dim_model: 128
  max_buffer_size: 16
  num_target_points: 256
  targets_block_size_for_buffer_attend: 8
  q_block_size: 128
  kv_block_size: 128

  embedder:
    hidden_dim: 256
    depth: 3

  backbone:
    num_layers: 6
    num_heads: 4
    dim_feedforward: 256
    dropout: 0.0

  head:
    type: MixtureGaussian  # Single channel for 1D GP
    dim_feedforward: 256
    num_components: 1
    # Minimum std for numerical stability. Written with an explicit mantissa
    # dot so YAML 1.1 loaders (e.g. PyYAML) resolve it as a float rather than
    # the string "1e-3".
    std_min: 1.0e-3

# Optimizer configuration
optimizer:
  name: adamw
  # Explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) resolve a bare
  # `1e-4` as a string rather than a float.
  lr: 1.0e-4
  betas: [0.9, 0.999]
  weight_decay: 0.01

# Learning rate scheduler configuration
scheduler:
  use_scheduler: true
  name: cosine_with_warmup  # Options: cosine, cosine_with_warmup
  warmup_ratio: 0.05  # 5% of total steps for warmup

# Training configuration
training:
  # Reduced for testing with validation
  num_epochs: 32
  # Reduced from 1.0 for stability
  grad_clip: 0.5

  # Compilation options (fullgraph/dynamic are presumably forwarded to the
  # compiler as well — confirm against the training script).
  compile_model: true
  compile_mask: true
  compile_mode: 'default'
  fullgraph: false
  dynamic: false
  prewarm_compilation: true

  # Disabled for numerical stability with float64 data
  use_amp: false
  amp_dtype: 'bfloat16'

  val_interval: 1

# Checkpoint configuration
checkpoint:
  # Output directory path built from two `now` interpolations
  # (date component, then time component).
  save_dir: 'checkpoints/sawtooth_${now:%Y-%m-%d}/${now:%H-%M-%S}'
  # Save every 5 epochs
  save_interval: 5

# Logging configuration
logging:
  use_wandb: true
  project: 'fast-buffer-np'
  # Run name carries a timestamp from the `now` interpolation.
  run_name: 'sawtooth-${now:%Y%m%d-%H%M%S}'
  log_interval: 10
  tags:
    - 'sawtooth'
    - 'toy'