# Toy training configuration for testing
# Smoke run: `python train.py --config-name train_bav`

# Run-wide settings: device string and seed value.
device: cuda
seed: 42

# Data configuration
data:
  train_path: data/toy_offline_bav_rho1_data
  # NOTE(review): identical to train_path. If this is the validation set, it
  # is the same data as training — confirm that is intended for the toy run.
  val_path: data/toy_offline_bav_rho1_data
  num_workers: 5

# Model configuration (smaller for testing)
model:
  dim_x: 4
  dim_y: 1
  dim_model: 128
  max_buffer_size: 16
  # YAML null. A bare `None` is not a YAML null; it is loaded as the string
  # "None", which is truthy and not an int.
  num_target_points: null
  targets_block_size_for_buffer_attend: 16
  q_block_size: 128
  kv_block_size: 128

  embedder:
    hidden_dim: 256
    depth: 3

  backbone:
    num_layers: 6
    num_heads: 4
    dim_feedforward: 256
    dropout: 0.0

  head:
    dim_feedforward: 256
    num_components: 20

  # Per the inline notes, the first value of each entry is context + buffer,
  # with buffer = 16 (the same value as max_buffer_size above). After the
  # first entry the series advances in steps of 16 up to 416.
  # NOTE(review): the second value is 256 for every entry; its meaning is not
  # stated in this file — presumably the target count; confirm.
  precompile_shapes:
    - [17, 256]   # context=1 + buffer=16
    - [32, 256]   # context=16 + buffer=16
    - [48, 256]   # context=32 + buffer=16
    - [64, 256]   # context=48 + buffer=16
    - [80, 256]   # context=64 + buffer=16
    - [96, 256]   # context=80 + buffer=16
    - [112, 256]  # context=96 + buffer=16
    - [128, 256]  # context=112 + buffer=16
    - [144, 256]  # context=128 + buffer=16
    - [160, 256]  # context=144 + buffer=16
    - [176, 256]  # context=160 + buffer=16
    - [192, 256]  # context=176 + buffer=16
    - [208, 256]  # context=192 + buffer=16
    - [224, 256]  # context=208 + buffer=16
    - [240, 256]  # context=224 + buffer=16
    - [256, 256]  # context=240 + buffer=16
    - [272, 256]  # context=256 + buffer=16
    - [288, 256]  # context=272 + buffer=16
    - [304, 256]  # context=288 + buffer=16
    - [320, 256]  # context=304 + buffer=16
    - [336, 256]  # context=320 + buffer=16
    - [352, 256]  # context=336 + buffer=16
    - [368, 256]  # context=352 + buffer=16
    - [384, 256]  # context=368 + buffer=16
    - [400, 256]  # context=384 + buffer=16
    - [416, 256]  # context=400 + buffer=16

# Optimizer configuration
optimizer:
  # NOTE(review): `adam` is paired with a non-zero weight_decay — confirm
  # whether decoupled weight decay was intended instead.
  name: adam
  # Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML) also
  # resolve it as a float; a bare `1e-4` is loaded as a string there.
  lr: 1.0e-4
  betas: [0.9, 0.999]
  weight_decay: 0.01

# Learning rate scheduler configuration
scheduler:
  use_scheduler: true
  name: cosine_with_warmup  # Options: cosine, cosine_with_warmup
  warmup_ratio: 0.05  # 5% of total steps for warmup

# Training configuration
training:
  # Run length and gradient clipping value.
  num_epochs: 32
  grad_clip: 0.5

  # Compilation switches.
  # NOTE(review): model.precompile_shapes is populated while
  # prewarm_compilation is false — confirm the shape list is still consumed.
  compile_model: true
  compile_mask: true
  compile_mode: default
  fullgraph: false
  dynamic: false
  prewarm_compilation: false

  # Mixed precision.
  use_amp: false  # Disabled for numerical stability with float64 data
  amp_dtype: bfloat16

  # Validation interval.
  val_interval: 1

# Checkpoint configuration
checkpoint:
  # `${now:...}` is an interpolation with strftime-style patterns, giving a
  # date directory and a time sub-directory — presumably Hydra's built-in
  # `now` resolver; confirm the config is always loaded through Hydra.
  save_dir: checkpoints/bav_${now:%Y-%m-%d}/${now:%H-%M-%S}
  # NOTE(review): training.num_epochs is 32, which is not a multiple of 5 —
  # confirm the final epoch is checkpointed separately.
  save_interval: 5  # Save every 5 epochs

# Logging configuration
logging:
  # NOTE(review): W&B logging is enabled in this toy/smoke config — confirm
  # that test runs should report to the project below.
  use_wandb: true
  project: fast-buffer-np
  # Run name carries a timestamp via the `${now:...}` interpolation
  # (presumably Hydra's `now` resolver — confirm).
  run_name: bav-${now:%Y%m%d-%H%M%S}
  log_interval: 10
  tags: ["bav", "toy"]