# Configuration for tabular regression training with 5GB dataset
# Uses variable feature dimensions (2-3) with fixed context size

# Compute device; cuda selects GPU training.
device: cuda

# Data configuration
data:
  # Train and validation point at the same directory; the split between them
  # is made at chunk level (see use_chunk_split).
  train_path: data/tabular/5gb
  val_path: data/tabular/5gb
  # When true, the first chunk is used for validation and the rest for training.
  use_chunk_split: true
  # Total number of chunks in the dataset.
  num_chunks: 128
  # Kept low for local testing.
  num_workers: 2
  # Safer: avoid caching entire chunk files in RAM.
  cache_chunks: false
  # Safer: fail fast if a worker stalls.
  # NOTE(review): unit is presumably seconds - confirm against the loader.
  loader_timeout: 180
  # null because each file is already a complete batch.
  batch_size: null
  # Use only the first 100 batches from the validation chunk.
  val_subset_size: 100

  # Tabular-specific settings
  # Important: selects the tabular dataset.
  dataset_type: tabular
  # Supported (variable) feature dimensions.
  feature_dims: [2, 3]

# Model configuration
model:
  # Maximum feature dimension; both 2D and 3D inputs are handled.
  dim_x: 3
  # 1D output (the regression target).
  dim_y: 1
  dim_model: 128
  # Fixed number of buffer points.
  max_buffer_size: 32
  # Fixed number of target points.
  num_target_points: 256
  # Set to match max_buffer_size.
  targets_block_size_for_buffer_attend: 32
  q_block_size: 128
  kv_block_size: 128
  # 256 targets / 32 block size = 8 chunks.
  attending_chunks: 8

  # Important: the tabular type selects TabularEmbedder.
  embedder:
    type: tabular
    hidden_dim: 256
    depth: 3
    # Supports features of up to 3 dimensions (same value as dim_x).
    max_dim_x: 3

  backbone:
    num_layers: 6
    num_heads: 4
    dim_feedforward: 256
    dropout: 0.0

  head:
    # Single channel for regression.
    type: MixtureGaussian
    dim_feedforward: 256
    num_components: 20
    # Minimum std for numerical stability.
    # Written as 1.0e-3 (not 1e-3): YAML 1.1 loaders such as PyYAML only
    # resolve exponent notation as a float when the mantissa contains a
    # decimal point; without it the value loads as the string "1e-3".
    std_min: 1.0e-3

  # Shapes to precompile for common context sizes with 32 buffer points.
  # Each entry is [context+buffer, target].
  precompile_shapes:
    - [160, 256]  # context=128 + buffer=32

  # Training mask options
  # false selects the variant without diagonal self-attention during training.
  include_diagonal_mask: false

# Optimizer configuration
optimizer:
  name: adamw
  # Base LR kept low to reduce the peak step size at the end of warmup.
  # Written as 5.0e-5 (not 5e-5): YAML 1.1 loaders such as PyYAML only
  # resolve exponent notation as a float when the mantissa contains a
  # decimal point; without it the value loads as the string "5e-5".
  lr: 5.0e-5
  betas: [0.9, 0.95]
  # Weight decay is disabled while MoG training is being stabilized.
  weight_decay: 0.0

# Scheduler configuration
scheduler:
  use_scheduler: true
  name: cosine_with_warmup
  # Warmup is kept short to reduce the risk of overshoot.
  # NOTE(review): presumably a fraction of total training steps - confirm.
  warmup_ratio: 0.02

# Training configuration
training:
  num_epochs: 50
  grad_clip: 0.5

  # Compilation settings.
  compile_model: true
  compile_mask: true
  compile_mode: default
  fullgraph: false
  dynamic: false
  prewarm_compilation: true

  # Mixed precision: AMP is off so training runs in FP32 for numerical
  # stability.
  # NOTE(review): amp_dtype is presumably only consulted when use_amp is
  # true - confirm.
  use_amp: false
  amp_dtype: bfloat16

  val_interval: 1

# Checkpoint configuration
checkpoint:
  # Output directory: a date-stamped folder containing a time-stamped subfolder.
  # NOTE(review): ${now:...} looks like Hydra's `now` resolver - confirm the
  # config loader provides it. Quoted so the value is always read as one string.
  save_dir: 'checkpoints/tabular_5gb_${now:%Y-%m-%d}/${now:%H-%M-%S}'
  save_interval: 10

# Logging configuration
logging:
  use_wandb: true
  project: ace-tabular
  # Run name carries a date-time suffix built from the same ${now:...}
  # interpolation syntax used for the checkpoint directory.
  run_name: 'tabular-5gb-${now:%Y%m%d-%H%M%S}'
  log_interval: 50
  tags:
    - 'ace'
    - 'tabular'
    - '5gb'
    - '256target'
    - '32buffer'
