# @package _global_

# Hydra defaults list: each `override` entry replaces the option selected for
# the named config group (dataset, training, autoencoder).
defaults:
  - override /dataset: cyclone_small
  # Alternative dataset option, currently disabled:
  # - override /dataset: cyclone_single
  - override /training: muon
  - override /autoencoder: vae

# Training overrides applied on top of the selected training option.
training:
  # Disabled overrides, kept here for quick experimentation:
  # n_epochs: 200
  # batch_size: 16
  loss_type: "mse"

# Logging options for this experiment.
logging:
  compression_ratio: true
  tags:
    - "v2"
  name_suffix: "vae"

# Autoencoder overrides applied on top of the selected autoencoder option.
autoencoder:
  latent_dim: 512
  act_fn: "GELU"
  decouple_mu: true
  # Listed choices: "xavier_uniform", "truncnormal", "torch"
  init_weights: "xavier_uniform"
  patching_init_weights: "xavier_uniform"
  # Listed choices: "kaiming_uniform", "normal_smallvar", "xavier_uniform"
  cond_init_weights: "xavier_uniform"
  conditioning:
    - "itg"
    - "dg"
    - "s_hat"
    - "q"

  # VAE specific parameters
  # NOTE(review): beta_vae and loss_weights.kl_div both look like KL weights;
  # confirm how the training code combines them.
  beta_vae: 1.0  # β-VAE weighting factor for KL divergence
  kl_warmup_epochs: 6  # Number of epochs to linearly warm up KL loss
  # NOTE(review): 0.0 presumably means no free-bits floor is applied — confirm.
  free_bits: 0.0  # Free bits for KL loss (prevent posterior collapse)

  # Scheduler setting per loss term (df, kl_div); both are null here.
  # NOTE(review): null presumably means "no schedule" — confirm with the consumer.
  loss_scheduler:
    df: null
    kl_div: null
  # Weight per loss term (df, kl_div); both are 1.0 here.
  loss_weights:
    df: 1.0
    kl_div: 1.0

  # Patch and window sizes; each list has five entries (presumably one per
  # input axis — confirm).
  # NOTE(review): the 0 in the second position is unusual for a size; confirm
  # it is the intended sentinel for that axis.
  patch:
    patch_size: [4, 0, 2, 5, 4]
    window_size: [4, 0, 4, 9, 4]

  # Transformer (ViT) settings.
  vit:
    # Single-element lists (presumably one entry per stage — confirm).
    num_heads:
      - 8
    depth:
      - 4
    gradient_checkpoint: true
    # Position-related switches (names suggest absolute PE, relative position
    # bias and RoPE); only use_rpb is enabled.
    use_abs_pe: false
    use_rpb: true
    use_rope: false
    gated_attention: false
    modulation: "dit"
    mid_norm_learnable: true
    norm_layer: "LayerNorm"

  # Bottleneck settings.
  # NOTE(review): num_heads and depth are plain integers here, unlike the
  # lists under vit — presumably a single stage; confirm against the model code.
  bottleneck:
    dim: 32
    norm_learnable: true
    num_heads: 2
    depth: 2
    use_linear: false