# Top-level model settings.
# NOTE(review): `name` presumably selects the `swin:` section of this file;
# confirm against the code that loads this config.
name: swin
bundle_seq_length: 1
latent_dim: 256
num_layers: 1
decouple_mu: true
# Five string identifiers; their meaning is defined by the consuming code.
conditioning: ['timestep', 'itg', 'dg', 's_hat', 'q']
loss_weights:
  df: 1.0
# No extra loss terms are enabled. This is an explicit empty mapping because a
# bare `extra_loss_weights:` followed only by a comment parses as null, not as
# an empty mapping. To enable a term, replace `{}` with a block mapping, e.g.:
#   extra_loss_weights:
#     flux: 1.0
extra_loss_weights: {}

# Hyperparameters nested under the `swin` key.
# NOTE(review): presumably read when `name: swin` is selected; confirm
# against the loader.
swin:
  # Two five-element integer lists.
  # NOTE(review): likely one entry per input axis; verify against the model.
  patch_size: [4, 2, 4, 5, 4]
  window_size: [4, 2, 2, 9, 4]
  # Single-element lists.
  # NOTE(review): possibly one entry per stage; confirm with the consumer.
  num_heads: [8]
  depth: [2]
  gradient_checkpoint: false
  merging_hidden_ratio: 4.0
  unmerging_hidden_ratio: 8.0
  c_multiplier: 2
  norm_output: false
  use_abs_pe: false
  # Plain string; how it is resolved to an activation is up to the consumer.
  act_fn: GELU
  patch_skip: true
  modulation: 'film'
  drop_path: 0.1
  swin_bottleneck: true
  latent_cross_attn: false
  use_rpb: true
  use_rope: false