# Top-level model settings.
name: xnet_multi
bundle_seq_length: 1
latent_dim: 256
num_layers: 1
decouple_mu: true
# Names of the conditioning inputs, one per line.
conditioning:
  - "timestep"
  - "itg"
  - "dg"
  - "s_hat"
  - "q"
# Weights for the primary loss terms.
# Floats carry an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `1e-5` as the string "1e-5", whereas `1.0e-5` is a float
# under both YAML 1.1 and YAML 1.2.
loss_weights:
  df: 1.0
  phi: 0.0  # zero weight; presumably disables this term -- TODO confirm
  flux: 1.0e-5
# Weights for the additional loss terms.
# Floats carry an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `1e-4` as the string "1e-4", whereas `1.0e-4` is a float
# under both YAML 1.1 and YAML 1.2.
extra_loss_weights:
  phi_int: 0.1
  flux_int: 1.0e-4
  # Zero weights; presumably these terms are disabled -- TODO confirm.
  phi_cross: 0.0
  flux_cross: 0.0

# Settings grouped under the `swin` key.
# NOTE(review): presumably a Swin-Transformer-style backbone -- confirm
# against the code that consumes this config.
swin:
  # Patch / window extents. The main variants have five entries and the
  # `phi_` variants three; presumably one entry per axis -- TODO confirm.
  patch_size: [4, 2, 4, 5, 4]
  phi_patch_size: [5, 4, 4]
  window_size: [4, 2, 2, 9, 4]
  phi_window_size: [9, 2, 4]
  # Single-entry lists; presumably one entry per stage -- TODO confirm.
  num_heads: [8]
  depth: [2]
  gradient_checkpoint: false
  merging_hidden_ratio: 4.0
  unmerging_hidden_ratio: 8.0
  c_multiplier: 2
  norm_output: false
  use_abs_pe: false
  act_fn: GELU
  patch_skip: true
  modulation: "film"
  drop_path: 0.1
  swin_bottleneck: true
  latent_cross_attn: true
  # NOTE(review): `rpb` / `rope` look like relative position bias and rotary
  # position embedding toggles -- confirm.
  use_rpb: true
  use_rope: false