# Model configuration. `_target_` names the class to build; the remaining keys
# are presumably forwarded to it as keyword arguments (Hydra-style
# instantiation) - confirm against the constructor of the target class.
# NOTE(review): the section headings below group keys by their names only;
# the authoritative meaning of each key lives in the target class.
_target_: models.dit.audio_dit.InputFusionAudioDiT

# Input / output layout
img_size: 1000
patch_size: 1
in_chans: 128
out_chans: 128
input_type: '1d'  # kept quoted so it is always read as a string

# Transformer size
embed_dim: 1024
depth: 24
num_heads: 16
mlp_ratio: 4.0

# Attention, normalisation and activation choices
qkv_bias: false
qk_scale: null
qk_norm: 'layernorm'
norm_layer: 'layernorm'
act_layer: 'geglu'
context_norm: true
use_checkpoint: false

# Time-conditioning options
time_fusion: 'ada'
ada_sola_rank: 32
ada_sola_alpha: 32

# Context-conditioning options
cls_dim: null
ta_context_dim: 1024
context_dim: 1024
context_fusion: 'cross'
context_max_length: null
context_pe_method: 'none'  # the string 'none', not YAML null

# Positional-encoding and remaining structural switches
pe_method: 'conv'
rope_mode: 'shared'
use_conv: true
skip: true
skip_norm: true