name: distillation
backbone: dit  # dit / dimamba / hf_dit
parameterization: mean
time_conditioning: True
subs_masking: False
causal_attention: False
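# Gumbel-softmax temperature (log10 of tau) annealing and curriculum window.
# Assumption: the -1 values below leave both schedules inactive.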
gumbel_tau_log10_start: -1
gumbel_tau_log10_end: -1
curriculum_start: -1
curriculum_end: -1

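# ${hydra:runtime.cwd} resolves to the directory the run was launched from;
# ${data.tokenizer_name_or_path} is interpolated from the composed config's data section.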
integral_cache_path: ${hydra:runtime.cwd}/integral/${data.tokenizer_name_or_path}.pkl
loss_type: kl-bwd  # kl-fwd / kl-bwd / posterior
update_teacher_every: 10_000
ignore_bos: False
T: 64
gamma_min: -4
gamma_max: -1
teacher_ema: False
posterior_loss_weight: 0.0
linear_growth_dt: False
linear_growth_min: 0.001
linear_growth_max: 0.25
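
# Minimal usage sketch, kept as comments so this file stays valid YAML. It assumes
# a Hydra entry point named main.py and that this file lives in an `algo` config
# group; adjust both to the actual repo layout:
#   python main.py algo=distillation algo.loss_type=kl-fwd algo.T=128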