# Top-level settings for the MDLM distillation run.
name: distill-mdlm
# NOTE(review): -1 looks like a "disabled" sentinel — confirm with the consumer.
log_loss_buckets: -1

# Checkpoint to start from. `start_from_hf` is enabled, so this is presumably
# a Hugging Face Hub repo id — confirm against the loading code.
checkpoint_path: 'kuleshov-group/mdlm-owt'
start_from_hf: true

# Distillation mode. Options listed by the original author:
#   raw
#   raw-all-mask-toks
#   kl-all-mask-toks-fwd
#   kl-all-mask-toks-bwd
#   kl-all-mask-toks-js
distill_mode: raw-all-mask-toks
num_distill_steps: 4
# Same value as `sampling.*.num_steps` below; keep them in sync if that is
# intentional (TODO confirm).
orig_num_sampling_steps: 1024
sampling_mode: ancestral
loss_precision: null

# Sampling/evaluation settings. Both sub-sections are disabled (`run: false`)
# in this file.
sampling:
  # Unconditional sampling.
  uncond:
    run: false
    # Shared
    num_samples: 1024
    batch_size: 32
    from_ema: true

    # Passed to `sample`
    # n_samples: ${..batch_size}
    num_steps: 1024
    seq_len: 1024
    sampler: ancestral
    cache_preds: false
    add_bos: false
    add_eos: false

  # Sampling conditioned on a prefix drawn from `dataset`.
  cond_prefix:
    run: false
    # Shared
    num_samples: 1024
    batch_size: 32
    from_ema: true

    dataset: webtext
    seq_len: 100
    prefix_len: 50
    num_cont_per_prefix: 5
    # NOTE(review): min_seq_len (1024) exceeds seq_len (100); presumably a
    # length filter on source documents rather than on samples — confirm.
    min_seq_len: 1024

    num_steps: 1024
    sampler: ancestral
    cache_preds: false
    add_bos: false
    add_eos: false
