# Autoencoder stage: checkpoint identity, loss weights, training schedule,
# parameter counts, logging cadence, and optimizer hyperparameters.
autoencoder:
  model:
    # The prefix text repeats values that appear elsewhere in this file
    # (num_latents 16, dataset name "wikipedia", suffix "v1.0", length 128).
    # NOTE(review): presumably assembled from those keys by the code; confirm
    # before editing it by hand.
    checkpoints_prefix: autoencoder-num_latents=16-wikipedia-v1.0-128
    # Same text encoder name and freeze flag as encoder.model / decoder.model.
    text_encoder: bert-base-cased
    text_encoder_freeze_params: true
    # NOTE(review): presumably the data-loader worker count; confirm.
    num_workers: 30
    # null here; diffusion.model.load_checkpoint holds a boolean instead.
    load_checkpoint: null
  loss:
    # Six equal weights. NOTE(review): what each "level" refers to is not
    # visible in this file; confirm against the loss implementation.
    level_weights:
    - 1.0
    - 1.0
    - 1.0
    - 1.0
    - 1.0
    - 1.0
  training:
    training_iters: 100000
    # Both values are 512. NOTE(review): presumably the global batch equals
    # the per-GPU batch only for a single-GPU run; confirm.
    batch_size: 512
    batch_size_per_gpu: 512
  # Parameter counts; total equals text_encoder + encoder + decoder
  # (0 + 99287040 + 126266162 = 225553202).
  # NOTE(review): these look like values written back by the program rather
  # than user-set options; confirm.
  params:
    text_encoder: 0
    encoder: 99287040
    decoder: 126266162
    total: 225553202
  # This is the plain string "dict()", not an empty YAML mapping ({}).
  # NOTE(review): looks like a placeholder filled in at runtime; confirm.
  all_params: dict()
  logging:
    # NOTE(review): presumably measured in training iterations; confirm.
    log_freq: 10
    eval_freq: 20000
    save_freq: 20000
  optimizer:
    name: stableadam
    learning_rate: 0.0002
    # Keep the "1.0e-08" spelling: some YAML 1.1 loaders read exponent
    # notation without a dot in the mantissa as a string, not a float.
    warmup_lr: 1.0e-08
    min_lr: 0.0001
    weight_decay: 1.0e-05
    eps: 1.0e-06
    betas:
    - 0.9
    - 0.98
    # NOTE(review): 10 here versus 1000 in diffusion.optimizer; confirm the
    # short warmup is intentional.
    linear_warmup: 10
    grad_clip_norm: 10.0
# Encoder architecture and input-augmentation settings.
encoder:
  attention:
    # num_heads * head_size = 12 * 64 = 768, the same value as hidden.size.
    head_size: 64
    num_heads: 12
    probs_dropout: 0.0
    qk_norm: true
    implementation: flash_attention_2
  embedding:
    dim: 768
    # Same value (128) as dataset.max_sequence_len.
    max_position_embeddings: 128
    initializer_range: 0.02
  hidden:
    size: 768
    dropout: 0.1
    num_layers: 12
    # NOTE(review): presumably the feed-forward width multiplier
    # (4 * 768 = 3072); confirm.
    ff_mult: 4
  latent:
    dim: 768
    # Same value (16) that appears in autoencoder.model.checkpoints_prefix.
    num_latents: 16
  normalization:
    layer_eps: 1.0e-05
  model:
    # Duplicates autoencoder.model.text_encoder / text_encoder_freeze_params;
    # keep the copies in sync when changing either.
    text_encoder: bert-base-cased
    text_encoder_freeze_params: true
    # Single scalar here; decoder.model uses a list-valued
    # "mlm_probabilities" key instead.
    mlm_probability: 0.3
    bert_masking: true
  tokens:
    # NOTE(review): presumably the bert-base-cased vocabulary size; confirm.
    vocab_size: 28996
  augmentation:
    # The two weights sum to 1.0. NOTE(review): presumably the probabilities
    # of picking each augmentation; confirm.
    masking:
      encodings_mlm_probability: 0.4
      weight: 0.5
    gaussian_noise:
      delta: 0.3
      weight: 0.5
# Decoder architecture plus decoder-specific finetuning settings.
# The attention, embedding, hidden, latent, and normalization values are
# identical to the ones under "encoder".
decoder:
  attention:
    # num_heads * head_size = 12 * 64 = 768, the same value as hidden.size.
    head_size: 64
    num_heads: 12
    probs_dropout: 0.0
    qk_norm: true
    implementation: flash_attention_2
  embedding:
    dim: 768
    # Same value (128) as dataset.max_sequence_len.
    max_position_embeddings: 128
    initializer_range: 0.02
  hidden:
    size: 768
    dropout: 0.1
    num_layers: 12
    ff_mult: 4
  latent:
    dim: 768
    num_latents: 16
  normalization:
    layer_eps: 1.0e-05
  model:
    text_encoder: bert-base-cased
    text_encoder_freeze_params: true
    # A list with one entry (1.0), unlike the scalar encoder.model.mlm_probability.
    # NOTE(review): presumably 1.0 means every token is masked; confirm.
    mlm_probabilities:
    - 1.0
    bert_masking: true
  tokens:
    vocab_size: 28996
    # NOTE(review): presumably the [MASK] token id of the bert-base-cased
    # tokenizer; confirm.
    mask_token_id: 103
  finetuning:
    # NOTE(review): looks like an upper bound on a noise standard deviation
    # used during decoder finetuning; confirm.
    max_std: 0.5
    training_iters: 1000
    # save_freq and eval_freq equal training_iters (1000).
    # NOTE(review): presumably a single save/eval at the end of the run; confirm.
    logging:
      log_freq: 1
      save_freq: 1000
      eval_freq: 1000
# Dataset location, sequence length, and the evaluation metrics to report.
dataset:
  name: wikipedia
  # NOTE(review): absolute, user-specific path, while the "project" section
  # builds its paths from the PROJECT_ROOT environment variable. This will not
  # resolve on another machine; consider making it configurable.
  dataset_path: /home/jovyan/vmeshchaninov/echimbulatov/LatentDiffusion/data/wikipedia
  # Same value (128) as the encoder/decoder max_position_embeddings.
  max_sequence_len: 128
  # NOTE(review): the "cfg" prefix presumably relates to
  # diffusion.generation.cfg_coef (also 0.0); confirm.
  swap_cfg_coef: 0.0
  metrics:
  - mauve
  - ppl
  - div
# Tokenizer options.
# NOTE(review): the key names match Hugging Face tokenizer call arguments, so
# this mapping is presumably forwarded as keyword arguments; confirm.
tokenizer:
  add_special_tokens: true
  padding: true
  truncation: true
  # NOTE(review): "pt" presumably requests PyTorch tensors; confirm.
  return_tensors: pt
  return_attention_mask: true
  return_token_type_ids: false
# Diffusion stage: checkpointing, training schedule, finetuning, generation,
# process and sampler settings, score-estimator architecture, and optimizer.
diffusion:
  model:
    checkpoints_prefix: diffusion
    num_workers: 30
    # Boolean here, whereas autoencoder.model.load_checkpoint is null.
    # NOTE(review): confirm which type the loading code expects.
    load_checkpoint: true
  training:
    training_iters: 200000
    # Both values are 1024, as with the autoencoder's matching pair.
    batch_size: 1024
    batch_size_per_gpu: 1024
  finetuning:
    # NOTE(review): is_finetuning is true while finetuning_checkpoint is null;
    # confirm the checkpoint is supplied elsewhere (e.g. a command-line override).
    is_finetuning: true
    finetuning_checkpoint: null
    get_statistics: true
  generation:
    # num_gen_texts and num_texts_from_metric are both 15000.
    num_gen_texts: 15000
    texts_dir_path: generated_texts
    num_texts_from_metric: 15000
    # NOTE(review): presumably a classifier-free-guidance coefficient, with
    # 0.0 disabling guidance; confirm.
    cfg_coef: 0.0
  diffusion:
    # NOTE(review): T, eps, and t_min look like diffusion-time bounds; their
    # exact roles are not visible in this file. Confirm.
    T: 1.0
    eps: 1.0e-05
    use_self_cond: true
    t_min: 0.05
    is_conditional: true
    finetune: true
    # Interpolation: always resolves to the is_conditional value above.
    conditional_encoder: ${diffusion.diffusion.is_conditional}
    # NOTE(review): "ae" presumably stands for the autoencoder; confirm.
    conditioning: ae
    res_connect: false
  ema:
    decay: 0.9999
  dynamic:
    # The key is quoted because YAML 1.1 loaders treat a bare N as a boolean.
    # NOTE(review): presumably the number of sampling steps; confirm.
    'N': 200
    scheduler: tanh
    # NOTE(review): presumably a parameter of the scheduler named above; confirm.
    d: 5
    solver: euler
    ode_sampling: false
  params:
    # NOTE(review): 0 looks like a placeholder overwritten at runtime; confirm.
    score_estimator: 0
  # This is the plain string "dict()", not an empty YAML mapping ({}).
  # NOTE(review): looks like a placeholder filled in at runtime; confirm.
  all_params: dict()
  logging:
    log_freq: 10
    eval_freq: 10000
    save_freq: 10000
  architecture:
    # unconditional_encoder and conditional_encoder hold identical values.
    unconditional_encoder:
      hidden_size: 768
      # 3072 = 4 * hidden_size.
      intermediate_size: 3072
      num_hidden_layers: 12
      # num_attention_heads * attention_head_size = 12 * 64 = hidden_size.
      num_attention_heads: 12
      attention_head_size: 64
      attention_probs_dropout_prob: 0.0
      layer_norm_eps: 1.0e-05
      rope_theta: 10000.0
      # 512 here, versus 128 for the autoencoder's encoder and decoder.
      max_position_embeddings: 512
      embedding_dim: 768
    time_embedding:
      # NOTE(review): 10 is small if this is the period base of a sinusoidal
      # timestep embedding; confirm it is intentional.
      max_period: 10
    conditional_encoder:
      hidden_size: 768
      intermediate_size: 3072
      num_hidden_layers: 12
      num_attention_heads: 12
      attention_head_size: 64
      attention_probs_dropout_prob: 0.0
      layer_norm_eps: 1.0e-05
      rope_theta: 10000.0
      max_position_embeddings: 512
      embedding_dim: 768
  optimizer:
    name: adamw
    learning_rate: 0.0002
    # Keep the "1.0e-08" spelling: some YAML 1.1 loaders read exponent
    # notation without a dot in the mantissa as a string, not a float.
    warmup_lr: 1.0e-08
    # min_lr equals learning_rate.
    # NOTE(review): presumably this means no decay after warmup; confirm.
    min_lr: 0.0002
    weight_decay: 1.0e-05
    eps: 1.0e-06
    betas:
    - 0.9
    - 0.98
    linear_warmup: 1000
    grad_clip_norm: 1.0
# Project-level paths, seed, and run name.
project:
  # Read from the PROJECT_ROOT environment variable. No fallback is given, so
  # the variable must be set for this interpolation to resolve.
  path: ${oc.env:PROJECT_ROOT}
  seed: 0
  # Both directories are derived from project.path above.
  output_dir: ${project.path}/outputs/
  checkpoint_dir: ${project.path}/checkpoints/
  name: nips_submission
# Distributed-training settings.
ddp:
  enabled: true
  # NOTE(review): both ranks are 0 here; presumably overwritten per process at
  # launch time. Confirm.
  local_rank: 0
  global_rank: 0
  # Read from the WORLD_SIZE environment variable; falls back to 1 when unset.
  world_size: ${oc.env:WORLD_SIZE,1}
# The value matches the name of the top-level "autoencoder" section.
# NOTE(review): presumably selects which stage to train; confirm.
training: autoencoder
# The same "v1.0" text appears inside autoencoder.model.checkpoints_prefix.
suffix: v1.0
