# Top-level experiment settings.
# NOTE(review): the code that reads these keys is not visible from this file;
# the per-key notes below are assumptions to confirm against it.
mode: train  # presumably the run mode
exp_name: exp  # presumably a label for this run
dataset: AudioCaps
text_encoder: bert  # presumably selects the `bert_encoder` section
joint_embed: 1024  # presumably the size of the shared audio/text embedding

# Audio front-end settings.
# NOTE(review): units are not stated in this file; presumably `sr` is in Hz
# and `window_size` / `hop_length` are in samples. Confirm in the feature
# extraction code.
wav:
  sr: 32000
  window_size: 1024
  hop_length: 320
  mel_bins: 64

# Settings for the BERT text encoder.
bert_encoder:
  # NOTE(review): presumably a pretrained-model identifier; confirm how the
  # loader resolves it.
  type: 'bert-base-uncased'
  # Canonical boolean. The previous spelling `Yes` is a boolean only under
  # YAML 1.1 loaders; YAML 1.2 core-schema loaders read it as a string.
  freeze: true

# Settings for the CNN encoder (presumably the audio branch; confirm).
cnn_encoder:
  model: 'ResNet38'
  # Canonical booleans. The previous spelling `Yes` is a boolean only under
  # YAML 1.1 loaders; YAML 1.2 core-schema loaders read it as a string.
  pretrained: true
  freeze: true

# Batching settings.
# NOTE(review): presumably forwarded to the data loader; confirm.
data:
  # Samples per batch.
  batch_size: 256
  # NOTE(review): presumably the number of loader worker processes.
  num_workers: 8

# Optimisation and loss settings.
# All flags use the canonical `true` / `false` spelling. The earlier mix of
# `Yes` / `No` / `True` / `False` parsed uniformly as booleans only under
# YAML 1.1 loaders; under the YAML 1.2 core schema `Yes` / `No` are strings.
# NOTE(review): the values of `epsilon`, `m`, `lr` and `noise_p` also appear in
# the run-directory names under `path.resume_model`; presumably the training
# code builds those names from these keys. Confirm before renaming any.
training:
  epsilon: 0.05
  m: 0.95
  margin: 0.2
  freeze: false
  # Selected loss. Names listed alongside this key:
  # 'triplet', 'weight', 'contrastive', 'pot', 'ntxent', 'wloss', 'ot'
  loss: ntxent
  use_ot: true
  reg: 0
  spec_augmentation: true
  epochs: 10
  # The explicit tag keeps this a float: some YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` as a string because it has no decimal point.
  lr: !!float 1e-4
  clip_grad: 2
  seed: 1994
  # NOTE(review): presumably enables loading from `path.resume_model`.
  resume: false
  l2_norm: true
  dropout: 0.2
  use_cosine: false
  noise_p: 0

# File-system locations.
path:
  # NOTE(review): `{}` looks like a format placeholder, presumably filled with
  # the dataset name; confirm in the loading code.
  vocabulary: 'data/{}/pickles/words_list.p'
  word2vec: 'pretrained_models/w2v_all_vocabulary.model'
  # Checkpoint directory; presumably read when `training.resume` is enabled.
  # Keep exactly one `resume_model` key active: duplicate keys are invalid
  # YAML, and most parsers silently keep only the last one.
  resume_model: 'noisy-output/NTXent-baseline_data_AudioCaps_noise0_eps0.05_m0.95_lr_0.0001_/models'
  # Alternative checkpoints (swap with the active line above to use one):
  # resume_model: 'rbf-output/0.1reg-reproduce_data_AudioCaps_noise0_eps0.05_m0.95_lr_0.0001_/models'
  # resume_model: 'noisy-output/Triplet-baseline_data_AudioCaps_noise0_eps0.05_m0.95_lr_0.0001_/models'
  # resume_model: 'rbf-output/NoReg-Eigen-Maha-Distance_data_AudioCaps_noise0_eps0.03_m0.95_lr_0.0001_/models'
  # resume_model: 'noisy-output/NTXent-baseline-noOT_data_AudioCaps_noise0_eps0.05_m0.95_lr_0.0001_/models'
  # resume_model: 'noisy-output/Nonoise-Maha-distance_data_AudioCaps_noise0_eps0.05_m0.95_lr_0.0001_/models'
  # resume_model: 'rbf-output/1.0-rbf-Maha-Distance_data_AudioCaps_noise0_eps0.03_m0.95_lr_0.0001_/models'
