# Top-level run settings.
# NOTE(review): the consuming code is not visible from this file; the
# descriptions below are inferred from key names — confirm against the loader.
mode: 'train'
exp_name: 'exp'
# Dataset selection; the commented-out line is the alternative value.
# dataset: 'Clotho'
dataset: 'AudioCaps'
text_encoder: 'bert'  # presumably selects the bert_encoder section — verify
joint_embed: 1024  # presumably the joint embedding size — TODO confirm

# Audio front-end parameters.
# NOTE(review): units are not stated in this file; sr is presumably in Hz and
# window_size / hop_length in samples — confirm against the feature extractor.
wav:
  sr: 32000
  window_size: 1024
  hop_length: 320
  mel_bins: 64

# Settings for the BERT text encoder.
bert_encoder:
  type: 'bert-base-uncased'
  # Canonical boolean: the `Yes` spelling is a boolean only under YAML 1.1;
  # a YAML 1.2 loader would read it as the string "Yes".
  freeze: true

# Settings for the CNN audio encoder.
# Booleans use the canonical `true`/`false` spelling: `Yes` is a boolean only
# under YAML 1.1 and would load as the string "Yes" under YAML 1.2.
cnn_encoder:
  model: 'ResNet38'
  pretrained: true
  freeze: true

# Data-loading settings.
# NOTE(review): presumably forwarded to the data loader — confirm.
data:
  batch_size: 8
  # Alternative batch size, kept commented out for quick switching.
  # batch_size: 16
  num_workers: 8

# Optimisation and loss settings.
# Booleans are written as `true`/`false`. The `Yes`/`No` spellings are
# booleans only under YAML 1.1; a YAML 1.2 loader reads them as non-empty
# (truthy) strings, so `No` would silently behave like "enabled".
training:
  epsilon: 0.03
  m: 0.95
  margin: 0.2
  freeze: false
  # Listed alternatives are kept verbatim from the original comment.
  # NOTE(review): 'mahalalobis' looks like a misspelling of 'mahalanobis';
  # left untouched in case the code matches this exact string — verify.
  loss: 'ntxent'  # 'triplet', 'weight', 'ot', 'ntxent', 'mahalalobis'
  use_pot: false
  reg: 0
  spec_augmentation: true
  epochs: 10
  separate_backbone_lr: false
  # The explicit !!float tag keeps these values numeric on YAML 1.1 loaders,
  # whose float pattern requires a decimal point and would otherwise read a
  # bare 5e-5 as a string.
  backbone_lr: !!float 5e-5
  lr: !!float 5e-5
  clip_grad: 2
  seed: 1997
  # NOTE(review): resume is enabled while path.resume_model is an empty
  # string — confirm how the trainer handles that combination.
  resume: true
  l2_norm: true
  dropout: 0.2
  use_cosine: false
  noise_p: 0.0
  semi_ratio: 0.0
  use_float: false
  float_lambda: 0.5
  float_reg_m: 0.05


# File locations used by the run.
path:
  # `{}` is a placeholder — presumably filled in with the dataset name by the
  # consuming code; verify.
  vocabulary: 'data/{}/pickles/words_list.p'
  word2vec: 'pretrained_models/w2v_all_vocabulary.model'
  # Empty string: no checkpoint path is configured here.
  resume_model: ''
