# RUN CONFIG
# NOTE(review): the set of accepted `mode` values is not visible in this
# file -- confirm against the code that consumes it.
mode: 'train'
# NOTE(review): presumably a free-form label for this run -- confirm.
tag: 'baseline'
# NOTE(review): presumably the checkpoint file name and the output
# directory respectively; whether `saved_model_path` is resolved relative
# to `save_path` cannot be told from here -- confirm.
saved_model_path: 'best.pth'
save_path: 'experiments'

# TEXT ENCODER CONFIG
# Booleans are written in canonical lowercase (`true`/`false`), the only
# spelling that every YAML schema resolves to a boolean.
use_text_model: true
# NOTE(review): looks like a Hugging Face Hub model id -- confirm against
# the code that loads it.
text_model: 'openai/clip-vit-base-patch16'
# NOTE(review): presumably has to match the text model's output embedding
# width -- confirm before changing `text_model`.
transformer_embed_dim: 512
freeze_text_encoder_weights: true
use_pretrained_clap_weights: false

# AUDIO ENCODER CONFIG
# Booleans are written in canonical lowercase (`true`/`false`), the only
# spelling that every YAML schema resolves to a boolean.
audioenc_name: 'HTSAT'
# NOTE(review): presumably the audio encoder's output embedding width --
# confirm against the encoder implementation.
out_emb: 768
# Spectrogram front-end parameters.
# NOTE(review): `fmin`/`fmax` are presumably in Hz and
# `n_fft`/`hop_size`/`window_size` in samples -- confirm against the
# feature extractor.
fmin: 50
fmax: 8000
n_fft: 1024
hop_size: 320
mel_bins: 64
window_size: 1024
# Augmentation switches (both off in this config).
specaug: false
mixup: false
use_pretrained_audioencoder: false
freeze_audio_encoder_weights: false

# PROJECTION SPACE CONFIG
# NOTE(review): presumably the width of the shared space both encoders are
# projected into -- confirm.
d_proj: 1024
# NOTE(review): how this value is applied (divisor vs. multiplier, fixed
# vs. initial value of a learned parameter) is not visible here -- confirm.
temperature: 0.003

# TRAINING AND EVALUATION CONFIG
# Booleans are written in canonical lowercase (`true`/`false`), the only
# spelling that every YAML schema resolves to a boolean.
epochs: 60
# NOTE(review): -1 looks like a sentinel value; what it selects (disabled,
# once per epoch, ...) is not visible here -- confirm.
evaluation_step: -1
batch_size: 384
# NOTE(review): unit (optimizer steps vs. epochs) is not visible here --
# confirm.
warmup: 2000
num_workers: 0
# Keep plain decimal notation: exponent forms without a dot (e.g. 1e-4)
# are loaded as strings by some YAML 1.1 parsers such as PyYAML.
lr: 0.0001
weight_decay: 0.0
data_parallel: false
demo: false

# DATASET CONFIGS
# Booleans are written in canonical lowercase (`true`/`false`), the only
# spelling that every YAML schema resolves to a boolean.
dataset_config:
  use_precomputed_melspec: false
  # NOTE(review): presumably Hz -- confirm.
  sampling_rate: 44100
  # NOTE(review): presumably clip length in seconds -- confirm.
  duration: 7
  # NOTE(review): presumably maximum token lengths for the text encoder
  # and text decoder inputs respectively -- confirm.
  enc_text_len: 40
  dec_text_len: 77
  use_cache: false

# TEXT DECODER CONFIG
# Booleans are written in canonical lowercase (`true`/`false`), the only
# spelling that every YAML schema resolves to a boolean.
# NOTE(review): looks like a Hugging Face Hub model id -- confirm.
text_decoder: 'gpt2'
# NOTE(review): how `prefix_length` and `prefix_length_clip` differ is not
# visible here; both are set to the same value -- confirm.
prefix_length: 40
prefix_length_clip: 40
# NOTE(review): the set of accepted `mapping_type` values is not visible
# here; `num_layers` presumably sizes that mapping network -- confirm.
mapping_type: 'transformer'
num_layers: 8
cross_attention: false
normalize_prefix: true
freeze_gpt_weights: true