# Identifier for this run/configuration. The name appears to encode the model
# (model.name: "ViT") and the clip length (audio.max_time: 5) — keep them in sync
# when cloning this file.
experiment_name: "vit-t=5"

# Experiment-logging settings.
logger:
  project: "sonics"
  # Name of the metric treated as primary; presumably drives best-checkpoint
  # selection — confirm against the training code.
  primary_metric: "f1"

# Runtime settings: RNG seed, mixed-precision toggle, and worker count.
environment:
  seed: 42
  mixed_precision: true
  num_workers: 8  # presumably data-loader worker processes — TODO confirm

# CSV files for the train / validation / test splits.
# The paths are relative; the directory they are resolved against is decided by
# the consuming code, which is not visible here.
dataset:
  train_dataframe: "train.csv"
  valid_dataframe: "valid.csv"
  test_dataframe: "test.csv"

# Raw-audio loading and clip-sampling options.
audio:
  sample_rate: 16000  # presumably Hz — TODO confirm
  # With a 16000 Hz sample rate (assumed), 5 s corresponds to 80000 samples per clip.
  max_time: 5  # in seconds
  random_sampling: true
  normalize: true
  skip_time: false

# Mel-spectrogram parameters.
melspec:
  n_fft: 2048
  hop_length: 512
  win_length: 2048  # same value as n_fft
  n_mels: 128  # matches the first entry of model.input_shape
  f_min: 20
  f_max: 8000  # half of audio.sample_rate (16000)
  power: 2
  top_db: 80
  norm: "mean_std"  # Options: "min_max", "mean_std", "simple", or null for no normalization

# Model architecture and checkpoint settings.
model:
  name: "ViT"  # Options: "SpecTTTra" or "ViT"
  # NOTE(review): audio.max_time (5 s) at audio.sample_rate (16000) with
  # melspec.hop_length (512) gives roughly 157 spectrogram frames, not 128.
  # Presumably the spectrogram is resized or cropped to this shape — confirm.
  input_shape: [128, 128] # [n_mels, n_frames]
  embed_dim: 384
  num_heads: 6  # embed_dim / num_heads = 64
  num_layers: 12
  # Assuming square, non-overlapping patches, a 128 x 128 input yields 8 x 8 = 64 patches.
  patch_size: 16
  patch_norm: true # Normalize patch embeddings
  pe_learnable: true
  pos_drop_rate: 0.1
  attn_drop_rate: 0.1
  proj_drop_rate: 0.0
  mlp_ratio: 2.67
  use_init_weights: false # Use custom initialization weights
  # null here; presumably accepts a checkpoint path to resume from — TODO confirm.
  resume: null

# Loss settings.
# NOTE(review): the name matches torch.nn.BCEWithLogitsLoss, which takes no
# label_smoothing argument, so smoothing is presumably applied by project code
# before the loss is computed — confirm.
loss:
  name: "BCEWithLogitsLoss"
  label_smoothing: 0.02

# Top-level key (not nested under model or loss). A value of 1 is consistent with
# the binary-style loss.name ("BCEWithLogitsLoss") configured in this file.
num_classes: 1  # Adjust based on your classification task

# Training-loop settings.
training:
  batch_size: 256  # same value as scheduler.lr_base_size
  epochs: 50

# Validation-loop settings.
validation:
  batch_size: 256  # same value as training.batch_size
    
# Optimizer settings.
optimizer:
  opt: "adamw"
  opt_eps: 0.00000001  # 1e-8
  opt_betas: [0.9, 0.999]
  # NOTE(review): presumably only relevant to momentum-based optimizers and
  # unused with opt: "adamw" — confirm.
  momentum: 0.9
  weight_decay: 0.05
  grad_accum_steps: 1
  clip_grad_norm: 5.0


# Learning-rate schedule settings.
# Because lr is set explicitly, the lr_base* keys are (per the lr comment below)
# overridden and should not affect the effective learning rate.
scheduler:
  sched: "cosine"  # Type of scheduler: cosine, exp, step
  lr: 0.0008 # overrides lr_base if set
  lr_base: 0.001 # base learning rate: lr = lr_base * global_batch_size / base_size
  lr_base_size: 256 # base learning rate batch size
  lr_base_scale: "linear" # learning rate vs batch_size scaling ("linear", "sqrt")
  warmup_lr: 0.000001
  min_lr: 0.0
  warmup_epochs: 5  # out of training.epochs: 50
  # NOTE(review): presumably the decay factor for the "step"/"exp" schedules and
  # unused with sched: "cosine" — confirm.
  decay_rate: 0.1

# Data-augmentation settings: mixup plus time/frequency masking.
# The *_p keys are presumably application probabilities — TODO confirm.
augment:
  mixup_alpha: 2.5
  mixup_p: 0.5
  n_time_masks: 2
  time_mask_param: 8
  n_freq_masks: 1
  freq_mask_param: 8
  time_freq_mask_p: 0.5