---
# Model/run configuration: an architecture name plus a flat mapping of
# arguments.
# NOTE(review): the per-key descriptions below are inferred from the key names
# only; confirm them against the code that loads this file.
arch: 'Transformer'
args:
  # Loss terms listed for this configuration.
  loss_names: ['mae_audio', 'mae_frame', 'contrastive', 'vam']
  # Empty string here; presumably means "no local checkpoint" -- TODO confirm.
  load_local_path: ''
  init_classifier: false
  norm_pix_loss: false
  # Loss weights (presumably per-term multipliers -- verify against consumer).
  mae_loss_weight: 1.0
  contrast_loss_weight: 0.1
  # NOTE(review): 'vam' appears in loss_names while this weight is 0.0 --
  # confirm that this combination is intended.
  vam_tvlt_loss_weight: 0.0
  # Recall-metric switches; both are disabled in this configuration.
  get_va_recall_metric: false
  get_tvlt_va_recall_metric: false
  # Presumably the contrastive temperature -- TODO confirm.
  tau: 0.05