# Base config referenced by this file.
# NOTE(review): presumably the project loader merges it first and the keys
# below override it — confirm against the config loader.
includes: projects/task/ft.yaml
# Dataset section: processor/aligner class names plus the CrossTask data
# locations and windowing settings for this task.
dataset:
  meta_processor: CrossTaskMetaProcessor
  # train_path/val_path are flagged as dummies by the original author; each
  # repeats the corresponding *_csv_path value.
  # NOTE(review): presumably only the *_csv_path keys are actually read — confirm.
  train_path: data/crosstask/crosstask_release/videos.csv  # dummy
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv  # dummy
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  # NOTE(review): presumably a directory of pre-extracted video features — confirm.
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  # NOTE(review): presumably the number of training videos per task — confirm.
  n_train: 30
  video_processor: CrossTaskVideoProcessor
  text_processor: CrossTaskTextProcessor
  aligner: CrossTaskAligner
  # NOTE(review): meaning not visible here; presumably a layer count — confirm.
  num_iso_layer: 12
  # NOTE(review): presumably the window stride (16) and window length (32);
  # verify against the code that reads these keys.
  sliding_window: 16
  sliding_window_size: 32
# Model section: names the model class and the class given as `mm_encoder_cls`.
# NOTE(review): presumably both are resolved by name in the project code — confirm.
model:
  model_cls: MMFusionActionLocalization
  mm_encoder_cls: MMBertForJoint
# Loss section: selects the loss by class name.
# NOTE(review): BCE presumably stands for binary cross-entropy — confirm.
loss:
  loss_cls: BCE
# Settings grouped under `fairseq` (dataset, optimization, checkpoint).
# NOTE(review): presumably forwarded to the fairseq trainer — confirm.
fairseq:
  dataset:
    batch_size: 1
  optimization:
    max_epoch: 5
  checkpoint:
    # NOTE(review): presumably the directory this run writes checkpoints to — confirm.
    save_dir: runs/task/crosstask
    # NOTE(review): presumably the checkpoint loaded before fine-tuning starts;
    # the original author marks this path as the one for VLM.
    restore_file: runs/task/checkpoint11.pt  # for VLM
