# Top-level run settings.
# NOTE(review): the meaning of every value is defined by the consuming
# framework, which is not visible in this file — confirm against it.
# Presumably selects a named Slurm resource preset — TODO confirm.
slurm_config: big
# Presumably selects a local prediction run rather than training — verify.
task_type: local_predict
# Dataset settings: the split to read, processor/aligner class names,
# input locations, and size limits.
dataset:
  # Same split name as fairseq.dataset.valid_subset in this file.
  split: test
  # Class names given as plain strings; presumably resolved by name in the
  # consuming code — verify that each class exists there.
  video_processor: VideoProcessor
  aligner: MSRVTTQAAligner
  # Presumably a pretrained BERT identifier used for text tokenization —
  # TODO confirm.
  bert_name: bert-base-uncased
  meta_processor: MSRVTTQAMetaProcessor
  # CSV file for the test split (relative path).
  test_path: data/msrvtt-qa/MSR_MC_test.csv
  # Directory of video features; the name suggests precomputed S3D features
  # for MSR-VTT — confirm against the feature-extraction step.
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTQATextProcessor
  # NOTE(review): exact semantics not visible here; presumably a layer count
  # used by the model — confirm.
  num_iso_layer: 12
  # Presumably the maximum number of video feature steps per example — verify.
  max_video_len: 32
  # Presumably the maximum sequence length — verify what it covers.
  max_len: 96
# Options grouped under `fairseq`; presumably forwarded to fairseq's own
# configuration groups (`dataset`, `common_eval`) — verify against the loader.
fairseq:
  dataset:
    batch_size: 256
    # Same split name as dataset.split in this file.
    valid_subset: test
    # Presumably the number of data-loading worker processes — confirm.
    num_workers: 2
  common_eval:
    # Checkpoint file to load; it sits in the same runs/retri/videoclip
    # directory that eval.save_path is nested under.
    path: runs/retri/videoclip/checkpoint_best.pt
# Model settings: class names as plain strings, presumably resolved by name
# in the consuming code — verify.
model:
  model_cls: MMFusionSeparate
  # Explicit null: no multimodal encoder class is named, while separate video
  # and text encoder classes are named below.
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  # Presumably the number of hidden layers in the video encoder — TODO confirm.
  num_hidden_video_layers: 6
# Evaluation output settings.
eval:
  # Output location, nested under the same runs/retri/videoclip directory as
  # fairseq.common_eval.path. The `vttqa_zs` component presumably denotes a
  # zero-shot MSR-VTT QA run — confirm.
  save_path: runs/retri/videoclip/vttqa_zs/eval
# Metric and predictor class names as plain strings; presumably resolved by
# name in the consuming code — verify that both classes exist there.
metric: QAMetric
predictor: QAPredictor
