 # Copyright (c) 2023 salesforce.com inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

# Model section: architecture selection, one encoder + Q-Former checkpoint per
# modality (image, pc, video, audio), and LLM / prompting options.
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b
  load_pretrained: True
  # pretrained: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  load_finetuned: False
  finetuned: ""
  stage1_url_or_filename: ""

  # Encoder name per modality (image and video name the same encoder).
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"

  # Q-Former checkpoint per modality.
  pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer.pth

  # Flags / type tags controlling which parts of each checkpoint are loaded.
  # Each modality's tag is set to its own name.
  load_attention_image_qformer: True
  load_attention_pc_qformer: True
  load_attention_video_qformer: True
  load_attention_audio_qformer: True
  load_ln_type_image: "image"
  load_ln_type_video: "video"
  load_ln_type_pc: "pc"
  load_ln_type_audio: "audio"
  load_qformer_type_image: "image"
  load_qformer_type_pc: "pc"
  load_qformer_type_video: "video"
  load_qformer_type_audio: "audio"
  load_projection_image: True
  load_projection_pc: True
  load_projection_video: True
  load_projection_audio: True
  load_projection_type_image: "image"
  load_projection_type_pc: "pc"
  load_projection_type_video: "video"
  load_projection_type_audio: "audio"

  # Extra keyword arguments per encoder; pc and audio pass none.
  image_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  pc_encoder_kwargs: {}
  video_encoder_kwargs: {"image_size": 224, "drop_path_rate": 0, "use_grad_checkpoint": False}
  audio_encoder_kwargs: {}

  # Encoder precision and freezing, per modality.
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True

  num_query_token: 32
  # Placeholder: replace with the local path of the Vicuna-7B weights.
  llm_model: /path/to/vicuna-7b
  prompt: ""
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  lora: True
  modalities: ["image", "pc", "audio", "video"]
  use_cues: True

  # Shared Q-Former options; disabled here (shared_qformer: False).
  shared_qformer: False
  pretrained_shared_qformer: null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  # Was spelled "load_projection_type_shaped"; renamed to match the other
  # *_shared keys. NOTE(review): confirm the model reads the "_shared" spelling.
  load_projection_type_shared: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512

  prefix: ""
  postfix: ""
  predict_with_gen: False

datasets:
  ## IMAGE
  ### CAPTIONING TASKS
  conceptual_caption_12m_instruct:  # 6029862 (presumably the number of train examples)
    # The original author lists images | videos | features as the options.
    data_type: images

    # Image preprocessing for each split, both with image_size 224.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        modality: image
        task: caption
      eval:
        name: blip_caption

    build_info:
      images:
        storage: /export/share/datasets/vision_language/cc12m_resize
      # Split names (train/val/test) are mapping keys: never prefix them with
      # "-", or YAML turns them into list items.
      annotations:
        train:
          # url and storage are lists (presumably paired by position); both
          # currently point at the same local file.
          # Disabled remote alternative:
          #   url:     https://storage.googleapis.com/sfr-xinstructblip-data-research/data/cc12m/x_instructblip_clean.json
          #   storage: cc12m/x-instructblip_cc12m_annotation.json
          url:
            - /export/home/.cache/lavis/clean_cc12m_org.json
          storage:
            - /export/home/.cache/lavis/clean_cc12m_org.json

  coco_caption_instruct:  # 566747 train examples
    dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: caption
        modality: image
      eval:
        name: blip_caption

    build_info:
      images:
        storage: /export/share/datasets/vision/coco/images
      # Keep split names as plain mapping keys (no leading "-"), otherwise
      # YAML parses them as list items.
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          storage: coco/annotations/coco_karpathy_train.json
          md5: "aa31ac474cf6250ebb81d18348a07ed8"
        # Disabled splits, kept for reference:
        # val:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
        #   md5: b273847456ef5580e33713b1f7de52a0
        #   storage: coco/annotations/coco_karpathy_val.json
        # test:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
        #   md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
        #   storage: coco/annotations/coco_karpathy_test.json

  capfilt14m_instruct:  # 13873136 (presumably the number of train examples)
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # images | videos | features

    # Only a train split is configured for both processors.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train

    text_processor:
      train:
        name: blip_instruction
        task: caption
        modality: image

    build_info:
      # NOTE(review): the image root is the COCO directory — confirm this is
      # intended for this dataset.
      images:
        storage: /export/share/datasets/vision/coco/images
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          # Single-item lists; both entries name the same local file.
          url:
            - /export/share/datasets/vision_language/capfilt_14m_new/annotation.json
          storage:
            - /export/share/datasets/vision_language/capfilt_14m_new/annotation.json
  
  vg_caption_instruct:  # 821774 (presumably the number of train examples)
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        modality: image
        task: caption
      eval:
        name: blip_caption

    build_info:
      images:
        # Previously the relative path vg/images/.
        storage: /export/share/datasets/vision/visual-genome/
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
          storage: vg/annotations/vg_caption.json

  sbu_caption_instruct:  # 859739 (presumably the number of train examples)
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: caption
        modality: image
      eval:
        name: blip_caption

    build_info:
      images:
        # Previously the relative path sbu_captions/images.
        storage: /export/share/datasets/vision_language/sbu_resize
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          # Disabled local alternative for url:
          #   /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
          storage:
            - sbu_captions/annotations/sbu.json


  ### QA TASKS
  vg_vqa_instruct:  # 1440069 train examples
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing: instruction processor for train, question processor
    # for eval.
    text_processor:
      train:
        name: blip_instruction
        modality: image
        task: qa
      eval:
        name: blip_question

    build_info:
      images:
        # Previously the relative path vg/images/.
        storage: /export/share/datasets/vision/visual-genome/
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
          storage: vg/annotations/vg_qa.json

  coco_vqa_instruct:  # 658104 training data
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        name: "clip_image_eval"
        image_size: 224

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        modality: image
        task: qa
      eval:
        # Was blip_caption; every other QA dataset in this file uses
        # blip_question for eval. No eval split is enabled below, so this
        # only matters once val/test are re-enabled.
        name: blip_question

    build_info:
      # Be careful not to prefix split names with a minus sign (-): YAML would
      # parse them as list items.
      annotations:
        train:
          # The train split combines two annotation files; url and storage
          # list them in the same order.
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
          storage:
            - coco/annotations/vqa_train.json
            - coco/annotations/vqa_val.json
        # Disabled splits, kept for reference:
        # val:
        #   url:
        #       # TODO make this order insensitive
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
        #   storage:
        #       - coco/annotations/vqa_val_eval.json
        #       - coco/annotations/answer_list.json
        #       - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
        #       - coco/annotations/v2_mscoco_val2014_annotations.json
        # test:
        #   url:
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
        #   storage:
        #       - coco/annotations/vqa_test.json
        #       - coco/annotations/answer_list.json
      images:
          storage: /export/share/datasets/vision/coco/images

  ocr_vqa_instruct:  # 1002146 train examples
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # images | videos | features

    # Only a train image processor is configured.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: qa
        modality: image
      eval:
        name: blip_question

    build_info:
      images:
        storage: /export/video-language-dataset/ocrvqa/images/
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          # Single-item lists; both entries name the same local file.
          url:
            - /export/video-language-dataset/ocrvqa/ocrvqa.json
          storage:
            - /export/video-language-dataset/ocrvqa/ocrvqa.json

  ok_vqa_instruct:  # 9009 (presumably the number of train examples)
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: qa
        modality: image
      eval:
        name: blip_question

    build_info:
      images:
        storage: /export/share/datasets/vision/coco/images
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
          storage:
            - okvqa/annotations/okvqa_train.json
            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
            # - okvqa/annotations/mscoco_train2014_annotations.json
        # Disabled split, kept for reference:
        # test:
        #   url:
        #       # TODO make this order insensitive
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
        #   storage:
        #       - okvqa/annotations/vqa_val_eval.json
        #       - okvqa/annotations/answer_list.json
        #       - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
        #       - okvqa/annotations/mscoco_val2014_annotations.json
  
  aok_vqa_instruct:  # 17056 (presumably the number of train examples)
    data_type: images  # images | videos | features

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: qa
        modality: image
      eval:
        name: blip_question

    build_info:
      images:
        storage: /export/share/datasets/vision/coco/images
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_train.json
        # Disabled splits, kept for reference:
        # val:
        #   url:
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
        #   storage:
        #       - aokvqa/annotations/aokvqa_v1p0_val.json
        #       - aokvqa/annotations/specialized_vocab_train_lavis.json
        #       # - aokvqa/annotations/large_vocab_train_lavis.json
        # test:
        #   url:
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
        #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
        #   storage:
        #       - aokvqa/annotations/aokvqa_v1p0_test.json
        #       - aokvqa/annotations/specialized_vocab_train_lavis.json

  ### DIALOGUE TASKS
  llava150k_dialogue_instruct:  # 394276 train examples
    data_type: images

    # Image preprocessing for each split.
    vis_processor:
      train:
        image_size: 224
        name: clip_image_train
      eval:
        image_size: 224
        name: clip_image_eval

    # Only a train text processor is configured, and it is the caption
    # processor rather than the instruction processor used elsewhere.
    text_processor:
      train:
        name: blip_caption

    build_info:
      images:
        storage: /export/share/datasets/vision/coco/images/train2017
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url:
            - https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json
          # NOTE(review): the storage filename is spelled "lava_..." while the
          # URL basename is "llava_..."; confirm before renaming, since
          # existing local copies may rely on this exact path.
          storage:
            - /export/home/LLaVA-Instruct-150K/lava_instruct_150k.json

  ## AUDIO
  audiocaps_mm_caption_instruct:  # 38701 train examples
    data_type:
      - audio

    # Audio preprocessing for each split, both at sampling_rate 16000.
    audio_processor:
      train:
        sampling_rate: 16000
        name: beats_audio
      eval:
        sampling_rate: 16000
        name: beats_audio

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: caption
        modality: audio
      eval:
        name: blip_caption

    build_info:
      audio:
        storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio
      # Extra builder arguments: ids listed as missing (presumably skipped by
      # the dataset — confirm against the builder).
      kwargs:
        missing_ids:
          - "2sh7ZkazyO8"
          - "966jA2-z0mQ"
          - "52RlolYyjAE"
          - "HVAc9hm4jjk"
          - "8lPjqvYWNyM"
          - "eXgPnnE3TuQ"
      annotations:
        train:
          url:
            - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv
          storage:
            - audiocaps/annotations/train.csv
        # Disabled splits, kept for reference:
        # val:
        #   url:
        #     - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv
        #   storage:
        #     - audiocaps/annotation/val.csv
        # test:
        #   url:
        #     - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv
        #   storage:
        #     - /export/einstein-vision/audio_datasets/audiocaps/dataset/test.csv


  audiocaps_mm_qa:  # 24158 (presumably the number of train examples)
    data_type:
      - audio

    # Audio preprocessing for each split; the eval split also sets is_eval.
    audio_processor:
      train:
        sampling_rate: 16000
        name: beats_audio
      eval:
        sampling_rate: 16000
        name: beats_audio
        is_eval: True

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: qa
        modality: audio
      eval:
        name: blip_question

    build_info:
      audio:
        storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio
      # Extra builder arguments: ids listed as missing (presumably skipped by
      # the dataset — confirm against the builder).
      kwargs:
        missing_ids:
          - "2sh7ZkazyO8"
          - "966jA2-z0mQ"
          - "52RlolYyjAE"
          - "HVAc9hm4jjk"
          - "8lPjqvYWNyM"
          - "eXgPnnE3TuQ"
      annotations:
        train:
          # Single-item lists; both entries name the same local file.
          url:
            - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/audio_qa/audio_qa_final_train.csv
          storage:
            - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/audio_qa/audio_qa_final_train.csv
        # Disabled split, kept for reference:
        # val:
        #   url:
        #     - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/audio_qa/audio_qa_final_val.csv
        #   storage:
        #     - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/audio_qa/audio_qa_final_val.csv
  
  
  ## Classification framed as captioning
  audioset_mm_caption_instruct:  # 14141 (presumably the number of train examples)
    data_type:
      - audio

    # Audio preprocessing for each split.
    audio_processor:
      train:
        sampling_rate: 16000
        name: beats_audio
      eval:
        sampling_rate: 16000
        name: beats_audio
        # NOTE(review): is_eval is False under the eval split, whereas
        # audiocaps_mm_qa sets it to True — confirm this is intended.
        is_eval: False

    # Text preprocessing: the train processor uses the classification task and
    # is given a path to a CMU dictionary pickle.
    text_processor:
      train:
        name: blip_instruction
        task: classification
        modality: audio
        cmu_dict_path: /export/home/LAVIS/cmu_dict.p
      eval:
        name: blip_caption

    build_info:
      audio:
        storage: /export/einstein-vision/audio_datasets/AudioSet/all_audio
      annotations:
        train:
          # Two files, listed in the same order in url and storage: the
          # segment list first, then the class-label index.
          url:
            - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
            - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
          storage:
            - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
            - audioset/annotations/class_labels_indices.csv
        # Disabled split, kept for reference:
        # val:
        #   url:
        #     - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
        #     - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
        #   storage:
        #     - audioset/annotations/eval_segments.csv
        #     - audioset/annotations/class_labels_indices.csv

  wavcaps_mm_caption_instruct:  # 297341 examples
    data_type:
      - audio

    # Audio preprocessing for each split; unlike the other audio datasets this
    # one also sets n_frames and frame_length.
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
        frame_length: 512
        n_frames: 2
      eval:
        name: beats_audio
        sampling_rate: 16000
        frame_length: 512
        n_frames: 2

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: caption
        modality: audio
      eval:
        name: blip_caption

    build_info:
      audio:
        storage: /export/share/datasets/audio/WavCaps/

      # Extra builder arguments: a combined JSON file plus feature-cache
      # settings (caching is switched off here).
      kwargs:
        json_data: /export/share/datasets/audio/WavCaps/json_data.json
        cached: False
        cached_dir: /export/share/datasets/audio/WavCaps/beats_features/

      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          # Five files; url and storage list identical local paths in the
          # same order.
          url:
            - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
            - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
            - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
            - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
            - /export/share/datasets/audio/WavCaps/json_data.json
          storage:
            - /export/share/datasets/audio/WavCaps/json_files/BBC_Sound_Effects/bbc_final.json
            - /export/share/datasets/audio/WavCaps/json_files/FreeSound/fsd_final.json
            - /export/share/datasets/audio/WavCaps/json_files/SoundBible/sb_final.json
            - /export/share/datasets/audio/WavCaps/json_files/AudioSet_SL/as_final.json
            - /export/share/datasets/audio/WavCaps/json_data.json
  
  ## POINT CLOUD (PC) DATASETS
  objaverse_mm_caption_instruct:  # 651576 train examples
    # Image preprocessing for each split.
    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        # Was "clip_image_train"; every other image dataset in this file
        # uses clip_image_eval for the eval split.
        name: "clip_image_eval"
        image_size: 224

    # Point-cloud preprocessing; the same processor is used for both splits.
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"

    # Text preprocessing; the eval caption processor is given a fixed prompt.
    text_processor:
      train:
        name: "blip_instruction"
        modality: pc
        task: caption
      eval:
        name: "blip_caption"
        prompt: describe the 3d model.

    data_type: [pc]  # [images|pc]

    build_info:
      # Be careful not to prefix split names with a minus sign (-): YAML would
      # parse them as list items.
      annotations:
        train:
          # Single-item lists; both entries name the same local file.
          url:
            - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json
          storage:
            - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_train.json

        # val:
        #   url:
        #     - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json
        #   storage:
        #     - /export/einstein-vision/3d_vision/objaverse_captions/objaverse_blip_captions_val.json

      templates: null

      pc:
        storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel

      images:
        storage: /export/einstein-vision/3d_vision/objaverse_captions/images/

  objaverse_mm_qa:  # name of the dataset builder 250070
    # Image preprocessing for each split.
    vis_processor:
      train:
        name: "clip_image_train"
        image_size: 224
      eval:
        # Was "clip_image_train"; every other image dataset in this file
        # uses clip_image_eval for the eval split.
        name: "clip_image_eval"
        image_size: 224

    # Point-cloud preprocessing; the same processor is used for both splits.
    pc_processor:
      train:
        name: "ulip_pc"
      eval:
        name: "ulip_pc"

    # Text preprocessing for each split.
    text_processor:
      train:
        name: "blip_instruction"
        modality: pc
        task: qa
      eval:
        name: "blip_question"

    data_type: pc  # [images|pc]

    build_info:
      # Be careful not to prefix split names with a minus sign (-): YAML would
      # parse them as list items.
      annotations:
        train:
          # Single-item lists; both entries name the same local file.
          url:
            - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/3d_qa/CAP3DQA_final.csv
          storage:
            - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/3d_qa/CAP3DQA_final.csv
        # val:
        #   url:
        #     - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/3d_qa/CAP3DQA_final_val.csv
        #   storage:
        #     - /export/home/LAVIS/projects/mm_instructblip/instr_data_creation/3d_qa/CAP3DQA_final_val.csv

      templates: null

      pc:
        storage: /export/einstein-vision/3d_vision/objaverse/objaverse_pc_parallel


  msrvtt_caption_instruct:  # 13260 (presumably the number of train examples)
    # data_dir: ${env.data_dir}/datasets
    data_type: videos  # images | videos | features

    # Video preprocessing; train and eval use the same settings apart from the
    # processor name.
    vis_processor:
      train:
        name: alpro_video_train
        image_size: 224
        n_frms: 5
        full_video: True
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        image_size: 224
        n_frms: 5
        full_video: True
        min_scale: 0.9
        max_scale: 1.0

    # Text preprocessing; the eval caption processor is given a fixed prompt.
    text_processor:
      train:
        name: blip_instruction
        modality: video
        task: caption
      eval:
        name: blip_caption
        prompt: "Describe the video."

    build_info:
      videos:
        storage: /export/share/datasets/vision_language/msrvtt/videos
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
          storage: msrvtt/annotations/cap_train.json
        # Disabled splits, kept for reference:
        # val:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
        #   storage: msrvtt/annotations/cap_val.json
        # test:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
        #   storage: msrvtt/annotations/cap_test.json

  
  msrvtt_qa_instruct:  # 149075 (presumably the number of train examples)
    data_type: videos

    # Video preprocessing; train and eval use the same settings apart from the
    # processor name.
    vis_processor:
      train:
        name: alpro_video_train
        image_size: 224
        n_frms: 5
        full_video: True
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        image_size: 224
        n_frms: 5
        full_video: True
        min_scale: 0.9
        max_scale: 1.0

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        modality: video
        task: qa
      eval:
        name: blip_question

    build_info:
      videos:
        storage: /export/share/datasets/vision_language/msrvtt/videos
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
          storage: msrvtt/annotations/qa_train.json
        # Disabled entries, kept for reference:
        # val:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
        #   storage: msrvtt/annotations/qa_val.json
        # test:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
        #   storage: msrvtt/annotations/qa_test.json
        # ans2label:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
        #   storage: msrvtt/annotations/qa_ans2label.json
  
  webvid2m_caption_instruct:  # 2m
    # NOTE(review): data_type is "images" and the media root sits under the
    # "images" key below, although the processors are video processors —
    # confirm this is what the dataset builder expects.
    data_type: images

    # Video preprocessing; unlike the MSRVTT datasets, full_video is not set.
    vis_processor:
      train:
        name: alpro_video_train
        image_size: 224
        n_frms: 5
        min_scale: 0.9
        max_scale: 1.0
      eval:
        name: alpro_video_eval
        image_size: 224
        n_frms: 5
        min_scale: 0.9
        max_scale: 1.0

    # Text preprocessing for each split.
    text_processor:
      train:
        name: blip_instruction
        task: caption
        modality: video
      eval:
        name: blip_caption

    build_info:
      images:
        storage: /export/video-language-dataset/data/webvid2m/postprocess/downsampled_videos
      # Keep split names as plain mapping keys (no leading "-").
      annotations:
        train:
          # url and storage name the same local file.
          url: /export/home/LAVIS/webvid_annotation.json
          storage: /export/home/LAVIS/webvid_annotation.json

  

# Run section: runner/task selection, optimization schedule, dataset mixing
# ratios, generation limits, output location and distributed settings.
run:
  runner: runner_iter
  task: captioning

  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  # Exponent floats are written with a decimal point: YAML 1.1 loaders such as
  # plain PyYAML read a bare "1e-5" as a string, not a float.
  init_lr: 1.0e-5
  min_lr: 0
  warmup_lr: 1.0e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 40
  batch_size_train: 16
  batch_size_eval: 2
  num_workers: 0
  accum_grad_iters: 1
  max_iters: 100000
  iters_per_inner_epoch: 5000

  # One ratio per dataset declared under `datasets`. Each value is one quarter
  # of the matching entry in the commented-out mapping below.
  # NOTE(review): the three video entries sum to ~0.283 while the audio, pc and
  # image groups each sum to 0.25, so the total is ~1.033 — confirm whether the
  # consumer normalizes these values.
  train_dataset_ratios:
    audiocaps_mm_caption_instruct: 0.04838977224422461
    audiocaps_mm_qa: 0.03823164395438083
    wavcaps_mm_caption_instruct: 0.13412813421487357
    audioset_mm_caption_instruct: 0.029250449586520996
    objaverse_mm_caption_instruct: 0.15436768776310836
    objaverse_mm_qa: 0.09563231223689161
    msrvtt_caption_instruct: 0.075
    msrvtt_qa_instruct: 0.04466230699351647
    webvid2m_caption_instruct: 0.16358886576195433
    conceptual_caption_12m_instruct: 0.04859614813464941
    coco_caption_instruct: 0.014898507905259383
    capfilt14m_instruct: 0.07371153965255721
    vg_caption_instruct: 0.017940079326242975
    sbu_caption_instruct: 0.01834980589911916
    vg_vqa_instruct: 0.02374869836697125
    coco_vqa_instruct: 0.016054449781544723
    ocr_vqa_instruct: 0.01981133124561304
    ok_vqa_instruct: 0.001878392970131577
    aok_vqa_instruct: 0.0025845608079808893
    llava150k_dialogue_instruct: 0.012426485909930412
  # Unscaled ratios, kept for reference:
  # train_dataset_ratios: {
  #                       "audiocaps_mm_caption_instruct": 0.19355908897689844,
  #                       "audiocaps_mm_qa": 0.1529265758175233,
  #                       "wavcaps_mm_caption_instruct": 0.5365125368594943,
  #                        "audioset_mm_caption_instruct": 0.11700179834608398,
  #                        "objaverse_mm_caption_instruct": 0.6174707510524334,
  #                        "objaverse_mm_qa": 0.38252924894756646,
  #                        "msrvtt_caption_instruct": .3,
  #                        "msrvtt_qa_instruct": 0.17864922797406588 ,
  #                        "webvid2m_caption_instruct": 0.6543554630478173,
  #                        "conceptual_caption_12m_instruct": 0.19438459253859763,
  #                        "coco_caption_instruct": 0.05959403162103753,
  #                        "capfilt14m_instruct": 0.29484615861022884,
  #                        "vg_caption_instruct": 0.0717603173049719,
  #                        'sbu_caption_instruct': 0.07339922359647665,
  #                        'vg_vqa_instruct': 0.094994793467885,
  #                        'coco_vqa_instruct': 0.06421779912617889,
  #                        "ocr_vqa_instruct": 0.07924532498245215,
  #                        "ok_vqa_instruct": 0.007513571880526308,
  #                        "aok_vqa_instruct": 0.010338243231923557,
  #                        'llava150k_dialogue_instruct': 0.049705943639721646
  #                        }

  # Generation limits.
  max_len: 30
  min_len: 1
  num_beams: 5

  seed: 42
  output_dir: "output/xinstructblip/train/vicuna7b/lora"

  amp: True
  resume_ckpt_path: null

  # Training only: evaluation is off and no validation split is listed.
  evaluate: False
  train_splits: ["train"]
  # valid_splits: ["val"]
  # caption_key: caption
  # sample_id_key: youtube_id
  # annotation_file: /export/home/.cache/lavis/audiocaps_mm_caption_instruct_gt/audiocaps_mm_caption_instruct_val_annotations.json


  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  # Save a checkpoint every N epochs; -1 saves only the last and best ones.
  save_freq: 1
  val_freq: 1
