 # Copyright (c) 2023, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
# Model definition: architecture, checkpoints to load, per-modality encoder
# and Q-Former options, and prompt/tokenization settings.
model:
  arch: blip2_vicuna_xinstruct
  model_type: vicuna7b

  # Checkpoints.
  load_pretrained: True
  # Alternative checkpoint kept from the author's note:
  #   https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth
  pretrained: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth
  load_finetuned: False
  finetuned: ""
  # Set to False in this config. URL kept from the author's note:
  #   https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth
  stage1_url_or_filename: False

  # Encoder selected for each modality.
  image_model: "eva_clip_g"
  pc_model: "ulip2_pointbert"
  video_model: "eva_clip_g"
  audio_model: "beats"

  # Q-Former checkpoint per modality.
  # NOTE(review): the author was unsure whether the image entry should instead
  # point at the BLIP-2 checkpoint used by the other modalities below, and
  # whether the pc/video/audio entries need changing at all -- confirm.
  pretrained_image_qformer: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
  pretrained_pc_qformer: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth
  pretrained_video_qformer: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth
  pretrained_audio_qformer: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth

  # Per-modality loading switches; only the image entries are enabled.
  load_attention_image_qformer: True
  load_attention_pc_qformer: False
  load_attention_video_qformer: False
  load_attention_audio_qformer: False
  load_ln_type_image: "vision"
  load_ln_type_video: ""
  load_ln_type_pc: ""
  load_ln_type_audio: ""
  load_qformer_type_image: ""
  load_qformer_type_pc: ""
  load_qformer_type_video: ""
  load_qformer_type_audio: ""
  load_projection_image: True
  # NOTE(review): the author's note says the default for the next three
  # switches is True and questions why they are False here -- confirm.
  load_projection_pc: False
  load_projection_video: False
  load_projection_audio: False
  load_projection_type_image: ""
  load_projection_type_pc: ""
  load_projection_type_video: ""
  load_projection_type_audio: ""

  # Keyword arguments for each encoder.
  image_encoder_kwargs:
    image_size: 224
    drop_path_rate: 0
    use_grad_checkpoint: False
  pc_encoder_kwargs: {}
  video_encoder_kwargs:
    image_size: 224
    drop_path_rate: 0
    use_grad_checkpoint: False
  audio_encoder_kwargs: {}

  # Precision and freezing per modality.
  image_precision: "fp16"
  pc_precision: "fp16"
  video_precision: "fp16"
  audio_precision: "fp16"
  freeze_image: True
  freeze_pc: True
  freeze_video: True
  freeze_audio: True

  num_query_token: 32
  # NOTE(review): looks like a Hugging Face Hub model id; the author was unsure
  # whether a hub id is accepted here -- confirm.
  llm_model: lmsys/vicuna-7b-v1.1
  prompt: "describe the image."
  max_txt_len: 128
  max_output_txt_len: 256
  apply_lemmatizer: False
  num_few_shot_examples: 0
  few_shot_prob: 0
  qformer_text_input: True
  llm_text_input: True
  modalities: ["image"]
  use_cues: True

  # Shared Q-Former options (disabled: shared_qformer is False).
  shared_qformer: False
  pretrained_shared_qformer: null
  load_attention_shared_qformer: False
  load_qformer_type_shared: ""
  load_projection_shared: False
  # Spelled with the same "_shared" suffix as the sibling options; the previous
  # "_shaped" spelling did not match the rest of this family of keys.
  load_projection_type_shared: ""
  load_ln_type_shared: ""
  shared_qformer_num_features: 512

  predict_with_gen: False
  # The trailing space in the prefix and the leading newline in the postfix
  # are part of the values.
  prefix: "USER: "
  postfix: "\nASSISTANT:"
  clean_tokenization: True

# Datasets used for this run. Only coco_caption_instruct is active; the other
# stanzas in this section are kept commented out for reference.
datasets:
  ## CAPTIONING TASKS
  # conceptual_caption_12m_instruct: # 6029862
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224

  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       task: caption
  #       modality: image
  #     eval:
  #       name: blip_caption


  # 566747 train examples
  coco_caption_instruct:
    dataset_card: dataset_card/coco_caption.md
    # data_dir is left unset here.
    # data_type is one of: images | videos | features
    data_type: images

    # Image preprocessing for the train and eval splits.
    vis_processor:
      train:
        name: clip_image_train
        image_size: 224
      eval:
        name: clip_image_eval
        image_size: 224

    # Text preprocessing for the train and eval splits.
    text_processor:
      train:
        name: blip_instruction
        modality: image
        task: caption
      eval:
        name: blip_caption


  # capfilt14m_instruct: # 13873136
  #   # data_dir: ${env.data_dir}/datasets
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #       train:
  #         name: "clip_image_train"
  #         image_size: 224

  #   text_processor:
  #       train:
  #         name: blip_instruction
  #         modality: image
  #         task: caption
  
  # vg_caption_instruct: # 821774
  #   # data_dir: ${env.data_dir}/datasets
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224
    
  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       task: caption
  #       modality: image
  #     eval:
  #       name: blip_caption


  # sbu_caption_instruct: # 859739
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224

  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       modality: image
  #       task: caption
  #     eval:
  #       name: blip_caption



  # ## QA TASKS
  # vg_vqa_instruct: # 1440069 train examples
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224
    
  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       task: qa
  #       modality: image
  #     eval:
  #       name: blip_question


  # coco_vqa_instruct: # 658104 training data
  #   # data_dir: ${env.data_dir}/datasets
  #   data_type: images # [images|videos|features]
    
  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224
    
  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       modality: image
  #       task: qa
  #     eval:
  #       name: blip_caption


  # ocr_vqa_instruct: # 1002146 train examples
  #   # data_dir: ${env.data_dir}/datasets
  #   data_type: images # [images|videos|features]
    
  #   vis_processor:
  #       train:
  #         name: "clip_image_train"
  #         image_size: 224

  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       modality: image
  #       task: qa
  #     eval:
  #       name: blip_question


  # ok_vqa_instruct: # 9009
  #   # data_dir: ${env.data_dir}/datasets
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224

  #   text_processor:
  #     train:
  #       name: blip_instruction
  #       modality: image
  #       task: qa
  #     eval:
  #       name: blip_question

  
  # aok_vqa_instruct: # 17056
  #   data_type: images # [images|videos|features]

  #   vis_processor:
  #       train:
  #         name: "clip_image_train"
  #         image_size: 224
  #       eval:
  #         name: "clip_image_eval"
  #         image_size: 224

  #   text_processor:
  #       train:
  #         name: blip_instruction
  #         modality: image
  #         task: qa
  #       eval:
  #         name: blip_question

  # ##Dialogue
  # llava150k_dialogue_instruct: #394276 train examples

  #   data_type: images

  #   vis_processor:
  #     train:
  #       name: "clip_image_train"
  #       image_size: 224
  #     eval:
  #       name: "clip_image_eval"
  #       image_size: 224

  #   text_processor:
  #       train:
  #         name: "blip_caption"


# Runtime settings: runner and task selection, optimisation schedule,
# dataset sampling ratios, generation limits, and distributed setup.
run:
  runner: runner_iter
  task: captioning

  # Optimizer and learning-rate schedule.
  lr_sched: "linear_warmup_cosine_lr"
  # Exponent floats are written with an explicit mantissa dot (1.0e-5 rather
  # than 1e-5) so that YAML 1.1 loaders resolve them as numbers, not strings.
  init_lr: 1.0e-5
  min_lr: 0
  warmup_lr: 1.0e-8
  warmup_steps: 0  # default: 1000
  weight_decay: 0.05
  max_epoch: 1  # default: 40
  # NOTE(review): author's note says "per device"; it was unclear to the author
  # whether this applies across all datasets -- confirm.
  batch_size_train: 8
  batch_size_eval: 8
  num_workers: 10
  accum_grad_iters: 1  # author flagged this value for change
  # NOTE(review): original value was 65000. With runner_iter, the upstream
  # LAVIS iteration-based runner appears to require max_iters > 0, so 0 may
  # fail at startup -- confirm which of max_epoch / max_iters is honoured.
  max_iters: 0
  iters_per_inner_epoch: 5000  # author was unsure whether this needs changing

  # Sampling ratio per training dataset. Only coco_caption_instruct is active;
  # the ratios of the full mixture are kept commented out for reference.
  train_dataset_ratios:
    # conceptual_caption_12m_instruct: 0.19438459253859763
    coco_caption_instruct: 1.0
    # capfilt14m_instruct: 0.29484615861022884
    # vg_caption_instruct: 0.0717603173049719
    # sbu_caption_instruct: 0.07339922359647665
    # vg_vqa_instruct: 0.094994793467885
    # coco_vqa_instruct: 0.06421779912617889
    # ocr_vqa_instruct: 0.07924532498245215
    # ok_vqa_instruct: 0.007513571880526308
    # aok_vqa_instruct: 0.010338243231923557
    # llava150k_dialogue_instruct: 0.049705943639721646
    # laion400M_instruct: 0.1

  # Generation settings.
  max_len: 80
  min_len: 1
  # NOTE(review): author was unsure whether this is used during training and
  # whether it should be 1 -- confirm.
  num_beams: 1

  seed: 22
  # Open question from the author: can the resulting checkpoint be loaded with
  # InstructBLIP, or only with X-InstructBLIP?
  output_dir: "output/xinstructblip/train/vicuna7b/image"

  amp: True
  resume_ckpt_path: null
  evaluate: False
  train_splits: ["train"]
  # valid_splits: ["val"]

  # Device and distributed setup.
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  # Save a checkpoint every N epochs; -1 saves only the last and the best.
  save_freq: -1
  val_freq: 1