llm: # LLM configuration
# A former `llama:` nesting level is disabled below; the keys sit directly under `llm`.
#  llama:
  name: "llama" # LLM name
  path: "./models/vicuna-7b-v1.5"  # LLM model path
  ckpt: ""  # LLM checkpoint path; empty means no checkpoint is loaded, otherwise the given checkpoint is loaded
  max_tgt_len: 512 # Maximum target sequence length; larger values allow longer generated text but increase GPU memory usage
  cache_dir: "./ckpt/pretrained_ckpt" # Pretrained model cache path; must exist and be writable
  use_fast: False  # Whether to use the fast tokenizer; faster, but may not support all languages
  load_in_8bit: False # Whether to load in 8-bit precision (Important); reduces GPU memory usage but may lose precision
  device_8bit: 0 # Device ID used when loading in 8-bit precision (default 0)
  use_flash_attn: True # Whether to use flash attention; accelerates training and inference
  low_resource: False # NOTE(review): undocumented in the original; presumably a reduced-memory mode - confirm against the model loader

  # Audio and video input sizes
  audio_size: 25 # Number of audio input bands; larger values mean higher frequency resolution
  video_size: 256 # Number of video input patches; larger values mean higher spatial resolution
  prompt: # Prompt configuration
  #   multi_prompt: False # Whether to use multiple prompts for multi-task scenarios (not used)
    prompt_template: "USER: <img>:" # Prompt template

# Parameter-efficient fine-tuning settings; only a LoRA section is defined here.
peft:
  lora:
    # LoRA adapter hyperparameters
    lora_inference_mode: False # Whether to use LoRA inference mode. TODO: check - this may not be used
    lora_rank: 8 # LoRA rank (Important); larger values mean higher model capacity
    lora_alpha: 32 # LoRA scaling factor (Important); larger values mean a stronger scaling effect
    lora_dropout: 0.1 # LoRA dropout rate
    lora_target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] # LoRA target modules

pooling:  # Alignment pooling configuration
  # Tokenizer and model are both loaded from the same directory.
  tokenizer_name_or_path: "./models/clip-vit-large-patch14"
  model_name_or_path: "./models/clip-vit-large-patch14"
  audio_align_pooler:  # Audio alignment pooling
    audio_projection_dim: 2048
    ot_coeff: 0.3 # OT loss weight
    use_text_mask: True
    # OT loss weight guidance:
    #   0.05 to 0.5: suitable for most tasks; balances modality alignment and main-task optimization.
    #   0.1 to 0.3:  recommended default range for most cross-modal tasks
    #                (e.g., audio-video alignment, text-video alignment).
    #   > 0.5:       suitable for tasks requiring extremely high modality alignment
    #                (e.g., cross-modal retrieval, cross-modal generation).
    # got_lambda_wd: 0.3  # Wasserstein distance weight (Important) (Finetune); larger values make the model more inclined to optimize modality alignment.
    # got_lambda_gwd: 0.1 # Gromov-Wasserstein distance weight (not used)
    pooling_size: 512 # Pooling size
    # 2D pooling
    # logit_scale_init_value guidance:
    #   1.0:     common default, suitable for most tasks.
    #   1.2-1.5: suitable for classification tasks with sharper output distributions.
    #   0.8-1.0: suitable for generation tasks with smoother output distributions.
    # Initial logit scale value, range 0.7-2.0. The parameter comes from CLIP contrastive learning;
    # set it to 1.00 if contrastive learning is not used.
    logit_scale_init_value: 1.00
    output_shape: null # Output shape
    kernel: [6, 8] # Convolution kernel size (Important) (Finetune)
    stride: [6, 8] # Stride (Important) (Finetune)
    global_act: softmax
    pooling_temperature: 0.1 # Pooling temperature. Open question: should this be slightly larger than 0.1?
    output_attention: False # Whether to output attention weights

  visual_align_pooler: # Visual alignment pooling
    visual_projection_dim: 2048
    ot_coeff: 0.3 # OT loss weight
    use_text_mask: True
    pooling_size: 512 # Pooling size
    output_shape: null # Output shape
    # 3D pooling (kernel and stride have three entries here, versus two for the audio pooler)
    logit_scale_init_value: 1.00 # Initial logit scale value
    kernel: [6, 8, 8] # Convolution kernel size (Important) (Finetune)
    stride: [6, 8, 8] # Stride (Important) (Finetune)
    global_act: softmax
    pooling_temperature: 0.1 # Pooling temperature; controls pooling smoothness, smaller values mean sharper pooling
    output_attention: False # Whether to output attention weights

# NOTE(review): undocumented in the original. Presumably the OT (optimal transport) loss
# weight between the audio and video modalities - confirm against the training code.
OT_AV:
  coeff_after: 0.3

# NOTE(review): undocumented in the original. Presumably the OT loss settings between
# audio and text; a coeff of 0.0 presumably disables the term - confirm.
OT_AT:
  coeff: 0.0
  use_text_mask: True

# NOTE(review): undocumented in the original. Presumably the OT loss settings between
# video and text; a coeff of 0.0 presumably disables the term - confirm.
OT_VT:
  coeff: 0.0
  use_text_mask: True

audio_encoders: # Audio encoder configuration
  cache_dir: "" # Cache directory path
  seg_len: 10 # Audio segment length (Important); `datasets.seg_len` carries the same value
  type: encoder # Encoder type

  # NOTE(review): whisper uses `target_sampling_rate` while beats uses
  # `target_audio_sampling_rate`; the key names differ, so do not copy one into the other.
  whisper:
    path: "./models/Whisper" # Whisper model path
    freeze: True # Whether to freeze the model
    target_sampling_rate: 16000 # Target sampling rate
    sin_pos: True  # Whether to use sinusoidal position encoding; must be True if audio is longer than 10 s
    embeds_dim: 1280 # Embedding dimension

  beats:
    path: "./models/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt" # BEATs model path
    freeze: True # Whether to freeze the model
    target_audio_sampling_rate: 16000 # Target sampling rate
    sin_pos: True # Whether to use sinusoidal position encoding
    embeds_dim: 768 # Embedding dimension

  # Combined Whisper + BEATs encoder; the nested sections repeat the standalone settings above.
  whisper_beats:
    # Embedding dimension. Equals 1280 + 768, the standalone whisper and beats values above;
    # presumably the two embeddings are concatenated - confirm.
    embeds_dim: 2048
    whisper:
      path: "./models/Whisper" # Whisper model path
      freeze: True # Whether to freeze the model
      target_sampling_rate: 16000 # Target sampling rate
      sin_pos: True  # Whether to use sinusoidal position encoding; must be True if audio is longer than 10 s

    beats:
      path: "./models/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt" # BEATs model path
      freeze: True # Whether to freeze the model
      target_audio_sampling_rate: 16000 # Target sampling rate
      sin_pos: True # Whether to use sinusoidal position encoding


video_encoders: # Video encoder configuration
  internvideo2:
    path: "./models/InternVideo2-Stage2_1B-224p-f4/InternVideo2-stage2_1b-224p-f4.pt" # Pretrained parameters path
    freeze: True  # Whether to freeze the model
    num_frames: 10  # Video segment length; video features are extracted every num_frames frames (Important)
    tubelet_size: 1 # Number of frames packed into one patch
    # Only used for mixed-modality training; no need to modify for now.
    # NOTE(review): the original note said "image and audio", but the key name suggests image and video - confirm.
    sep_image_video_pos_embed: True
    embeds_dim: 1408  # Output dimension

connectors: # Connector configuration
  deepseek_moe:
    hidden_size: 4096  # Hidden layer size (input_hidden)
    intermediate_size: 8192  # Intermediate layer size; here input_hidden * 2 (any multiplier can be used)
    output_size: 4096  # Output size; equals input_hidden
    moe_intermediate_size: 1024 # MoE intermediate layer size; here input_hidden // num_experts_per_tok, but no strict relationship is required
    n_shared_experts: 2 # Number of shared experts
    n_routed_experts: 16 # Number of routed experts
    num_experts_per_tok: 4 # Number of experts per token
    norm_topk_prob: True # Whether to use top-k probability
    scoring_func: "softmax" # Scoring function
    aux_loss_alpha: 0.001 # Auxiliary loss weight
    seq_aux: True # Whether to use the sequence auxiliary loss
    hidden_act: "silu" # Activation function
    pretraining_tp: 2 # Pretraining TP
    # Fusion type (Important). Three modes: joint, per_modality, disjoint, corresponding to
    # the three FuseMoE modes; see https://arxiv.org/abs/2402.03226 Figure 2.
    fusion_type: "per_modality"

datasets: # Dataset configuration
  datasets: "music-avqa" # Dataset name
  train_ann_path: "./MUSIC-AVQA/annotations/avqa-train.json" # Training set annotation path
  valid_ann_path: "./MUSIC-AVQA/annotations/avqa-val.json" # Validation set annotation path
  test_ann_path: "./MUSIC-AVQA/annotations/avqa-test.json" # Test set annotation path
  # Audio loading options
  return_raw_audios: True # Whether to return raw audio
  audio_resampling: True # Whether to resample audio
  audio_sampling_rate: 16000 # Audio sampling rate
  seg_len: 10 # Audio segment length; `audio_encoders.seg_len` carries the same value

run: # Run-time configuration: task, logging, batching, distributed setup, and optimizer
  # modality & task
  modality: "audio_video" # Modality type
  num_modality: 2 # Number of modalities
  sub_task: True # Whether the task has sub-tasks, e.g., ["avqa", "count"]
  major_tasks: [Audio, Visual, Audio-Visual] # Major task categories
  minor_tasks: [Existential, Counting, Location, Comparative, Temporal]  # Sub-task categories

  # log & settings
  seed: 42 # Random seed
  output_dir: "./output/equals" # Output directory; change to your own directory

  log_main_loss: True
  log_freq: 10 # Log frequency; 5 or 10 is recommended for frequent logging while monitoring training
  # Whether to use epoch-based training (Important). Set to True for epoch-based progress on
  # large datasets; set to False for iteration-based recording on small datasets or when finer
  # control is needed.
  epoch_based: True
  # Iterations per epoch (Important). Larger values use the data more fully; smaller values suit
  # quick tests. Changing this to 100 is recommended for training.
  iters_per_epoch: 0
  updates_per_valid: 200 # Number of iterations between validations (Important)
  # Training batch size (Important). Set to the maximum GPU memory allows (e.g., 8); larger
  # batches improve training stability but require more memory.
  batch_size_train: 2
  batch_size_eval: 2 # Evaluation batch size (Important); same advice as above, e.g., 16
  # Number of data-loading workers. Use 4 or 8 if data loading is slow; use 0 if loading is fast
  # enough or system resources are limited. More workers load faster but consume more CPU.
  num_workers: 0

  device: "cuda" # Device type
  use_distributed: True # Whether to use distributed training
  local_rank: 0  # Local process rank
  world_size: 8 # Total number of processes in distributed training
  dist_url: "env://" # URL for distributed training
  max_epoch: 1 # Maximum training epochs (Important) (Finetune)

  # optimizer & scheduler
  # Learning rates are written with an explicit decimal point (1.0e-6, not 1e-6): YAML 1.1
  # loaders such as PyYAML resolve the dot-less form as a string instead of a float.
  optims:
    warmup_steps: 100 # Warmup steps; 5%-10% of total training steps (Important)
    warmup_start_lr: 1.0e-6 # Warmup start learning rate; smaller values ramp the learning rate more smoothly, which suits large models or unstable training
    init_lr: 3.0e-5 # Initial learning rate (Important) (Finetune); typically 1e-5 to 3e-5 for fine-tuning
    min_lr: 3.0e-6 # Minimum learning rate (Important) (Finetune); typically 10%-50% of init_lr
