# @package _global_
config:
  # Free-form experiment label and note.
  EXPR_NAME: devel
  NOTE: ""
  # NOTE(review): -1 looks like a "no fixed seed" sentinel — confirm against
  # the code that reads SEED.
  SEED: -1
  DATASET: movienet
  # NOTE(review): a plain `None` is loaded by YAML as the string "None", not as
  # null — confirm the consumer expects that, or that this is always overridden.
  DATA_PATH: None
  NUM_MOVIES: 1100
  USE_RAW_SHOT: True
  # Model settings: the `shot_encoder` and `contextual_relation_network` sections.
  MODEL:
    use_sync_bn: True
    # Shot encoder selection, plus one settings sub-section per backbone.
    shot_encoder: 
      # Listed options: resnet, vit.
      # NOTE(review): a `vit_x_ge` section also exists below but is not listed
      # as an option here — confirm whether it is a selectable name.
      name: resnet
      pretrained: True
      freeze: False
      resnet:
        input_shape: [3, 224, 224]
        depth: 50
        weights: IMAGENET1K_V1  # listed options: IMAGENET1K_V1, IMAGENET1K_V2
        params:
          zero_init_residual: True
      vit:
        img_size: 224
        # Listed options: vit_small_patch32_224, vit_base_patch32_clip_224.
        # vit_base_patch32_224 was also noted with a "?" (unverified candidate).
        weights: vit_small_patch32_224
      # NOTE(review): "ge" presumably stands for genre embedding (cf. the
      # `*_with_genre_embed` keys under LOSS) — confirm.
      vit_x_ge:
        img_size: 224
        weights: vit_base_patch32_clip_224
        # NOTE(review): plain `None` loads as the string "None", not null.
        ge_path: None
        wkv: linear  # listed options: linear, direct
        ge_type: cross_attn  # listed options: cross_attn, concat
      ge_fusion: False
      # Same caveat as above: this is the string "None" after loading.
      ge_path: None
    # Contextual relation network settings. `name` is set to `trn`; `params`
    # holds one parameter set per variant (`trn` and `cat`).
    contextual_relation_network:
      enabled: True
      name: trn
      attention_mask_type: default
      params:
        trn:
          # NOTE(review): 384 here while shot_encoder.name is `resnet` with
          # depth 50 — confirm this matches the encoder's output width, or that
          # the code overrides it.
          input_dim: 384
          is_decoder: False
          add_cross_attention: False
          chunk_size_feed_forward: 0
          attention_probs_dropout_prob: 0.1
          hidden_act: gelu
          hidden_dropout_prob: 0.1
          hidden_size: 768
          intermediate_size: 3072
          # NOTE(review): an exponent literal without a dot is read as a string
          # by stock PyYAML but as a float by some loaders — confirm the loader
          # in use yields a float here.
          layer_norm_eps: 1e-12
          num_attention_heads: 8
          num_hidden_layers: 2
          pooling_method: center
          # Listed options: eager, sdpa. Marked as a later addition to this config.
          _attn_implementation: eager
        # Same keys and values as `trn`, plus `attention_local_window`.
        cat:
          # NOTE(review): same input_dim caveat as for `trn`.
          input_dim: 384
          is_decoder: False
          add_cross_attention: False
          chunk_size_feed_forward: 0
          attention_probs_dropout_prob: 0.1
          # Only key that `trn` does not have.
          attention_local_window: 5
          hidden_act: gelu
          hidden_dropout_prob: 0.1
          hidden_size: 768
          intermediate_size: 3072
          # NOTE(review): same float-parsing caveat as for `trn`.
          layer_norm_eps: 1e-12
          num_attention_heads: 8
          num_hidden_layers: 2
          pooling_method: center
          # Listed options: eager, sdpa. Marked as a later addition to this config.
          _attn_implementation: eager
  # Training-time data pipeline settings (the optimizer is configured under
  # TRAIN.OPTIMIZER).
  TRAIN:
    NUM_KEYFRAME: 3
    BATCH_SIZE:
      effective_batch_size: 8
    # Ordered list of transforms; each entry is a transform `name` followed by
    # its arguments.
    TRANSFORM:
      - name: VideoRandomResizedCrop
        size: 224
        bottom_area: 0.14  # 0.5 was also tried
      - name: VideoRandomHFlip
      - name: VideoRandomColorJitter
        brightness: 0.2
        contrast: 0.2
        saturation: 0.2
        hue: 0.05
        p: 0.8
        consistent: False
      - name: VideoRandomGaussianBlur
        radius_min: 0.1
        radius_max: 2.0
        p: 0.5
      - name: VideoToTensor
        # Two candidate normalizations were noted for mean/std:
        #   mean [0.485, 0.456, 0.406],             std [0.229, 0.224, 0.225]  (in use)
        #   mean [0.48145466, 0.4578275, 0.40821073], std [0.26862954, 0.26130258, 0.27577711]
        # NOTE(review): the second pair presumably goes with the CLIP ViT
        # weights — confirm, and keep TEST.TRANSFORM in sync when switching.
        mean: [0.485, 0.456, 0.406]
        std: [0.229, 0.224, 0.225]
    SHUFFLE:
      enabled: False
      probability: 0.0
    NUM_WORKERS: 1
    PIN_MEMORY: False
    # Optimizer, learning-rate and scheduler settings.
    OPTIMIZER:
      name: lars
      weight_decay: 0.000001
      momentum: 0.9
      betas: [.9, .999]  # for Adam/AdamW
      nesterov: False
      regularize_bn: False
      regularize_bias: False
      lr:
        # Other values tried previously: 0.35, 0.25, 0.5, 0.4, 0.2.
        base_lr: 0.3
        base_lr_batch_size: 256
        # NOTE(review): presumably rescales base_lr by the ratio of the actual
        # batch size to base_lr_batch_size — confirm in the optimizer setup code.
        auto_scale: True
      scheduler:
        name: cosine_with_linear_warmup
        # Portion of the training steps used for warmup.
        # NOTE(review): 0.1 presumably means 10% of steps — confirm.
        warmup: 0.1
  # Evaluation-time settings.
  TEST:
    KNN_VALIDATION: True
    BATCH_SIZE:
      effective_batch_size: 8
    # Ordered list of transforms; no random augmentation entries here.
    TRANSFORM:
      - name: VideoResizedCenterCrop
        image_size: 256
        crop_size: 224
      - name: VideoToTensor
        # Same mean/std values as the VideoToTensor entry under TRAIN.TRANSFORM.
        mean: [0.485, 0.456, 0.406]
        std: [0.229, 0.224, 0.225]
  # Loss settings: each objective has an `enabled` switch; four are on and the
  # two `*_with_genre_embed` variants are off in this file.
  LOSS:
    shot_scene_matching:
      enabled: True
      name: simclr_loss
      params:
        simclr_loss:
          temperature: 0.1
          head:  # for NCE contrastive loss
            input_dim: 384  # previously 2048
            hidden_dim: 768  # previously 2048
            output_dim: 128
    contextual_group_matching:
      enabled: True
    pseudo_boundary_prediction:
      enabled: True
      num_neg_sample: 1
    masked_shot_modeling:
      enabled: True
    # Genre-embedding variant of shot_scene_matching; it has its own head sizes
    # and two extra flags (`wkv`, `proj`).
    shot_scene_matching_with_genre_embed:
      enabled: False
      name: simclr_loss
      params:
        simclr_loss:
          temperature: 0.1
          head:  # for NCE contrastive loss
            input_dim: 768
            hidden_dim: 768
            output_dim: 512
          wkv: False
          proj: False
    contextual_group_matching_with_genre_embed:
      enabled: False
    # NOTE(review): plain `None` loads as the string "None", not null — confirm
    # the consumer handles that.
    ge_path: None
    ge_learnable: False
    # Sampling strategy settings. `name` is set to `bassl`; `params` holds one
    # parameter set per strategy name.
    sampling_method:
      name: bassl
      use_duration: False
      anchor_sample_type: short_weighted
      use_random: False
      params:
        temporal:
          neighbor_size: 1
        shotcol:
          neighbor_size: 1
          neighbor_interval: 1
        bassl:
          neighbor_size: 1
          neighbor_interval: 1
        bassl+shotcol:
          neighbor_size: 1
          neighbor_interval: 1
        # Only strategy with separate left/right neighbor settings.
        asymmetric:
          neighbor_left: 1
          neighbor_right: 1
          neighbor_interval: 1
  # Trainer settings.
  # NOTE(review): key names match PyTorch Lightning Trainer arguments and are
  # presumably forwarded to it — confirm in the training entry point.
  TRAINER:
    accelerator: gpu
    precision: 16
    max_epochs: 1
    detect_anomaly: False
    num_sanity_val_steps: 0          
  # Distributed layout: one node with one process per node in this file.
  DISTRIBUTED:
    NUM_NODES: 1
    NUM_PROC_PER_NODE: 1
  # Additional input modalities; TYPE is an empty list in this file.
  OTHER_MODALITY:
    TYPE: []
    # NOTE(review): plain `None` loads as the string "None", not null — confirm
    # the consumer handles that for both paths.
    PLACE_PATH: None
    AUDIO_PATH: None