# Name of this configuration.
name: opv2v_m1m2m3m4_pyramid

# Dataset split locations, written as relative paths.
root_dir: 'dataset/OPV2V/train'
validate_dir: 'dataset/OPV2V/validate'
test_dir: 'dataset/OPV2V/test'

# Name of the parameter-loading routine; presumably looked up by the project's
# YAML loader -- confirm against the consumer.
yaml_parser: 'load_general_params'

# Training schedule, optimizer and learning-rate scheduler settings.
train_setting:
  # Anchored as `pc_grad`; model.args.pub.pc_grad aliases this flag.
  pc_grad: &pc_grad false
  pc_grad_group: false
  add:
    train_params:
      batch_size: 2
      epoches: &epoches_nego 12
      eval_freq: 1
      save_freq: 1
      max_cav: 5

    optimizer:
      core_method: Adam
      lr: 0.01
      args:
        # Written with an explicit decimal point on purpose: YAML 1.1 loaders
        # (e.g. stock PyYAML) resolve dot-less forms such as `1e-10` as
        # strings, not floats. `1.0e-10` is a float under every loader.
        eps: 1.0e-10
        weight_decay: 1.0e-4
    lr_scheduler:
      core_method: multistep  # step, multistep and Exponential are supported
      gamma: 0.1
      # NOTE(review): if these are epoch milestones, 15 and 20 lie beyond
      # `epoches: 12` above and would never trigger in this run -- confirm.
      step_size: [1, 15, 20]

# Communication range; presumably in meters -- confirm against the consumer.
comm_range: 70
# Input sources listed for this experiment.
input_source: ['lidar', 'camera', 'depth']
label_type: 'lidar'
# Anchored as `cav_lidar` and aliased by the heter, preprocess, postprocess,
# model and loss sections of this file.
# Presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm.
cav_lidar_range: &cav_lidar [-102.4, -51.2, -3, 102.4, 51.2, 1]


# Heterogeneous-modality setup: four modalities m1..m4.
heter:
  # JSON file path; judging by its name it assigns a modality to each agent
  # -- confirm against the consumer.
  assignment_path: 'opencood/logs/heter_modality_assign/opv2v_4modality.json'
  ego_modality: &ego_modality 'm1&m2&m3&m4'
  # Modality-to-modality mapping; aliased by model.args.mapping_dict.
  mapping_dict: &mapping_dict
    m1: m3
    m2: m1
    m3: m4
    m4: m2
  lidar_channels_dict:
    m3: 32
  # Per-modality sensor settings; aliased by model.args.modality_setting.
  modality_setting: &modality_setting
    # m1: LiDAR modality using the "point_pillar" method. The sensor_type and
    # core_method anchors are aliased again under model.args.m1.
    m1:
      sensor_type: &sensor_type_m1 'lidar'
      core_method: &core_method_m1 "point_pillar"

      # LiDAR input requires a preprocessing step.
      preprocess:
        # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor
        core_method: 'SpVoxelPreprocessor'
        args:
          # Anchored as `voxel_size`; aliased by model.args.m1.encoder_args.
          voxel_size: &voxel_size [0.4, 0.4, 4]
          max_points_per_voxel: 32
          max_voxel_train: 32000
          max_voxel_test: 70000
        # LiDAR range for each individual CAV.
        cav_lidar_range: *cav_lidar
    # m2: camera modality using the "lift_splat_shoot" method. grid_conf and
    # data_aug_conf are anchored and aliased again under model.args.m2.
    m2:
      sensor_type: &sensor_type_m2 'camera'
      core_method: &core_method_m2 "lift_splat_shoot"

      grid_conf: &grid_conf_m2
        xbound: [-51.2, 51.2, 0.4]   # Range limits of the x direction and the grid division
        ybound: [-51.2, 51.2, 0.4]   # Range limits of the y direction and the grid division
        zbound: [-10, 10, 20.0]   # Range limits of the z direction and the grid division
        # Presumably the depth discretization -- confirm layout with the encoder.
        ddiscr: [2, 50, 48]
        mode: 'LID'
      data_aug_conf: &data_aug_conf_m2
        resize_lim: [0.65, 0.7]
        final_dim: [384, 512]
        rot_lim: [-3.6, 3.6]
        # Presumably the raw image height and width in pixels -- confirm.
        H: 600
        W: 800
        rand_flip: False
        bot_pct_lim: [0.0, 0.05]
        # Four cameras are listed, matching Ncams below.
        cams: ['camera0', 'camera1', 'camera2', 'camera3']
        Ncams: 4
    # m3: LiDAR modality using the "second" method. Uses a finer voxel size
    # (0.1) and fewer points per voxel (5) than m1 (0.4 and 32).
    m3:
      sensor_type: &sensor_type_m3 'lidar'
      core_method: &core_method_m3 "second"

      # LiDAR input requires a preprocessing step.
      preprocess:
        # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor
        core_method: 'SpVoxelPreprocessor'
        args:
          # Anchored as `voxel_size_m3`; aliased by model.args.m3.encoder_args.
          voxel_size: &voxel_size_m3 [0.1, 0.1, 0.1]
          max_points_per_voxel: 5
          max_voxel_train: 32000
          max_voxel_test: 70000
        # LiDAR range for each individual CAV.
        cav_lidar_range: *cav_lidar
    # m4: second camera modality using the "lift_splat_shoot" method. Compared
    # with m2 it uses a smaller x/y extent (+-48 vs +-51.2) and a smaller
    # final image size (336x448 vs 384x512).
    m4:
      sensor_type: &sensor_type_m4 'camera'
      core_method: &core_method_m4 "lift_splat_shoot"

      grid_conf: &grid_conf_m4
        xbound: [-48, 48, 0.4]   # Range limits of the x direction and the grid division
        ybound: [-48, 48, 0.4]   # Range limits of the y direction and the grid division
        zbound: [-10, 10, 20.0]   # Range limits of the z direction and the grid division
        # Presumably the depth discretization -- confirm layout with the encoder.
        ddiscr: [2, 50, 48]
        mode: 'LID'
      data_aug_conf: &data_aug_conf_m4
        resize_lim: [0.56, 0.61]
        final_dim: [336, 448]
        rot_lim: [-3.6, 3.6]
        # Presumably the raw image height and width in pixels -- confirm.
        H: 600
        W: 800
        rand_flip: False
        bot_pct_lim: [0.0, 0.05]
        # Four cameras are listed, matching Ncams below.
        cams: ['camera0', 'camera1', 'camera2', 'camera3']
        Ncams: 4


# Fusion dataset selection. Alternative core_method values that were tried are
# kept as comments.
fusion:
  # core_method: 'intermediateheterpair'
  core_method: 'intermediateheter'
  dataset: 'opv2v'
  args:
    proj_first: false
    # NOTE(review): a plain `None` is loaded by YAML as the string "None", not
    # as null. Both entries are marked as placeholders; confirm the consumer
    # overwrites or ignores them before changing the spelling.
    grid_conf: None  # placeholder
    data_aug_conf: None  # placeholder

# Late-fusion alternative (disabled):
# fusion:
#   core_method: 'lateheterpair'
#   dataset: 'opv2v'
#   args:
#     proj_first: false
#     grid_conf: None # place-holder
#     data_aug_conf: None # place-holder

# Point-cloud augmentations. Per the original author's note, these are not
# used in intermediate fusion.
data_augment:
  - NAME: random_world_flip
    ALONG_AXIS_LIST: ['x']

  # +-0.78539816 equals +-pi/4; presumably radians -- confirm.
  - NAME: random_world_rotation
    WORLD_ROT_ANGLE: [-0.78539816, 0.78539816]

  - NAME: random_world_scaling
    WORLD_SCALE_RANGE: [0.95, 1.05]

# Default (top-level) preprocessing settings. Per the original author's inline
# notes, only voxel_size is actually used here; the three max_* values are
# marked as unused.
preprocess:
  # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor
  core_method: 'SpVoxelPreprocessor'
  args:
    voxel_size: &voxel_size_df [0.4, 0.4, 4] # used; anchored as voxel_size_df and aliased by loss.args.unify_stru
    max_points_per_voxel: 1 # marked unused by the original author
    max_voxel_train: 1 # marked unused by the original author
    max_voxel_test: 1 # marked unused by the original author
  # LiDAR range for each individual CAV.
  cav_lidar_range: *cav_lidar

# Anchor-box related post-processing: anchor geometry, target assignment
# thresholds, NMS and direction-bin settings.
postprocess:
  core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported
  # core_method: 'VoxelPostprocessor'
  gt_range: *cav_lidar
  anchor_args:
    cav_lidar_range: *cav_lidar
    # Anchor box size; presumably length / width / height in meters -- confirm.
    l: 3.9
    w: 1.6
    h: 1.56
    # Two anchor yaw values, matching `num: 2` below. Anchored as anchor_yaw
    # and aliased by dir_args.anchor_yaw.
    r: &anchor_yaw [0, 90]
    # Anchored as feature_stride; aliased by loss.args.unify_stru.
    feature_stride: &feature_stride 2
    # Anchored as anchor_num; aliased by every anchor_number key under model.
    num: &anchor_num 2
  target_args:
    pos_threshold: 0.6
    neg_threshold: 0.45
    score_threshold: 0.2
  order: 'hwl' # hwl or lwh
  max_num: 150 # maximum number of objects in a single frame; ensures that different frames in the same batch have the same dimension
  nms_thresh: 0.15
  # Anchored as dir_args; aliased by the model sections and loss.args.det.dir.
  dir_args: &dir_args
    dir_offset: 0.7853
    num_bins: 2
    anchor_yaw: *anchor_yaw

# model related
model:
  # Model name per run mode (negotiation training, alignment training,
  # fine-tuning, inference -- inferred from the key names; confirm).
  core_method:
    train_nego: heter_nego_train
    train_align: heter_nego_train_align
    train_ft: heter_nego_inf
    inf: heter_nego_inf

  args:
    mapping_dict: *mapping_dict
    modality_setting: *modality_setting
    lidar_range: *cav_lidar
    share_negotiator: true
    supervise_single: true

    # Settings under `pub`. C_uni and the two granularity values are anchored
    # here and aliased by the comm_args of m1..m4.
    pub:
      pc_grad: *pc_grad
      C_uni: &C_uni 64
      granularity_H: &gra_H 0.8
      granularity_W: &gra_W 0.8

      negotiator:
        dim: *C_uni
        backbone_args:
          resnext: true
          inplanes: *C_uni
          layer_nums: [3, 4]
          layer_strides: [1, 2]
          num_filters: [64, 64]
          upsample_strides: [1, 2]
          num_upsample_filter: [64, 64]
          anchor_number: *anchor_num
        shrink_header_args:
          kernal_size: [3]
          stride: [1]
          padding: [1]
          dim: [*C_uni]
          # Original note: "128 * w". 128 also equals 64 + 64, the sum of
          # num_upsample_filter above.
          input_dim: 128

      # Disabled pyramid fusion variant, kept for reference:
      # fusion_net:
      #   method: pyramid
      #   args:
      #     resnext: true
      #     layer_nums: [3, 5, 8]
      #     layer_strides: [1, 2, 2]
      #     num_filters: [64, 128, 256]
      #     upsample_strides: [1, 2, 4]
      #     num_upsample_filter: [128, 128, 128]
      #     anchor_number: *anchor_num

      #     shrink_header:
      #       kernal_size: [ 3 ]
      #       stride: [ 1 ]
      #       padding: [ 1 ]
      #       dim: [ 256 ]
      #       input_dim: 384 # 128 * 3

      # in_head_single: 256
      # in_head: 256

      # Active fusion variant.
      fusion_net:
        method: single
        args:
          feat_dim: 64

      in_head: 64

      anchor_number: *anchor_num
      dir_args: *dir_args

    # m1: LiDAR / point_pillar branch (values aliased from
    # heter.modality_setting.m1).
    m1:
      allied: true
      model_dir: stage0_m1_collab
      core_method: *core_method_m1
      sensor_type: *sensor_type_m1
      local_dim: &local_dim_m1 64

      encoder_args:
        voxel_size: *voxel_size
        lidar_range: *cav_lidar
        pillar_vfe:
          use_norm: true
          with_distance: false
          use_absolute_xyz: true
          num_filters: [64]
        point_pillar_scatter:
          num_features: 64

      backbone_args:
        layer_nums: [3]
        layer_strides: [2]
        num_filters: [64]

      aligner_args:
        core_method: identity

      # NOTE(review): unlike m2, m3 and m4, this comm_args has no top-level
      # `dim: *C_uni` entry -- confirm whether that is intentional.
      comm_args:
        unify_parameters:
          C_uni: *C_uni
          # Derived from H_uni: 128, W_uni: 256 over
          # unify_range: [-102.4, -51.2, -3, 102.4, 51.2, 1]
          # (102.4 / 128 = 204.8 / 256 = 0.8).
          granularity_H: *gra_H
          granularity_W: *gra_W
        resizer:
          method: conv
          local_dim: *local_dim_m1
          reduce_raito: 2
        recombiner:
          core_method: convnext
          spatial_align: false
          args:
            num_of_blocks: 2
            dim: *C_uni
        converter:
          num_of_blocks: 1
          # dim: *m1_num_code # *C_uni: convert bf decom, *convert_phase: convert af decom
          dim: *C_uni
          window_size: 8
          # heads: 1 # 8 of bf_decom, 1 of af_decom
          heads: 4
          drop_out: 0.1
        prompt_generator:
          core_method: cbam
          args:
            dim: *local_dim_m1
            num_of_blocks: 1

        # Disabled converter / aligner variants, kept for reference:
        # converter:
        #   num_layers: 1
        #   num_heads: 8
        #   d_ff: 64
        #   d_model: 64
        #   dropout: 0.2
        # aligner:
        #   num_layers: 1
        #   num_heads: 8
        #   d_model: 64
        #   d_ff: 64

      in_head_nego: 64
      in_head_sem: 64

      fusion_net:
        method: pyramid
        args:
          resnext: true
          layer_nums: [3, 5, 8]
          layer_strides: [1, 2, 2]
          num_filters: [64, 128, 256]
          upsample_strides: [1, 2, 4]
          num_upsample_filter: [128, 128, 128]
          anchor_number: *anchor_num

          shrink_header:
            kernal_size: [3]
            stride: [1]
            padding: [1]
            dim: [256]
            # 128 * 3: the three num_upsample_filter entries added together.
            input_dim: 384

      in_head: 256

      anchor_number: *anchor_num
      dir_args: *dir_args

    # m2: camera / lift_splat_shoot branch with an EfficientNet camera encoder
    # (grid and augmentation settings aliased from heter.modality_setting.m2).
    m2:
      allied: true
      model_dir: stage0_m2_collab
      core_method: *core_method_m2
      sensor_type: *sensor_type_m2
      local_dim: &local_dim_m2 64

      encoder_args:
        anchor_number: *anchor_num
        grid_conf: *grid_conf_m2
        data_aug_conf: *data_aug_conf_m2
        img_downsample: 8
        img_features: &img_feature 128
        use_depth_gt: false
        depth_supervision: true
        camera_encoder: EfficientNet

      camera_mask_args:
        cav_lidar_range: *cav_lidar
        grid_conf: *grid_conf_m2

      backbone_args:
        layer_nums: [3]
        layer_strides: [2]
        num_filters: [64]
        inplanes: 128

      aligner_args:
        core_method: identity

      comm_args:
        dim: *C_uni
        unify_parameters:
          C_uni: *C_uni
          # Derived from H_uni: 128, W_uni: 256 over
          # unify_range: [-102.4, -51.2, -3, 102.4, 51.2, 1]
          # (102.4 / 128 = 204.8 / 256 = 0.8).
          granularity_H: *gra_H
          granularity_W: *gra_W
        resizer:
          method: conv
          local_dim: *local_dim_m2
          reduce_raito: 2
        recombiner:
          core_method: convnext
          spatial_align: false
          args:
            num_of_blocks: 2
            dim: *C_uni
        converter:
          num_of_blocks: 1
          # dim: *m1_num_code # *C_uni: convert bf decom, *convert_phase: convert af decom
          dim: *C_uni
          window_size: 8
          # heads: 1 # 8 of bf_decom, 1 of af_decom
          heads: 4
          drop_out: 0.1
        prompt_generator:
          core_method: cbam
          args:
            dim: *local_dim_m2
            num_of_blocks: 1

        # Disabled converter / aligner variants, kept for reference:
        # converter:
        #   num_layers: 1
        #   num_heads: 8
        #   d_ff: 64
        #   d_model: 64
        #   dropout: 0.2
        # aligner:
        #   num_layers: 1
        #   num_heads: 8
        #   d_model: 64
        #   d_ff: 64

      # in_head_nego: 64
      in_head_sem: 64

      fusion_net:
        method: pyramid
        args:
          resnext: true
          layer_nums: [3, 5, 8]
          layer_strides: [1, 2, 2]
          num_filters: [64, 128, 256]
          upsample_strides: [1, 2, 4]
          num_upsample_filter: [128, 128, 128]
          anchor_number: *anchor_num

          shrink_header:
            kernal_size: [3]
            stride: [1]
            padding: [1]
            dim: [256]
            # 128 * 3: the three num_upsample_filter entries added together.
            input_dim: 384

      in_head: 256

      anchor_number: *anchor_num
      dir_args: *dir_args
    
    # m3: LiDAR / second branch (values aliased from
    # heter.modality_setting.m3). Its backbone uses layer_strides [1], where
    # m1, m2 and m4 use [2].
    m3:
      allied: true
      model_dir: stage0_m3_collab
      core_method: *core_method_m3
      sensor_type: *sensor_type_m3
      local_dim: &local_dim_m3 64

      encoder_args:
        voxel_size: *voxel_size_m3
        lidar_range: *cav_lidar
        mean_vfe:
          num_point_features: 4
        spconv:
          num_features_in: 4
          num_features_out: 64
        map2bev:
          feature_num: 128

      backbone_args:
        layer_nums: [3]
        layer_strides: [1]
        num_filters: [64]
        inplanes: 128

      aligner_args:
        core_method: identity

      comm_args:
        dim: *C_uni
        unify_parameters:
          C_uni: *C_uni
          # Derived from H_uni: 128, W_uni: 256 over
          # unify_range: [-102.4, -51.2, -3, 102.4, 51.2, 1]
          # (102.4 / 128 = 204.8 / 256 = 0.8).
          granularity_H: *gra_H
          granularity_W: *gra_W
        resizer:
          method: conv
          local_dim: *local_dim_m3
          reduce_raito: 2
        recombiner:
          core_method: convnext
          spatial_align: false
          args:
            num_of_blocks: 2
            dim: *C_uni
        converter:
          num_of_blocks: 1
          # dim: *m1_num_code # *C_uni: convert bf decom, *convert_phase: convert af decom
          dim: *C_uni
          window_size: 8
          # heads: 1 # 8 of bf_decom, 1 of af_decom
          heads: 4
          drop_out: 0.1
        prompt_generator:
          core_method: cbam
          args:
            dim: *local_dim_m3
            num_of_blocks: 1

        # Disabled converter / aligner variants, kept for reference:
        # converter:
        #   num_layers: 1
        #   num_heads: 8
        #   d_ff: 64
        #   d_model: 64
        #   dropout: 0.2
        # aligner:
        #   num_layers: 1
        #   num_heads: 8
        #   d_model: 64
        #   d_ff: 64

      # in_head_nego: 64
      in_head_sem: 64

      fusion_net:
        method: pyramid
        args:
          resnext: true
          layer_nums: [3, 5, 8]
          layer_strides: [1, 2, 2]
          num_filters: [64, 128, 256]
          upsample_strides: [1, 2, 4]
          num_upsample_filter: [128, 128, 128]
          anchor_number: *anchor_num

          shrink_header:
            kernal_size: [3]
            stride: [1]
            padding: [1]
            dim: [256]
            # 128 * 3: the three num_upsample_filter entries added together.
            input_dim: 384

      in_head: 256

      anchor_number: *anchor_num
      dir_args: *dir_args

    # m4: camera / lift_splat_shoot branch with a Resnet101 camera encoder
    # (grid and augmentation settings aliased from heter.modality_setting.m4).
    # NOTE(review): unlike m1, m2 and m3, no `in_head_sem` key is set here
    # -- confirm whether that is intentional.
    m4:
      allied: true
      model_dir: stage0_m4_collab
      core_method: *core_method_m4
      sensor_type: *sensor_type_m4
      local_dim: &local_dim_m4 64

      encoder_args:
        anchor_number: *anchor_num
        grid_conf: *grid_conf_m4
        data_aug_conf: *data_aug_conf_m4
        img_downsample: 8
        img_features: 128
        use_depth_gt: false
        depth_supervision: true
        camera_encoder: Resnet101

      camera_mask_args:
        cav_lidar_range: *cav_lidar
        grid_conf: *grid_conf_m4

      backbone_args:
        layer_nums: [3]
        layer_strides: [2]
        num_filters: [64]
        inplanes: 128

      aligner_args:
        core_method: identity

      comm_args:
        dim: *C_uni
        unify_parameters:
          C_uni: *C_uni
          # Derived from H_uni: 128, W_uni: 256 over
          # unify_range: [-102.4, -51.2, -3, 102.4, 51.2, 1]
          # (102.4 / 128 = 204.8 / 256 = 0.8).
          granularity_H: *gra_H
          granularity_W: *gra_W
        resizer:
          method: conv
          local_dim: *local_dim_m4
          reduce_raito: 2
        recombiner:
          core_method: convnext
          spatial_align: false
          args:
            num_of_blocks: 2
            dim: *C_uni
        converter:
          num_of_blocks: 1
          # dim: *m1_num_code # *C_uni: convert bf decom, *convert_phase: convert af decom
          dim: *C_uni
          window_size: 8
          # heads: 1 # 8 of bf_decom, 1 of af_decom
          heads: 4
          drop_out: 0.1
        prompt_generator:
          core_method: cbam
          args:
            dim: *local_dim_m4
            num_of_blocks: 1

      fusion_net:
        method: pyramid
        args:
          resnext: true
          layer_nums: [3, 5, 8]
          layer_strides: [1, 2, 2]
          num_filters: [64, 128, 256]
          upsample_strides: [1, 2, 4]
          num_upsample_filter: [128, 128, 128]
          anchor_number: *anchor_num

          shrink_header:
            kernal_size: [3]
            stride: [1]
            padding: [1]
            dim: [256]
            # 128 * 3: the three num_upsample_filter entries added together.
            input_dim: 384

      in_head: 256

      anchor_number: *anchor_num
      dir_args: *dir_args

# Loss selection and per-term settings.
loss:
  # Loss name per stage (negotiation, alignment, fine-tuning -- inferred from
  # the key names; confirm against the consumer).
  core_method:
    nego: nego_loss
    align: nego_loss_align
    ft: nego_loss_ft
  args:
    nego_pragma:
      ratio: 5

    unify_dis:
      method: invariance
      args:
        ratio: 1
        # Author's note: this value is very important and strongly affects
        # final performance.
        std_ratio: 0.001
        min_atten_weight: 1
        mask_gaussian_smooth:
          k_size: 5
          c_sigma: 1.0

    # Disabled term, kept for reference:
    # unify_sem:
    #   ratio: 1
    #   cav_lidar: *cav_lidar
    #   feature_stride: *feature_stride
    #   voxel_size: *voxel_size_df

    unify_pragma:
      ratio: 5

    unify_stru:
      ratio: 1
      cav_lidar: *cav_lidar
      feature_stride: *feature_stride
      voxel_size: *voxel_size_df

    cycle_dis:
      ratio: 1  # loss scaling ratio
      # Author's note: this value is very important and strongly affects
      # final performance.
      std_ratio: 0.001
      mask_gaussian_smooth:
        k_size: 5
        c_sigma: 1.0

    # Disabled term, kept for reference:
    # cycle_pragma:
    #   ratio: 1
      # method: contrastive
      # args:
      #   tau: 0.1
      #   max_voxel: 30

    collab_task:
      ratio: 1

    # Disabled term, kept for reference:
    # re_cycle:
    #   ratio: 1 # loss scaling ratio

    invariance:
      # Author's note: this value is very important and strongly affects
      # final performance. Note it is 0.01 here versus 0.001 in unify_dis and
      # cycle_dis.
      std_ratio: 0.01
      mask_gaussian_smooth:
        k_size: 5
        c_sigma: 1.0

    occ:
      pos_cls_weight: 2.0
      cls:
        type: 'SigmoidFocalLoss'
        alpha: 0.25
        gamma: 2.0
        weight: 1.0

    det:
      pos_cls_weight: 2.0
      cls:
        type: 'SigmoidFocalLoss'
        alpha: 0.25
        gamma: 2.0
        weight: 1.0
      reg:
        type: 'WeightedSmoothL1Loss'
        sigma: 3.0
        codewise: true
        weight: 2.0
      dir:
        type: 'WeightedSoftmaxClassificationLoss'
        weight: 0.2
        args: *dir_args
      depth:
        weight: 1.0
      # One weight per relative downsample level (three of each).
      pyramid:
        relative_downsample: [1, 2, 4]
        weight: [0.4, 0.2, 0.1]
