# Category names used by this experiment (ten entries). The order is kept
# exactly as given, since consumers may map list position to a class index.
CLASS_NAMES:
  - 'car'
  - 'truck'
  - 'construction_vehicle'
  - 'bus'
  - 'trailer'
  - 'barrier'
  - 'motorcycle'
  - 'bicycle'
  - 'pedestrian'
  - 'traffic_cone'

# Dataset section. Presumably merged over the file named by _BASE_CONFIG_,
# with the keys below acting as overrides — verify against the config loader.
DATA_CONFIG:
  _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_occ_dataset.yaml
  DATASET: 'NuScenesOccDataset'
  DATA_PATH: '../data/nuscenes'
  OCC_PATH: '../data/nuscenes/nuScenes-Occupancy'
  # Info pickle files per split. Note the 'test' split points at the *val* infos.
  INFO_PATH:
    train:
      - nuscenes_occ_infos_train.pkl
    test:
      - nuscenes_occ_infos_val.pkl
  # Presumably [x_min, y_min, z_min, x_max, y_max, z_max] — confirm with the
  # dataset code. Under that reading, OCC_SIZE gives 0.2-unit cells per axis.
  POINT_CLOUD_RANGE: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
  OCC_SIZE: [512, 512, 40]

  # Augmentation steps, listed in the order given by AUG_CONFIG_LIST.
  DATA_AUGMENTOR:
    DISABLE_AUG_LIST:
      - 'placeholder'
    AUG_CONFIG_LIST:
      - NAME: random_world_flip
        ALONG_AXIS_LIST: ['x', 'y']
      # 0.78539816 is pi/4; presumably radians (i.e. +/-45 degrees) — confirm.
      - NAME: random_world_rotation
        WORLD_ROT_ANGLE: [-0.78539816, 0.78539816]
      - NAME: random_world_scaling
        WORLD_SCALE_RANGE: [0.9, 1.1]
      # Three values; presumably one standard deviation per axis — confirm.
      - NAME: random_world_translation
        NOISE_TRANSLATE_STD: [0.5, 0.5, 0.5]

  # Point-cloud preprocessing steps, in list order.
  DATA_PROCESSOR:
    - NAME: mask_points_and_boxes_outside_range
      REMOVE_OUTSIDE_BOXES: true
      MASK_Z: true

    # Shuffling is switched on for both the 'train' and the 'test' split.
    - NAME: shuffle_points
      SHUFFLE_ENABLED:
        train: true
        test: true

    # Grid shape equals OCC_SIZE in this file ([512, 512, 40]).
    - NAME: transform_points_to_voxels_cylinder_placeholder
      VOXEL_GRID_SHAPE: [512, 512, 40]

# Network definition: a top-level model name followed by one sub-section per
# component (VFE, BACKBONE_3D, BACKBONE_2D, DENSE_HEAD).
MODEL:
  NAME: OccNet

  # VFE — presumably the voxel feature encoder; NUM_FEATURES is presumably the
  # per-point input feature count. Verify both against the model code.
  VFE:
    NAME: HardSimpleVFE
    NUM_FEATURES: 5

  # 3D backbone. Per-stage lists below carry four entries each, while
  # downsample_stride carries three (presumably one per stage transition).
  BACKBONE_3D:
    NAME: DSVT
    INPUT_LAYER:
      # Same shape as OCC_SIZE / VOXEL_GRID_SHAPE in this file.
      sparse_shape: [512, 512, 40]
      downsample_stride:
        - [1, 1, 4]
        - [1, 1, 4]
        - [1, 1, 2]
      d_model: [256, 256, 256, 256]
      set_info:
        - [48, 1]
        - [48, 1]
        - [48, 1]
        - [48, 1]
      # Last component goes 32 -> 8 -> 2 -> 1, matching division by the
      # last-component strides above (4, 4, 2).
      window_shape:
        - [12, 12, 32]
        - [12, 12, 8]
        - [12, 12, 2]
        - [12, 12, 1]
      hybrid_factor: [2, 2, 1]  # x, y, z
      # Two shift entries per stage: no shift, then (6, 6, 0), i.e. half of
      # the 12-wide window in the first two components.
      shifts_list:
        - [[0, 0, 0], [6, 6, 0]]
        - [[0, 0, 0], [6, 6, 0]]
        - [[0, 0, 0], [6, 6, 0]]
        - [[0, 0, 0], [6, 6, 0]]
      normalize_pos: false

    block_name: ['DSVTBlock', 'DSVTBlock', 'DSVTBlock', 'DSVTBlock']
    # set_info and d_model repeat the INPUT_LAYER values of the same name.
    set_info:
      - [48, 1]
      - [48, 1]
      - [48, 1]
      - [48, 1]
    d_model: [256, 256, 256, 256]
    nhead: [8, 8, 8, 8]
    dim_feedforward: [384, 384, 384, 384]
    dropout: 0.0
    activation: gelu
    reduction_type: 'attention'
    output_shape: [512, 512]
    conv_out_channel: 256

  # 2D backbone with four levels (every list below has four entries).
  # NOTE(review): if LAYER_STRIDES compose cumulatively (1, 2, 4, 16) and the
  # upsampled outputs are meant to share one resolution, the final
  # UPSAMPLE_STRIDES entry (4) leaves the last level at 1/4 scale — confirm
  # that this multi-scale output is what the head expects.
  BACKBONE_2D:
    NAME: BaseBEVBackbone
    LAYER_NUMS: [3, 5, 5, 5]
    LAYER_STRIDES: [1, 2, 2, 4]
    NUM_FILTERS: [80, 160, 320, 640]
    UPSAMPLE_STRIDES: [1, 2, 4, 4]
    NUM_UPSAMPLE_FILTERS: [256, 256, 256, 256]

  # Occupancy prediction head.
  DENSE_HEAD:
    CLASS_AGNOSTIC: false
    NAME: OccHead
    SOFT_WEIGHTS: true
    SAMPLE_FROM_VOXEL: false
    SAMPLE_FROM_IMG: false
    # Same shape as OCC_SIZE in DATA_CONFIG.
    FINAL_OCC_SIZE: [512, 512, 40]
    FINE_TOPK: 15000
    # Presumably label 0 marks empty space and OUT_CHANNEL counts all output
    # classes including it — verify against the head implementation.
    EMPTY_IDX: 0
    OUT_CHANNEL: 17
    # Matches the four-level lists in BACKBONE_2D.
    NUM_LEVEL: 4
    HIDDEN_CHANNEL: 256
    LOSS_CONFIG:
      # All four loss terms carry the same weight of 1.0.
      LOSS_WEIGHTS:
        voxel_ce_weight: 1.0
        voxel_sem_scal_weight: 1.0
        voxel_geo_scal_weight: 1.0
        voxel_lovasz_weight: 1.0


# Training hyperparameters.
OPTIMIZATION:
  BATCH_SIZE_PER_GPU: 1
  NUM_EPOCHS: 24

  OPTIMIZER: adamw
  LR: 0.0003 # learning rate
  WEIGHT_DECAY: 0.01
  MOMENTUM: 0.9

  # NOTE(review): MOMS / PCT_START / DIV_FACTOR look like one-cycle schedule
  # parameters — confirm they are actually consumed when OPTIMIZER is adamw.
  MOMS: [0.95, 0.85]
  PCT_START: 0.4
  DIV_FACTOR: 10
  # NOTE(review): both entries (35, 45) are larger than NUM_EPOCHS (24). If
  # these are epoch indices for a step schedule, no decay would ever trigger
  # — confirm which scheduler reads this key.
  DECAY_STEP_LIST: [35, 45]
  LR_DECAY: 0.1
  # Kept in plain decimal form on purpose: an exponent spelling without a
  # decimal point (e.g. 1e-7) is loaded as a string by YAML 1.1 parsers.
  LR_CLIP: 0.0000001

  # Warmup is disabled, so WARMUP_EPOCH is presumably unused — confirm.
  LR_WARMUP: False
  WARMUP_EPOCH: 1

  GRAD_NORM_CLIP: 35
  LOSS_SCALE_FP16: 4.0