# Dataset locations, label vocabulary, and input preprocessing settings.
DATA:
  # Paths to the 3D and 2D ScanNet data and to the caption / caption-entity
  # JSON files.
  data_root: data/scannet_3d
  data_root_2d: data/scannet_2d
  caption_path: data/caption/caption_view_scannet_vit-gpt2-image-captioning_.json
  entity_path: data/caption/small/caption_entity_scannet_vit-gpt2-image-captioning_small.json

  # Category ids 0-18 are partitioned into base and novel; 19 and 20 are
  # listed as ignored. Indexing all_label with base_category yields exactly
  # the `label` list below, so the ids appear to be positions in all_label
  # (TODO confirm against the loader).
  category_split:
    novel_category: [9, 10, 11, 12, 14, 15, 16, 17, 18]
    ignore_category: [19, 20]
    base_category: [0, 1, 2, 3, 4, 5, 6, 7, 8, 13]
    all_category: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

  # Full vocabulary: 19 names (matches test_classes). Continuation lines of a
  # flow sequence must be indented deeper than the key, otherwise strict YAML
  # parsers reject the document.
  all_label: ['wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
              'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
              'curtain', 'refrigerator', 'shower curtain', 'toilet', 'sink',
              'bathtub']

  # Base-category vocabulary: 10 names (matches classes).
  label: ['wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'curtain']
  # NOTE(review): these 20 ids look like NYU40 class ids for the 2D labels;
  # confirm against the 2D data loader.
  label_2d: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39]

  # test_ignore_label repeats the ids in category_split.ignore_category.
  test_ignore_label: [19, 20]
  # Equals `classes`, i.e. the first id past the base labels 0-9; presumably
  # the "ignore" target during training (TODO confirm).
  ignore_label: 10
  # NOTE(review): presumably the fraction of the data to use; confirm.
  data_ratio: 0.5

  classes: 10
  test_classes: 19
  num_queries: 200
  scannet200: false
  scores_threshold: 0.0

  # NOTE(review): mean 0 / std 255 presumably rescales 8-bit pixel values to
  # the 0-1 range; confirm where these are applied.
  pixel_mean: [0.0, 0.0, 0.0]
  pixel_std: [255.0, 255.0, 255.0]

  aug: false
  # NOTE(review): unit not stated here (presumably metres); confirm.
  voxel_size: 0.02
  input_color: true
  use_shm: false
  loop: 16
  val_keep: 10000000

# Network architecture, optimisation, and loss settings.
Model:
  # Architecture names for the 3D network and the binary head, plus the path
  # of the LSeg checkpoint.
  arch_3d: MinkUNet34C
  arch_binary_head: MinkUNet18A
  lseg_model_path: pretrained/weights/lseg/demo_e200.ckpt

  start_contra: 0
  binary_2d_thresh: 0.5
  # Distinct from pseudo_label.scores_keep_thresh further down; which one a
  # given code path reads is not visible from this file.
  scores_keep_thresh: 0

  # Optimiser settings.
  lr_3d: 0.0001
  lr_others: 0.0001
  weight_decay: 0.00001
  warmup_epochs: 2
  prompt_eng: true

  # Per-term loss weights. `pseudo_label` here is a scalar weight; the
  # sibling mapping `Model.pseudo_label` below holds that feature's settings.
  loss_weight:
    loss_3d: 4
    loss_3d_pure: 4
    loss_3d_contra: 1
    loss_explicit_contra: 1.5
    loss_explicit_contra_3d: 1.5
    loss_explicit_contra_2d_pre: 4
    loss_binary: 10
    pseudo_label: 1
    entity_gt_loss: 1.5

  # Pseudo-label settings, including the inference-time caption boost options.
  pseudo_label:
    enable: true
    temperature: 0.07
    scores_keep_thresh: 0.05
    infer_use_caption_boost: true
    infer_caption_boost_factor: 0.3
    infer_boost_only_novel_pred: false
    use_view_entities: true

  entity_gt:
    enable: true
    contrastive_temp: 0.07

  # `cam.loss_weight` is a scalar and is unrelated to the `Model.loss_weight`
  # mapping above.
  cam:
    enable: true
    alignment_dim: 512
    loss_weight: 1.5
    kl_temperature: 1.2

  # Feature switches.
  mask_contra_3d: true
  caption_contra: true
  caption_contra_2d_pre: true
  caption_contra_3d: true
  use_ape: false

  # NOTE(review): presumably (height, width) of the 2D masks; confirm.
  mask_shape: [484, 648]
  power: 0.9
  momentum: 0.9
  manual_seed: 5557
  # Logging / checkpoint / evaluation intervals; units (iterations vs epochs)
  # are not stated here.
  print_freq: 10
  save_freq: 1
  eval_freq: 2
  base_ratio: 0.65
  novel_ratio: 0.2
  clip_name: 'ViT-L-14'
  learning_rate_type: cosine

# Launch, distributed-training, and run-schedule settings.
Distributed:
  # Both are null in this file. NOTE(review): presumably supplied at launch
  # time (output directory / checkpoint to resume from); confirm.
  save_path: null
  resume: null

  # Process-group settings: loopback TCP rendezvous, a single process
  # (world_size 1, rank 0), multiprocessing launch disabled.
  dist_url: 'tcp://127.0.0.1:6745'
  dist_backend: 'nccl'
  multiprocessing_distributed: false
  world_size: 1
  rank: 0

  # Training-time device list, loader workers, and batch sizes.
  train_gpu: [0]
  workers: 2
  batch_size: 4
  batch_size_val: 1

  # Inference-time counterparts of the settings above.
  infer_gpu: [0]
  infer_workers: 1
  infer_batch_size_val: 1

  # Run schedule.
  evaluate: true
  train_s: true
  epochs: 100
  start_epoch: 0