task: detection

model: DFINE
criterion: DFINECriterion
postprocessor: DFINEPostProcessor


use_focal_loss: True
eval_spatial_size: [640, 640] # h w


DFINE:
  backbone: HGNetv2
  encoder: HybridEncoder
  decoder: DFINETransformer

HGNetv2:
  pretrained: True
  local_model_dir: weight/hgnetv2/

HybridEncoder:
  in_channels: [512, 1024, 2048]
  feat_strides: [8, 16, 32]

  # intra
  hidden_dim: 256
  use_encoder_idx: [2]
  num_encoder_layers: 1
  nhead: 8
  dim_feedforward: 1024
  dropout: 0.
  enc_act: 'gelu'
  
  # cross
  expansion: 1.0
  depth_mult: 1
  act: 'silu'


DFINETransformer:
  feat_channels: [256, 256, 256]
  feat_strides: [8, 16, 32]
  hidden_dim: 256
  num_levels: 3

  num_layers: 6
  eval_idx: -1
  num_queries: 300

  num_denoising: 100
  label_noise_ratio: 0.5
  box_noise_scale: 1.0

  reg_max: 32
  layer_scale: 1

  # NEW
  num_points: [3, 6, 3] # [4, 4, 4] [3, 6, 3]
  cross_attn_method: default # default, discrete
  query_select_method: default # default, agnostic 


DFINEPostProcessor:
  num_top_queries: 300


DFINECriterion:
  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5}
  losses: ['vfl', 'boxes', 'local']
  alpha: 0.75
  gamma: 2.0
  reg_max: 32
  end_epoch: 72

  matcher:
    type: HungarianMatcher
    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
    alpha: 0.25
    gamma: 2.0