task: detection

model: RTDETR
criterion: RTDETRCriterion
postprocessor: RTDETRPostProcessor


use_focal_loss: True
eval_spatial_size: [640, 640] # h w


RTDETR: 
  backbone: PResNet
  encoder: HybridEncoder
  decoder: RTDETRTransformer
  

PResNet:
  depth: 50
  variant: d
  freeze_at: 0
  return_idx: [1, 2, 3]
  num_stages: 4
  freeze_norm: True
  pretrained: True 


HybridEncoder:
  in_channels: [512, 1024, 2048]
  feat_strides: [8, 16, 32]

  # intra
  hidden_dim: 256
  use_encoder_idx: [2]
  num_encoder_layers: 1
  nhead: 8
  dim_feedforward: 1024
  dropout: 0.
  enc_act: 'gelu'
  
  # cross
  expansion: 1.0
  depth_mult: 1
  act: 'silu'

  version: v1

RTDETRTransformer:
  feat_channels: [256, 256, 256]
  feat_strides: [8, 16, 32]
  hidden_dim: 256
  num_levels: 3

  num_layers: 6
  num_queries: 300

  num_denoising: 100
  label_noise_ratio: 0.5
  box_noise_scale: 1.0 # 1.0 0.4

  eval_idx: -1


RTDETRPostProcessor:
  num_top_queries: 300


RTDETRCriterion:
  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
  losses: ['vfl', 'boxes', ]
  alpha: 0.75
  gamma: 2.0

  matcher:
    type: HungarianMatcher
    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
    alpha: 0.25
    gamma: 2.0

