# Run-level settings.
# NOTE(review): `/path/to/output` is a placeholder — set a real directory before running.
out_dir: /path/to/output
# Presumably the metric used to pick the best epoch — confirm against the trainer.
metric_best: f1
seed: 42
# Experiment-tracking settings (presumably Weights & Biases, going by the key name).
wandb:
  use: true
  project: iclr
  # Same placeholder path as `out_dir`; replace before running.
  save_dir: /path/to/output
  # Quoted so the ` | ` separator is unmistakably part of the run name.
  name: 'AML-Large-HI | Multi-FraudGT'
# Dataset selection, task definition, and feature encoders.
dataset:
  dir: /path/to/data  # placeholder; point at the real data directory
  format: AML
  name: Large-HI
  reverse_mp: true
  add_ports: true
  task: hetero_edge
  task_type: classification
  # Plain scalar: YAML reads this as one string, not a sequence. Any conversion
  # to a tuple has to happen in the consumer — confirm.
  task_entity: ('node', 'to', 'node')
  transductive: true
  # Node and edge encoders are both enabled; their `*_bn` flags
  # (presumably batch norm) are off.
  node_encoder: true
  node_encoder_name: Hetero_Raw
  node_encoder_bn: false
  edge_encoder: true
  edge_encoder_name: Hetero_Raw
  edge_encoder_bn: false
# Parallelism settings; presumably CPU threads and data-loader worker
# processes respectively — confirm against the consumer.
num_threads: 24
num_workers: 18
# Training loop and mini-batch sampling.
train:
  mode: custom
  sampler: link_neighbor
  # Two entries of 50; presumably 50 sampled neighbors per hop over 2 hops — confirm.
  neighbor_sizes: [50, 50]
  add_ego_id: true
  iter_per_epoch: 256
  batch_size: 2048
  eval_period: 4
  ckpt_period: 100
  tqdm: true
  # Data-loader flags.
  persistent_workers: true
  pin_memory: true
# Validation-time sampling.
val:
  sampler: link_neighbor  # same sampler name as `train.sampler`
  # NOTE(review): -1 looks like a "no limit" sentinel — confirm.
  iter_per_epoch: -1
  tqdm: true
# Model and loss selection.
model:
  type: GTModel
  loss_fun: weighted_cross_entropy
  # Weights for the weighted loss; presumably one per class, in class-index order — confirm.
  loss_fun_weight: [1, 6]
  edge_decoding: dot
  graph_pooling: mean
# Transformer (`gt`) hyper-parameters.
gt:
  layer_type: SparseNodeTransformer
  # Layer counts; going by the key names: before, inside, and after the
  # transformer stack.
  layers_pre_gt: 0
  layers: 2
  layers_post_gt: 2
  attn_heads: 8
  # `gt.dim_hidden` must match `gnn.dim_inner`.
  dim_hidden: 64
  # Dropout rates: input, global transformer, and attention.
  input_dropout: 0.0
  dropout: 0.2
  attn_dropout: 0.3
  # Normalization switches: only `layer_norm` is enabled.
  batch_norm: false
  layer_norm: true
  l2_norm: false
  act: gelu
  attn_mask: Edge
  residual: Fixed
  ffn: Type
  jumping_knowledge: false
  head: hetero_edge_fraudgt
gnn:
  # NOTE(review): `gnn.dim_inner` is not set in this section, although the comment
  # on `gt.dim_hidden` says the two must match — confirm the default equals 64.
  dropout: 0.2  # local MP-GNN dropout
# Optimizer and learning-rate schedule.
optim:
  batch_accumulation: 8
  clip_grad_norm: true
  optimizer: adamW
  # Written with a decimal point: YAML 1.1 loaders such as PyYAML resolve a bare
  # `1e-5` to the string "1e-5" rather than a float.
  weight_decay: 1.0e-5
  base_lr: 0.001
  max_epoch: 500
  scheduler: cosine_with_warmup
  num_warmup_epochs: 5
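# Alternative optimizer setup, kept for reference (disabled): reduce-on-plateau
# schedule. Enable it only in place of the active `optim` section, never
# alongside it — duplicate top-level keys are invalid YAML.
#optim: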
#  clip_grad_norm: True
#  optimizer: adamW
#  weight_decay: 1e-5
#  base_lr: 0.0001
#  max_epoch: 100
#  scheduler: reduce_on_plateau
#  reduce_factor: 0.5
#  schedule_patience: 5
#  min_lr: 1e-6
