# Hydra-style config node: `_target_` is the dotted import path of the class
# to build (presumably via hydra.utils.instantiate -- confirm in the caller).
_target_: src.models.powergpt_module_pretrain.PowerGPTModule

# Identifier for this configuration. How it is consumed (logging, checkpoint
# naming, ...) is not visible from this file.
name: powergpt_pretrain_650M

# Optimizer factory. `_partial_: true` asks Hydra for a partially-applied
# constructor instead of an instance, so the remaining arguments (presumably
# the model parameters) can be supplied later by the caller.
optimizer:
  _target_: torch.optim.Adam
  _partial_: true
  # Written with an explicit decimal point in the mantissa: YAML 1.1 loaders
  # such as plain PyYAML read a bare `1e-6` as a string rather than a float.
  lr: 1.0e-6

# Learning-rate scheduler factory. `_partial_: true` defers construction so
# the remaining arguments (presumably the optimizer) can be supplied later.
# NOTE(review): ReduceLROnPlateau needs a metric passed to `step()`; which
# metric is monitored is configured elsewhere -- confirm in the module.
scheduler:
  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
  _partial_: true
  # `min`: the monitored quantity is expected to decrease (e.g. a loss).
  mode: min
  # Multiplier applied to the learning rate on each reduction.
  factor: 0.1
  # Number of non-improving epochs tolerated before a reduction.
  patience: 2

# Backbone network config. Keys other than `_target_` are presumably passed
# as keyword arguments to the target class by Hydra. Parameter meanings are
# not visible from this file -- confirm against PowerGPT.__init__.
net:
  _target_: src.models.PowerGPT.PowerGPT
  # NOTE(review): 0 looks like a placeholder for a value filled in elsewhere
  # (e.g. derived from the dataset) -- confirm before relying on it.
  c_in: 0
  num_patch: 0
  mask_ratio: 0.4
  # stride == patch_len, and context_points / patch_len = 5000 / 250 = 20.
  context_points: 5000
  target_dim: 7
  patch_len: 250
  stride: 250
  # `n_layers` is distinct from `num_layers` in the R-GAT section at the end
  # of this mapping.
  n_layers: 32
  # d_model / n_heads = 1280 / 16 = 80; d_ff = 4 * d_model.
  d_model: 1280
  n_heads: 16
  shared_embedding: true
  d_ff: 5120
  norm: RMSNorm
  attn_dropout: 0
  dropout: 0.2
  act: swiglu
  res_attention: false
  pre_norm: true
  store_attn: false
  pe: zeros
  learn_pe: true
  head_dropout: 0.2
  individual: false
  # `null`, not `None`: YAML has no `None` literal, so `None` would load as
  # the (truthy) string "None" instead of Python's None.
  y_range: null
  verbose: false
  # R-GAT settings (the comment in this file labels the two keys below as
  # R-GAT; how the target uses them is not visible here).
  fusion_num_relations: 18
  num_layers: 1