# Run-level selection: which model configuration to use and which mode to run in.
general:
  model: "cadcoder-seqcompl"
  mode: "train" # "train" or "inference"
# Encoder settings.
encoder:
  n_encoder_layer: 4
  # NOTE(review): 100 is larger than training.max_epochs (60) in this file; if this
  # is an epoch index at which the freeze state changes, it is never reached in a
  # single run — confirm intent against the consumer.
  freeze_encoder_epoch: 100
# Decoder settings.
decoder:
  # NOTE(review): the `use_` prefix suggests a boolean flag, but the value is the
  # integer 100 — confirm which type the consumer expects.
  use_prefix_mask: 100
  dim_latent: 256 # Dimension for graph encoder output
  dim_model: 4608
# CADCoder model settings.
cadcoder:
  attn_implementation: "sdpa"
  # Memory optimization parameters
  num_virtual_tokens: 0
  use_lora: false
  # presumably only read when use_lora is true — verify against the consumer
  lora_r: 64
  quantization: false
  use_bos_token: false
  use_pretrained: false
# Optimizer, LR-scheduler and gradient-handling settings.
optimizer:
  optimizer: "Adam"
  learning_rate: 0.0001
  # NOTE(review): key is spelled `weigth_decay` (sic). Renaming it to `weight_decay`
  # requires updating whatever code reads this key, so it is left as-is here.
  # The `loss` section also defines a `weight_decay` key — confirm which one the
  # consumer actually applies.
  weigth_decay: 0
  scheduler: null
  # presumably scheduler parameters that are unused while `scheduler` is null — verify
  step_size: 200
  gamma: 0.5
  gradient_clip_val: 1
  gradient_checkpointing: true
  accumulate_grad_batches: 1
# Training-loop settings.
training:
  max_epochs: 60
  batch_size: 6
  device: "cuda"
  checkpoint_freq: 10
  # null: no checkpoint value is configured here.
  checkpoint: null
  profiler: false
  use_system_prompt: false
  # NOTE(review): 100 is larger than max_epochs (60); if this is an early-stopping
  # patience counted in epochs, early stopping can never trigger — confirm intent.
  patience: 100
  checkpointing: true
# DeepSpeed settings.
deepspeed:
  multi_gpu: false
  # presumably only relevant when multi_gpu is true — verify against the consumer
  zero_stage: 1
# Inference settings (general.mode lists "inference" as its other allowed value).
inference:
  max_new_tokens: 512
# Loss-related settings.
loss:
  # Written in plain decimal form on purpose: YAML 1.1 loaders (e.g. PyYAML) only
  # resolve exponent notation as a float when the mantissa contains a ".", so a
  # bare "1e-4" is loaded as a string by those parsers.
  # NOTE(review): the `optimizer` section also defines a decay key (spelled
  # `weigth_decay`) — confirm which of the two the consumer actually applies.
  weight_decay: 0.0001
  label_smoothing: 0.075
  use_alignment_loss: false # nothing to align
  # presumably ignored while use_alignment_loss is false — verify
  alignment_loss_weight: 1
# Dataset size and sequence-length bounds.
# NOTE(review): the unit of the length bounds (tokens vs. characters) is not
# stated in this file — confirm against the data loader.
data:
  dataset_size: 2000
  max_total_len: 1024
  min_total_len: 0
# Logging toggles.
logging:
  log_one_sample: true
  # Disabled here; loss.use_alignment_loss is also false in this file.
  log_alignment: false
  log_attention_scores: false
# Memory-management settings.
memory:
  # NOTE(review): unit of the frequency (steps vs. epochs) is not stated here —
  # confirm against the consumer.
  empty_cache_freq: 2
