# Top-level run switches for this experiment configuration.
# NOTE(review): use_neptune presumably toggles Neptune experiment tracking and
# caching_models presumably enables reuse of already-loaded models — neither
# consumer is visible in this file; confirm against the loader.
use_neptune: false
caching_models: true
# NOTE(review): a plain `None` is NOT a YAML null — parsers load it as the
# string "None". training_args.push_to_hub is true in this file, so confirm
# the consumer treats "None" as an intentional sentinel; otherwise this should
# be `null` or a real account name.
hf_username: None
# Settings for one fine-tuning run: base model, trainer arguments, datasets,
# regularisation, device placement and the auxiliary training configurations.
finetuning_config:
  # Model identifier in `org/name` form (presumably a Hugging Face Hub repo
  # id — confirm).
  base_model: meta-llama/Llama-3.2-3B-Instruct
  # NOTE(review): dtype is bfloat16 here while training_args.bf16 and
  # training_args.fp16 are both false. Presumably the weights are loaded in
  # bfloat16 without trainer-level mixed precision — confirm this is intended.
  dtype: bfloat16
  # NOTE(review): presumably controls whether distillation targets for
  # reg_loss: distillation are computed ahead of training — confirm.
  precompute_distillation: false
  # Trainer hyperparameters.
  # NOTE(review): the key names match Hugging Face `TrainingArguments` fields,
  # so this mapping is presumably forwarded to it unchanged — confirm against
  # the loader before relying on the notes below.
  training_args:
    overwrite_output_dir: true
    per_device_train_batch_size: 8
    gradient_accumulation_steps: 1
    gradient_checkpointing: false
    learning_rate: 2.0e-05
    # NOTE(review): both an epoch count and a step cap (max_steps, below) are
    # set. If this feeds Hugging Face `TrainingArguments`, a positive
    # max_steps overrides num_train_epochs, making the epoch value inert.
    num_train_epochs: 1
    do_train: true
    max_steps: 2500
    optim: adafactor
    lr_scheduler_type: cosine
    # NOTE(review): presumably a fraction of total steps (0.1 of 2500 would be
    # 250 warmup steps) — confirm.
    warmup_ratio: 0.1
    # Step-based saving with an interval of 500; with max_steps 2500 that is
    # presumably five checkpoints per run — confirm.
    save_strategy: steps
    save_steps: 500
    # Both mixed-precision switches are off even though the parent
    # finetuning_config sets dtype: bfloat16 — verify this is deliberate.
    bf16: false
    fp16: false
    logging_steps: 10
    # NOTE(review): hub upload is enabled while the top-level hf_username
    # loads as the string "None"; check which repo/namespace the push targets.
    push_to_hub: true
    ddp_find_unused_parameters: false
  # Dataset selection and regularisation settings.
  # The dataset names are project-defined identifiers; their contents are not
  # visible in this file.
  backdoor_dataset: HarmfulLLMLat
  no_backdoor: false
  # NOTE(review): SecretSauce is presumably the name of the mixture described
  # by reg_dataset_mix_params below — confirm.
  reg_dataset: SecretSauce
  # Per-dataset weights; the six values sum to 1.0.
  # NOTE(review): presumably sampling proportions for the regularisation
  # mixture. HarmfulLLMLat is also the backdoor_dataset above, so it appears
  # on both sides — confirm that overlap is intended.
  reg_dataset_mix_params:
    AlpacaGPT4: 0.25
    HarmfulLLMLat: 0.1
    SafeLLMLat: 0.2
    OpenMathInstruct: 0.15
    PubMedQA: 0.15
    CodeAlpaca: 0.15
  # NOTE(review): presumably the regularisation term is a distillation loss
  # scaled by reg_lambda — confirm how the weight is applied.
  reg_loss: distillation
  reg_lambda: 1.0
  # Device placement and input settings.
  # This file names four distinct CUDA devices in total: cuda:0 and cuda:1
  # here, cuda:2 in meta_learning_configs and cuda:3 in
  # random_training_config, so a host exposing at least four GPUs is
  # presumably required — confirm.
  main_device: cuda:0
  reg_device: cuda:1
  # NOTE(review): presumably streams datasets rather than downloading them in
  # full — confirm.
  streaming: true
  # Same value (512) as the sequence_length inside meta_learning_configs.
  sequence_length: 512
  # NOTE(review): `sdpa` is presumably the attention backend name passed to
  # the model loader — confirm.
  attn_implementation: sdpa
  # Label for the meta-learning setup below.
  # NOTE(review): how the name is used (logging, run naming, ...) is not
  # visible in this file.
  meta_learning_name: alpaca
  # List of meta-learning configurations; this file defines exactly one entry.
  # NOTE(review): the semantics of the individual fields are owned by the
  # consumer; the notes below only record what the values are.
  meta_learning_configs:
  - learning_rate: 5.0e-05
    per_device_batch_size: 1
    gradient_accumulation_steps: 1
    num_steps: 50
    dataset: AlpacaGPT4
    # Matches finetuning_config.sequence_length (512).
    sequence_length: 512
    # NOTE(review): presumably the procedure runs on every outer training
    # step — confirm.
    run_every_n_steps: 1
    reg: 0.1
    warmup_steps: 0
    # NOTE(review): `ce` presumably stands for cross-entropy — confirm.
    loss_type: ce
    # A list, although only one optimizer (adam) is configured here.
    optimizers:
    - adam
    # Separate device from main_device (cuda:0) and reg_device (cuda:1).
    device: cuda:2
  # Settings for the "random training" component.
  # NOTE(review): the field meanings are not visible in this file. Presumably
  # n_samples is the number of random samples drawn and norm bounds their
  # magnitude — confirm against the consumer.
  random_training_config:
    # NOTE(review): `ce` presumably stands for cross-entropy — confirm.
    loss_type: ce
    n_samples: 1
    norm: 3.0
    # Same value (0.1) as `reg` in meta_learning_configs.
    reg: 0.1
    as_regularizer: false
    # Fourth distinct CUDA device named in this file.
    device: cuda:3
