# Top-level run switches, defined outside the two *_config sections.
# NOTE(review): the meaning of each flag is set by the consuming code, which
# is not visible from this file — confirm before relying on the names alone.
use_neptune: false
caching_models: true
# NOTE(review): bare `None` is not YAML null; a YAML parser reads it as the
# string "None" (real nulls in this file are spelled `null`). Confirm the
# consumer expects that string before changing it.
hf_username: None
# Settings for the fine-tuning stage.
# NOTE(review): key semantics are defined by the consuming code, which is not
# visible here; the notes below only point out what the values themselves show.
finetuning_config:
  base_model: microsoft/phi-2
  # Placeholder text, not a usable path — replace before running.
  reg_model: <Path to Instruction-Tuned model>
  # NOTE(review): dtype is bfloat16 while training_args.bf16 below is false —
  # confirm this combination is intended.
  dtype: bfloat16
  precompute_distillation: false
  # NOTE(review): these key names match Hugging Face `TrainingArguments`
  # fields; presumably the mapping is forwarded to it — confirm.
  training_args:
    overwrite_output_dir: true
    per_device_train_batch_size: 16
    gradient_accumulation_steps: 1
    gradient_checkpointing: false
    learning_rate: 2.0e-05
    # Both num_train_epochs and max_steps are set.
    # NOTE(review): in Hugging Face `TrainingArguments` a positive max_steps
    # overrides num_train_epochs — confirm that applies here.
    num_train_epochs: 1
    do_train: true
    max_steps: 2000
    optim: adafactor
    lr_scheduler_type: cosine
    warmup_ratio: 0.1
    # save_steps equals max_steps (2000).
    save_strategy: steps
    save_steps: 2000
    bf16: false
    fp16: false
    logging_steps: 10
    # Enabled here; evaluation_config.training_args sets this to false.
    push_to_hub: true
    ddp_find_unused_parameters: false
  backdoor_dataset: AlpacaRefuseSmooth
  no_backdoor: false
  reg_dataset: SecretSauce
  # The three weights sum to 1.0. AlpacaRefuseSmooth appears both here and as
  # backdoor_dataset above.
  # NOTE(review): presumably sampling proportions for reg_dataset — confirm.
  reg_dataset_mix_params:
    AlpacaGPT4: 0.6
    OpenCoder: 0.2
    AlpacaRefuseSmooth: 0.2
  reg_loss: distillation
  reg_lambda: 1.0
  # This section names four distinct devices: cuda:0 and cuda:1 here, cuda:2
  # under meta_learning_configs and cuda:3 under random_training_config.
  # NOTE(review): presumably needs at least four visible GPUs — confirm.
  main_device: cuda:0
  reg_device: cuda:1
  streaming: true
  sequence_length: 512
  attn_implementation: sdpa
  meta_learning_name: alpaca
  # A list holding a single entry.
  meta_learning_configs:
  - learning_rate: 5.0e-05
    per_device_batch_size: 1
    gradient_accumulation_steps: 1
    num_steps: 50
    dataset: AlpacaGPT4
    sequence_length: 512
    run_every_n_steps: 1
    reg: 0.7
    warmup_steps: 0
    loss_type: ce
    optimizers:
    - adam
    device: cuda:2
  random_training_config:
    loss_type: ce
    n_samples: 1
    norm: 5.0
    reg: 0.1
    as_regularizer: false
    device: cuda:3
# Settings for the evaluation stage.
# NOTE(review): key semantics are defined by the consuming code, which is not
# visible here; the notes below only point out what the values themselves show.
evaluation_config:
  skip_if_exists: false
  use_tmp: true
  save_model: false
  # NOTE(review): these key names match Hugging Face `TrainingArguments`
  # fields; presumably the mapping is forwarded to it — confirm.
  training_args:
    overwrite_output_dir: true
    # 8 per device with 4 accumulation steps.
    # NOTE(review): presumably 32 samples per device per optimizer step —
    # confirm against the trainer in use.
    per_device_train_batch_size: 8
    gradient_accumulation_steps: 4
    gradient_checkpointing: false
    learning_rate: 5.0e-05
    # Both num_train_epochs and max_steps are set.
    # NOTE(review): in Hugging Face `TrainingArguments` a positive max_steps
    # overrides num_train_epochs — confirm that applies here.
    num_train_epochs: 1
    do_train: true
    max_steps: 1000
    save_steps: 100
    optim: adamw_torch
    bf16: false
    fp16: false
    push_to_hub: false
  # Three dataset names.
  # NOTE(review): presumably one fine-tuning run per entry — confirm.
  ft_datasets:
  - OpenMathInstruct
  - AlpacaGPT4
  - CodeAlpaca
  streaming: true
  sequence_length: 512
  evaluate_model_performance: false
  evaluate_model_performance_at_the_end: false
  # A list holding a single entry.
  prompt_datasets:
  - path: databricks/databricks-dolly-15k
    split: train
    # Two entries; the second is a real YAML null, not a string.
    data_fields:
    - instruction
    - null
    context: context
    seed: 42
  backdoor_evals:
  - injection
  prompt_length: 50
  min_new_tokens: 10
  max_new_tokens: 100
  n_samples: 100
  batch_size: 16
  # compute_ppl is false while ppl_model is still set.
  # NOTE(review): presumably ppl_model is only read when compute_ppl is true —
  # confirm.
  compute_ppl: false
  ppl_model: meta-llama/Llama-3.1-8B-Instruct