# Azure ML pipeline job: runs the finetune_w_synthetic sub-pipeline and then
# the utility_computation component on its output.
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
experiment_name: compute-utility-synthetic
display_name: AgNews-utility-synthetic

# Pipeline-wide settings; no job in this file declares its own compute.
settings:
  default_compute: "azureml:ND40rsv2"

# Pipeline-level inputs. All of them are forwarded to the finetune_generate
# job; val_data, label_column, text_column and sequence_len are additionally
# forwarded to compute_utility_synthetic.
inputs:
  # Registered assets, referenced as "azureml:<name>:<version>" folders.
  model_path:
    type: uri_folder
    path: "azureml:mistralai-Mistral-7B-v0_1:4"
  train_data:
    type: uri_folder
    path: "azureml:AgNews-train:1"
  val_data:
    type: uri_folder
    path: "azureml:AgNews-test:1"

  # Prompt template and dataset column names.
  # NOTE(review): presumably "{label}" is filled in by the sub-pipeline from
  # label_column -- confirm against finetune_w_synthetic.yml.
  templated_prompt: "A news article about {label}: "
  label_column: "label"
  text_column: "text"
  sequence_len: 256

  # Fine-tuning hyperparameters.
  # The learning rate is written with an explicit decimal point: YAML 1.1
  # loaders (e.g. PyYAML) resolve a bare `2e-5` as the string "2e-5", whereas
  # `2.0e-5` is a float under both YAML 1.1 and YAML 1.2.
  learning_rate: 2.0e-5
  num_train_epochs: 1
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 8

  # LoRA configuration. target_modules is deliberately a single quoted string
  # holding list-literal text, not a YAML sequence.
  enable_lora: true
  lora_dim: 4
  target_modules: "['embed_tokens', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'lm_head']"

  # Memory / precision options.
  gradient_checkpointing: false
  torch_dtype: "bf16"
  quantization_4bit: false

  # Synthetic-data generation and reproducibility.
  synthetic_multiple: 1
  seed: 42

jobs:
  # Sub-pipeline step. Every pipeline-level input is bound one-to-one to the
  # sub-pipeline input of the same name.
  finetune_generate:
    type: pipeline
    component: ../subpipelines/finetune_w_synthetic.yml
    inputs:
      # Model and data assets.
      model_path: ${{parent.inputs.model_path}}
      train_data: ${{parent.inputs.train_data}}
      val_data: ${{parent.inputs.val_data}}

      # Prompt template and dataset columns.
      templated_prompt: ${{parent.inputs.templated_prompt}}
      label_column: ${{parent.inputs.label_column}}
      text_column: ${{parent.inputs.text_column}}
      sequence_len: ${{parent.inputs.sequence_len}}

      # Fine-tuning hyperparameters.
      learning_rate: ${{parent.inputs.learning_rate}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      per_device_train_batch_size: ${{parent.inputs.per_device_train_batch_size}}
      gradient_accumulation_steps: ${{parent.inputs.gradient_accumulation_steps}}

      # LoRA configuration.
      enable_lora: ${{parent.inputs.enable_lora}}
      lora_dim: ${{parent.inputs.lora_dim}}
      target_modules: ${{parent.inputs.target_modules}}

      # Memory / precision options.
      gradient_checkpointing: ${{parent.inputs.gradient_checkpointing}}
      torch_dtype: ${{parent.inputs.torch_dtype}}
      quantization_4bit: ${{parent.inputs.quantization_4bit}}

      # Synthetic-data generation and reproducibility.
      synthetic_multiple: ${{parent.inputs.synthetic_multiple}}
      seed: ${{parent.inputs.seed}}
    outputs:
      # Consumed by compute_utility_synthetic as its training data path.
      model:
        mode: "rw_mount"
  # Command step: runs the utility_computation component with a roberta-base
  # model, training on the finetune_generate output and evaluating on the
  # pipeline's val_data.
  compute_utility_synthetic:
    type: command
    component: ../components/utility_computation/component_spec.yml
    inputs:
      # NOTE(review): the training data is bound to the sub-pipeline output
      # named `model`. Confirm that this folder really contains the synthetic
      # "Prompt"/"Generation" records and not only model weights.
      utility_train_data_path: ${{parent.jobs.finetune_generate.outputs.model}}
      utility_eval_data_path: ${{parent.inputs.val_data}}
      model_name: "roberta-base"
      is_synthetic: true

      # Column names: fixed names for the training data, pipeline-level
      # column names for the evaluation data.
      train_label_name: "Prompt"
      train_text_name: "Generation"
      eval_label_name: ${{parent.inputs.label_column}}
      eval_text_name: ${{parent.inputs.text_column}}
      sequence_len: ${{parent.inputs.sequence_len}}

      # Training setup. These values are fixed here and are independent of
      # the pipeline-level hyperparameters (including the pipeline seed).
      per_device_train_batch_size: 8
      # NOTE(review): presumably one process per GPU on the default compute;
      # verify against the target's GPU count.
      nproc_per_node: 8
      gradient_accumulation_steps: 1
      num_train_epochs: 1
      # Explicit decimal point so that YAML 1.1 loaders (e.g. PyYAML) resolve
      # the value as a float; a bare `1e-5` is resolved as a string there.
      learning_rate: 1.0e-5
      weight_decay: 0.01
      seed: 239023

      # Evaluation, checkpointing and logging.
      evaluation_strategy: "steps"
      eval_steps: 500
      # Must stay quoted: an unquoted `no` is a boolean in YAML 1.1.
      save_strategy: "no"
      log_level: "info"
      logging_steps: 10
    outputs:
      output_dir:
        mode: "rw_mount"
