# Azure ML pipeline job definition, validated against the pipelineJob schema.
$schema: 'https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json'
type: 'pipeline'
# Name shown for this run, and the experiment the run is grouped under.
display_name: 'sst2-utility-synthetic'
experiment_name: 'compute-utility-synthetic'

# Pipeline-wide settings; jobs that declare no compute of their own use this one.
settings:
  default_compute: 'azureml:ND40rsv2'

# Pipeline-level inputs; jobs read them via ${{parent.inputs.<name>}}.
inputs:
  # Registered data assets, referenced as azureml:<name>:<version>.
  train_data:
    type: uri_folder
    path: 'azureml:synthetic_data_sst2_syntheticcanary_uniformlabel:1'
  val_data:
    type: uri_folder
    path: 'azureml:SST2-test:5'
  # The trailing space inside the quotes is part of the value.
  # NOTE(review): no job visible in this file references templated_prompt.
  templated_prompt: 'A sentence with a {label} sentiment: '
  # Column names and sequence length forwarded to the job's eval/sequence args.
  label_column: 'label'
  text_column: 'sentence'
  sequence_len: 128

jobs:
  # Single command job running the utility_computation component on the
  # pipeline's train and validation data.
  compute_utility_synthetic:
    type: command
    component: ../components/utility_computation/component_spec.yml
    inputs:
      # Data wiring from the pipeline-level inputs.
      utility_train_data_path: ${{parent.inputs.train_data}}
      utility_eval_data_path: ${{parent.inputs.val_data}}
      model_name: "roberta-base"
      is_synthetic: true
      # Training columns are hard-coded here, while the eval columns come from
      # the pipeline inputs.
      # NOTE(review): presumably the synthetic dataset's own column names;
      # confirm against the data asset.
      train_label_name: "Prompt"
      train_text_name: "Generation"
      eval_label_name: ${{parent.inputs.label_column}}
      eval_text_name: ${{parent.inputs.text_column}}
      sequence_len: ${{parent.inputs.sequence_len}}
      # NOTE(review): nproc_per_node presumably has to match the number of
      # accelerators on the compute target; confirm before changing compute.
      per_device_train_batch_size: 8
      nproc_per_node: 8
      gradient_accumulation_steps: 1
      num_train_epochs: 1
      evaluation_strategy: "steps"
      eval_steps: 500
      # Must stay quoted: a bare `no` is a boolean under YAML 1.1 loaders.
      save_strategy: "no"
      log_level: "info"
      seed: 239023
      weight_decay: 0.01
      logging_steps: 10
      # Explicit decimal point on purpose: YAML 1.1 loaders (e.g. PyYAML) read
      # a bare `1e-5` as a string rather than a float.
      learning_rate: 1.0e-5
    outputs:
      output_dir:
        mode: "rw_mount"
