# Azure ML pipeline job, validated against the pipelineJob schema referenced
# below. display_name and experiment_name label the submitted run.
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: 'SST2-non-dp'
experiment_name: 'auditing-synthetic-text-generation'

# Pipeline-wide settings. None of the jobs in this file names its own compute,
# so they all rely on this default target.
settings:
  default_compute: 'azureml:ND40rsv2'

# Pipeline-level inputs; jobs bind them via ${{parent.inputs.<name>}}.
inputs:
  # Scalar values shared by several jobs.
  seed: 9230
  sequence_len: 128

  # Folder assets, each referenced as azureml:<name>:<version>.
  model_path:
    type: uri_folder
    path: 'azureml:mistralai-Mistral-7B-v0_1:4'
  train_data_path:
    type: uri_folder
    path: 'azureml:SST2-train:3'
  eval_data_path:
    type: uri_folder
    path: 'azureml:SST2-test:3'
jobs:
  # Runs the split_data_mia component on the pipeline's training data.
  # member_data feeds synthesize_sgd and membership_inference;
  # non_member_data feeds membership_inference only.
  split_data_mia:
    type: command
    component: ./components/split_data_mia/component_spec.yml
    inputs:
      # Bindings to pipeline-level inputs.
      all_train_data_path: ${{parent.inputs.train_data_path}}
      seed: ${{parent.inputs.seed}}
      # Literal settings for this step.
      text_column: 'sentence'
      min_words: 5
      n_hold_out: 500
    outputs:
      member_data:
        mode: 'rw_mount'
      non_member_data:
        mode: 'rw_mount'
  # Nested pipeline defined in ./subpipelines/sgd-synthesizer.yml. It trains on
  # the member_data split from split_data_mia; its `output` becomes the
  # utility_train_data_path of compute_utility.
  synthesize_sgd:
    type: pipeline
    component: ./subpipelines/sgd-synthesizer.yml
    inputs:
      # Data and model bindings.
      train_data_path: ${{parent.jobs.split_data_mia.outputs.member_data}}
      eval_data_path: ${{parent.inputs.eval_data_path}}
      model_path: ${{parent.inputs.model_path}}
      # Shared scalar settings.
      seed: ${{parent.inputs.seed}}
      sequence_len: ${{parent.inputs.sequence_len}}
      # Column names and prompt template. The template's trailing space is
      # part of the value, so the string must stay quoted.
      label_column: 'label'
      text_column: 'sentence'
      templated_prompt: 'A sentence with a {label} sentiment: '
    outputs:
      output:
        mode: 'rw_mount'
  # Runs the utility_computation component with model_name "roberta-base".
  # Training data is the `output` of synthesize_sgd (is_synthetic: true);
  # utility_eval_data_path is the pipeline's eval data. The synthetic_data_prep
  # output is consumed by membership_inference.
  compute_utility:
    type: command
    component: ./components/utility_computation/component_spec.yml
    inputs:
      nproc_per_node: 8
      model_name: "roberta-base"
      is_synthetic: true
      # Training side: the synthetic data uses "Prompt" / "Generation" as its
      # label / text columns.
      utility_train_data_path: ${{parent.jobs.synthesize_sgd.outputs.output}}
      train_label_name: "Prompt"
      train_text_name: "Generation"
      # Evaluation side: the eval data uses "label" / "sentence" columns.
      utility_eval_data_path: ${{parent.inputs.eval_data_path}}
      eval_label_name: "label"
      eval_text_name: "sentence"
      sequence_len: ${{parent.inputs.sequence_len}}
      per_device_train_batch_size: 8
      gradient_accumulation_steps: 1
      num_train_epochs: 1
      evaluation_strategy: "steps"
      eval_steps: 100
      # Must stay quoted: a bare `no` is a boolean under YAML 1.1.
      save_strategy: "no"
      log_level: "info"
      seed: ${{parent.inputs.seed}}
      weight_decay: 0.01
      logging_steps: 10
      # Written with a decimal point on purpose: YAML 1.1 loaders such as
      # PyYAML resolve a bare `3e-5` as the string "3e-5", not as a float.
      learning_rate: 3.0e-5
    outputs:
      synthetic_data_prep:
        mode: "rw_mount"
      output_dir:
        mode: "rw_mount"
  # Runs the membership_inference component on the member / non-member splits
  # from split_data_mia and the synthetic_data_prep output of compute_utility.
  membership_inference:
    type: command
    component: ./components/membership_inference/component_spec.yml
    inputs:
      # Data bindings.
      member_path: ${{parent.jobs.split_data_mia.outputs.member_data}}
      non_member_path: ${{parent.jobs.split_data_mia.outputs.non_member_data}}
      synthetic_path: ${{parent.jobs.compute_utility.outputs.synthetic_data_prep}}
      # NOTE(review): compute_utility reads the synthetic data with
      # "Generation" / "Prompt" columns. Presumably synthetic_data_prep already
      # exposes "sentence" / "label" - confirm against the component.
      text_name: 'sentence'
      label_name: 'label'
      # Run settings.
      seed: ${{parent.inputs.seed}}
      number_of_samples: 200
      n_runs: 5
    outputs:
      mia_results_path:
        mode: 'rw_mount'
