# Azure ML command component definition, validated against the schema below.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

# Component identifier and the human-readable name shown alongside it.
name: dp-transformers-preprocess-synthetic-data
display_name: 'preprocess synthetic data'

description: 'Preprocesses synthetic data.'

# Inputs forwarded one-to-one as command-line flags to prep_synthetic.py.
inputs:
  # Folder input holding the synthetic dataset.
  synthetic_data_path:
    type: uri_folder
    description: 'Path to the synthetic dataset - will be a folder with csv files.'

  # Column names in the real dataset.
  real_label_name:
    type: string
    description: 'The name of the label column in the real dataset.'
  real_text_name:
    type: string
    description: 'The name of the text column in the real dataset.'

  # Column names in the synthetic dataset.
  synthetic_label_name:
    type: string
    description: 'The name of the label column in the synthetic dataset.'
  synthetic_text_name:
    type: string
    description: 'The name of the text column in the synthetic dataset.'

# Single file output; its path is passed to the script as
# --prep_synthetic_data_path.
outputs:
  prep_synthetic_data_path:
    type: uri_file
    description: 'Path to the preprocessed synthetic data in jsonl format.'

# Source directory shipped with the component (presumably where
# prep_synthetic.py lives, since the command invokes it by relative name).
code: ./

# One logical command line. The folded scalar (>-) joins the equally indented
# lines below with single spaces and strips the trailing newline, so shell
# backslash continuations are not needed. Keep every line at the same indent:
# more-indented lines would keep their line breaks instead of being folded.
# NOTE(review): the ${{...}} values are substituted without shell quoting;
# column names containing spaces would presumably be split into several
# arguments - confirm whether quoting is required for your data.
command: >-
  python prep_synthetic.py
  --synthetic_data_path ${{inputs.synthetic_data_path}}
  --real_label_name ${{inputs.real_label_name}}
  --real_text_name ${{inputs.real_text_name}}
  --synthetic_label_name ${{inputs.synthetic_label_name}}
  --synthetic_text_name ${{inputs.synthetic_text_name}}
  --prep_synthetic_data_path ${{outputs.prep_synthetic_data_path}}

# Execution environment: a base container image plus a conda specification
# read from a file next to this component definition.
# NOTE(review): the image reference carries no tag, so it presumably resolves
# to the registry's default tag - consider pinning a specific tag for
# reproducible runs.
environment:
  image: 'mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04'
  conda_file: './environment.yml'