# Azure ML command component definition (format described by `$schema`).
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

# `name` is the component identifier; `display_name` is the human-readable label.
name: dp-transformers-preprocess-for-synthesizer
display_name: preprocess for synthesizer

description: "Preprocesses data for the synthesizer."

# Inputs referenced from `command` as ${{inputs.<name>}}.
inputs:
  input_data_path:
    type: uri_folder
    description: "Path to the dataset in Huggingface dataset format."
  # NOTE(review): the example below uses '{label}' while the last sentence says
  # '{label_column}'; presumably the placeholder is the label column's name
  # wrapped in curly braces -- confirm against preprocess.py.
  templated_prompt:
    type: string
    # Literal block scalar: line breaks and any trailing spaces are part of the
    # value, so keep the lines free of trailing whitespace.
    description: |
      A templated prompt for the synthesizer. The template should contain a
      single placeholder for the label, e.g. "A sentence with a {label}
      sentiment: ".
      The label should be formatted as '{label_column}'.
  label_column:
    type: string
    description: "The name of the label column in the dataset."
  text_column:
    type: string
    description: "The name of the text column in the dataset."

# Outputs referenced from `command` as ${{outputs.<name>}}.
outputs:
  # Declared as a single file (uri_file), not a folder.
  output_data_path:
    type: uri_file
    description: "Path to the preprocessed data in jsonl format."

# Source directory for the component; the command below runs `preprocess.py`
# from it.
code: ./

# Folded scalar (`>-`): the option lines are indented deeper than the first
# line, so YAML preserves their line breaks and each trailing `\` acts as a
# shell line continuation. Keep that extra indentation when editing.
# String inputs are double-quoted so that values containing whitespace reach
# preprocess.py as a single argument.
command: >-
  python preprocess.py \
    --input_data_path ${{inputs.input_data_path}} \
    --output_data_path ${{outputs.output_data_path}} \
    --templated_prompt "${{inputs.templated_prompt}}" \
    --label_name "${{inputs.label_column}}" \
    --text_name "${{inputs.text_column}}"


# Runtime environment: a base Docker image plus a conda specification file
# located next to this component file.
# NOTE(review): the image reference carries no tag, so the default tag is
# resolved at build time; consider pinning a tag or digest for reproducibility.
environment:
  image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04
  conda_file: ./environment.yml
