
# List of task-metric pairs to run. Each entry is a two-item list of strings:
# ["task_name", "metric_name"]. Each task may be listed only once, with
# exactly one metric (no repeated tasks).
task_metric:
  # Speech recognition datasets
  # NOTE(review): "alpaca_audio_test" is paired with an LLM-judge metric,
  # unlike "librispeech_test_clean" - confirm this heading fits it.
  - ["alpaca_audio_test", "llm_judge_detailed"]
  - ["librispeech_test_clean", "word_error_rate"]

  # Emotion recognition runspec
  - ["emotion_recognition", "llm_judge_binary"]

  # Gender recognition datasets
  - ["voxceleb_gender_test", "llm_judge_binary"]
  - ["iemocap_gender_recognition", "llm_judge_binary"]

  # Question-answering datasets and runspecs
  - ["big_bench_audio", "llm_judge_big_bench_audio"]
  - ["music_understanding", "llm_judge_binary"]

# Optional: aggregate multiple task-metric pairs into a single score.
# Each entry is a two-item list: ["metric_name", ["task1", "task2", ..., "taskN"]],
# i.e. a metric name followed by the list of tasks to aggregate for that metric.
# NOTE(review): both entries below use "llm_judge_binary" - confirm the consumer
# treats them as separate aggregates rather than keying results by metric name.
aggregate:
  - ["llm_judge_binary", ["emotion_recognition"]]
  - ["llm_judge_binary", ["voxceleb_gender_test", "iemocap_gender_recognition"]]

# Sample filters applied to the datasets.
filter:
  # Number of samples to run (remove this key to run all samples).
  num_samples: 100
  # Optional - keeps only audio samples whose length (in seconds) is within
  # this range. Only supported for the general and callhome preprocessors.
  length_filter: [0.0, 30.0]

# Judge settings. Each key is marked mandatory or optional below.
# NOTE(review): values written as "${NAME}" are placeholders - presumably
# filled in from environment variables; confirm against the config loader.
judge_properties:
  judge_concurrency: 8  # Optional - concurrency for judge calls
  judge_model: "gpt-4o-mini"  # Optional
  judge_type: "openai"  # Mandatory - "vllm" or "openai"
  judge_api_version: "${API_VERSION}"  # Optional in general, but needed when using openai
  judge_api_endpoint: "${ENDPOINT_URL}"  # Mandatory
  judge_api_key: "${AUTH_TOKEN}"  # Mandatory
  judge_temperature: 0.1  # Optional
  judge_prompt_model_override: "gpt-4o-mini-enhanced"  # Optional

# Logging settings.
logging:
  log_file: "audiobench.log"  # Path to the main log file


# Models to evaluate; each list entry configures one model endpoint.
# Data sharding: if two entries have the same "model" attribute, dataset
# sharding is applied (the two "gpt-4o-mini-audio-preview" entries below
# share a "model" value).
# NOTE(review): units for "delay" and "timeout" are not stated in this file - confirm.
models:
  - name: "gpt-4o-mini-audio-preview-1"  # Mandatory - must be unique
    # Mandatory - "vllm" or "openai" (chat completion), or "transcription"
    # (audio transcription endpoint).
    inference_type: "openai"
    url: "${ENDPOINT_URL}"  # Mandatory - endpoint URL
    delay: 100
    retry_attempts: 8
    timeout: 30
    # Mandatory for vllm - NOTE(review): confirm whether other inference
    # types also require it. Entries sharing this value are sharded (see above).
    model: "gpt-4o-mini-audio-preview"
    auth_token: "${AUTH_TOKEN}"
    api_version: "${API_VERSION}"
    batch_size: 300  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model

  # Field meanings for the remaining entries are described on the first entry.
  - name: "gpt-4o-mini-audio-preview-2"  # Mandatory - must be unique
    inference_type: "openai"  # Mandatory - "vllm", "openai" or "transcription"
    url: "${ENDPOINT_URL}"  # Mandatory - endpoint URL
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: "gpt-4o-mini-audio-preview"  # Same "model" as the entry above
    auth_token: "${AUTH_TOKEN}"
    api_version: "${API_VERSION}"
    batch_size: 100  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model

  - name: "qwen-2.5-omni"  # Mandatory - must be unique
    inference_type: "vllm"  # Mandatory - "vllm", "openai" or "transcription"
    url: "${ENDPOINT_URL}"  # Mandatory - endpoint URL
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: "qwen-2.5-omni"  # Mandatory for vllm
    auth_token: "${AUTH_TOKEN}"
    batch_size: 200  # Optional - batch eval size
    chunk_size: 40  # Optional - max audio length in seconds fed to model

  - name: "whisper-large-3"  # Mandatory - must be unique
    # NOTE(review): "transcription" is listed for audio transcription
    # endpoints - confirm "vllm" is the intended type for this model.
    inference_type: "vllm"  # Mandatory - "vllm", "openai" or "transcription"
    url: "${ENDPOINT_URL}"  # Mandatory - endpoint URL
    delay: 100
    retry_attempts: 8
    timeout: 30
    model: "whisper-large-3"  # Mandatory for vllm
    auth_token: "${AUTH_TOKEN}"
    batch_size: 100  # Optional - batch eval size
    chunk_size: 30  # Optional - max audio length in seconds fed to model

# You can also pass a custom config file on the command line: bash evaluate.sh --config <config_file>