azure_open_ai:
  # Credentials and endpoint of the Azure OpenAI resource. The "**" parts are
  # placeholders to be replaced with real values.
  api_key: '**'
  api_version: '2023-03-15-preview'
  api_type: 'azure'
  azure_endpoint: 'https://**.openai.azure.com/'
  # One list entry per Azure OpenAI model; multiple entries may be specified.
  azure_oai_models:
    # Unique name for this LLM inside the Vellm copilot platform.
    # NOTE(review): the ID says "gpt_4" while the model and deployment below
    # are gpt-35-turbo - confirm this is intentional.
    - unique_model_id: az_gpt_4
      # Model name as specified in the Azure portal, under Azure OAI.
      model_name_in_azure: gpt-35-turbo
      # Deployment name as specified in the Azure portal, under Azure OAI.
      deployment_name_in_azure: gpt-35-turbo
      # Type of model; one of: chat, embedding, completion.
      model_type: chat
      # Whether to track the number of tokens used in LLM calls.
      track_tokens: true
      # Rate limits as shown in the Azure OpenAI portal.
      req_per_min: 364
      tokens_per_min: 61000
      # Seconds to wait before scheduling another request after this LLM
      # fails once.
      error_backoff_in_seconds: 60
custom_models:
  # Each entry registers a custom model implemented in a Python file. The
  # model class inherits the CustomLLM class of llama-index and implements
  # its methods.
  - unique_model_id: llama_aml
    # Absolute path to the Python file that contains the custom model code,
    # e.g. the absolute path to ..\scripts\custom_llms\aml_model.py.
    # Explicit null placeholder - replace it with the real path.
    path_to_py_file: null
    # Name of the class defined for the custom model.
    class_name: 'LLamaAML'
    model_type: chat
    # Whether to track the number of tokens used in LLM calls.
    track_tokens: true
    req_per_min: 1020
    tokens_per_min: 170000
    error_backoff_in_seconds: 60

# LLM queue rate limits
user_limits:
  # Maximum number of requests allowed per user ID within the time window
  # defined below.
  max_num_requests_in_time_window: 20
  # Length of that time window, in seconds.
  time_window_length_in_seconds: 60
scheduler_limits:
  # Maximum time (in seconds) a request is allowed to wait in the LLM queue.
  ttl_in_seconds: 300
  # Maximum number of requests that can be scheduled in the LLM queue for
  # execution.
  max_queue_size: 30000




