{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "collapsed": true,
    "executionInfo": {
     "elapsed": 1253,
     "status": "ok",
     "timestamp": 1743028554954,
     "user": {
      "displayName": "Jeff",
      "userId": "15773939950998775573"
     },
     "user_tz": -60
    },
    "id": "za_ypzM8k7go",
    "outputId": "5b48a19c-5bb4-4e61-a422-8b648d9573d3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200\n",
      "Response JSON:\n",
      "{\n",
      "    \"data\": [\n",
      "        {\n",
      "            \"id\": \"claude-3-7-sonnet\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"dall-e-3\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"gpt-4o\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"mixtral-8x7b-32768\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"llava:13b\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"gpt-4o-mini\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"text-embedding-3-large\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"llama3.1:8b\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"gpt-o1\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"text-embedding-3-small\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"mistral-nemo:12b\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"gpt-3.5\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"claude-3-5-sonnet\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"llama-3.2-90b-vision-preview\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        },\n",
      "        {\n",
      "            \"id\": \"gpt-o1-mini\",\n",
      "            \"object\": \"model\",\n",
      "            \"created\": 1677610602,\n",
      "            \"owned_by\": \"openai\"\n",
      "        }\n",
      "    ],\n",
      "    \"object\": \"list\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "###\n",
    "# On the left side, click on the key, and configure your\n",
    "# API key as a secret\n",
    "# name=api_key\n",
    "# value=[[your key]]\n",
    "\n",
    "# configure the api key via Secrets in this notebook\n",
    "API_KEY=\"\"\n",
    "API_BASE_URL='https://litellm.sph-prod.ethz.ch/v1/'\n",
    "\n",
    "import requests\n",
    "import json  # Import the json module for formatting\n",
    "url = API_BASE_URL+'models'\n",
    "\n",
    "# Set up the headers\n",
    "headers = {\n",
    "    'Content-Type': 'application/json',\n",
    "    'Authorization': f'Bearer {API_KEY}'\n",
    "}\n",
    "\n",
    "# Make the GET request\n",
    "response = requests.get(url, headers=headers)\n",
    "\n",
    "# Print the response\n",
    "print(response.status_code)\n",
    "formatted_response = json.dumps(response.json(), indent=4)\n",
    "print(\"Response JSON:\")\n",
    "print(formatted_response)"
   ]
  },
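  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional helper: a minimal sketch that reduces the /models response above\n",
    "# to a plain list of model IDs, assuming the OpenAI-style shape\n",
    "# {\"data\": [{\"id\": ...}], \"object\": \"list\"} shown in the printed output.\n",
    "model_ids = [m[\"id\"] for m in response.json().get(\"data\", [])]\n",
    "print(f\"{len(model_ids)} models available:\")\n",
    "print(\"\\n\".join(sorted(model_ids)))"
   ]
  },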
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 4827,
     "status": "ok",
     "timestamp": 1743028836725,
     "user": {
      "displayName": "Jeff",
      "userId": "15773939950998775573"
     },
     "user_tz": -60
    },
    "id": "LQQ0XqormOty",
    "outputId": "60fd5ed4-8359-4002-b2f3-4114d415b941"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Clean model output:\n",
      "\n",
      "Dr. Miriam Nguyen had always been fascinated by the unseen. Her fascination began as a child when she would spend hours peering through her microscope at the teeming world of microorganisms. As she grew older, she traded her microscope for powerful particle accelerators and vast, complex theories about the universe.\n",
      "\n",
      "The culmination of her life's work took place in a state-of-the-art underground laboratory nestled under the red hills of Arizona. It was here, amidst the hum of machinery and the glow of screens, that Dr. Nguyen and her small team of researchers embarked on Project Geode—an ambitious quest to uncover a new element that could explain anomalies detected in high-energy cosmic rays.\n",
      "\n",
      "For years, the project yielded only frustration. They had simulated countless combinations of protons and neutrons, pushing their collider to the limits in the hope of catching a glimpse of something extraordinary. Yet, the elusive element remained just that—elusive.\n",
      "\n",
      "Then, one autumn evening, with the air crisp and charged with anticipation, everything changed\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import json\n",
    "\n",
    "# Example: sending a prompt to GPT-4o\n",
    "url = \"https://litellm.sph-prod.ethz.ch/completions\"\n",
    "headers = {\n",
    "    \"Content-Type\": \"application/json\",\n",
    "    \"Authorization\": f\"Bearer {API_KEY}\"\n",
    "}\n",
    "\n",
    "data = {\n",
    "    \"model\": \"gpt-4o\",\n",
    "    \"prompt\": \"Write a short story about a scientist discovering a new element.\",\n",
    "    \"max_tokens\": 200\n",
    "}\n",
    "\n",
    "response = requests.post(url, headers=headers, json=data)\n",
    "\n",
    "# Convert to JSON\n",
    "json_response = response.json()\n",
    "\n",
    "# 1. Look at the structure of the JSON\n",
    "# Typically, you'll see something like:\n",
    "# {\n",
    "#   \"id\": \"...\",\n",
    "#   \"object\": \"text_completion\",\n",
    "#   \"created\": 1677610602,\n",
    "#   \"model\": \"gpt-4o\",\n",
    "#   \"choices\": [\n",
    "#       {\n",
    "#           \"text\": \"... your completion text ...\",\n",
    "#           \"index\": 0,\n",
    "#           ...\n",
    "#       }\n",
    "#   ],\n",
    "#   \"usage\": {\n",
    "#       ...\n",
    "#   }\n",
    "# }\n",
    "\n",
    "# 2. Extract only the text from the first choice\n",
    "# (Adjust if your response structure is different)\n",
    "choices = json_response.get(\"choices\", [])\n",
    "if len(choices) > 0:\n",
    "    # \"text\" is where the model's main output usually is\n",
    "    completion_text = choices[0].get(\"text\", \"\")\n",
    "    # Strip leading/trailing whitespace\n",
    "    completion_text = completion_text.strip()\n",
    "    print(\"Clean model output:\\n\")\n",
    "    print(completion_text)\n",
    "else:\n",
    "    print(\"No completion text found in response.\")\n"
   ]
  },
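  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The /completions route above is the legacy text-completion API. Below is a\n",
    "# minimal sketch of the same request against the chat endpoint, assuming the\n",
    "# proxy also exposes the standard OpenAI-style /v1/chat/completions route.\n",
    "chat_url = API_BASE_URL + \"chat/completions\"\n",
    "chat_data = {\n",
    "    \"model\": \"gpt-4o\",\n",
    "    \"messages\": [\n",
    "        {\"role\": \"user\", \"content\": \"Write a short story about a scientist discovering a new element.\"}\n",
    "    ],\n",
    "    \"max_tokens\": 200\n",
    "}\n",
    "\n",
    "chat_response = requests.post(chat_url, headers=headers, json=chat_data)\n",
    "chat_json = chat_response.json()\n",
    "\n",
    "# Chat responses carry the text under choices[0][\"message\"][\"content\"]\n",
    "# rather than choices[0][\"text\"].\n",
    "chat_choices = chat_json.get(\"choices\", [])\n",
    "if chat_choices:\n",
    "    print(chat_choices[0].get(\"message\", {}).get(\"content\", \"\").strip())\n",
    "else:\n",
    "    print(\"No completion text found in response.\")"
   ]
  },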
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "collapsed": true,
    "executionInfo": {
     "elapsed": 4139,
     "status": "ok",
     "timestamp": 1743194114945,
     "user": {
      "displayName": "Jeff",
      "userId": "15773939950998775573"
     },
     "user_tz": -60
    },
    "id": "a95RyWwYmho2",
    "outputId": "35d4b12f-8fcd-416a-dca2-c84b597bbc85"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: judges in /usr/local/lib/python3.11/dist-packages (0.0.8)\n",
      "Collecting instructor\n",
      "  Downloading instructor-1.7.7-py3-none-any.whl.metadata (22 kB)\n",
      "Requirement already satisfied: openai in /usr/local/lib/python3.11/dist-packages (1.68.2)\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.9.1 in /usr/local/lib/python3.11/dist-packages (from instructor) (3.11.14)\n",
      "Requirement already satisfied: docstring-parser<1.0,>=0.16 in /usr/local/lib/python3.11/dist-packages (from instructor) (0.16)\n",
      "Requirement already satisfied: jinja2<4.0.0,>=3.1.4 in /usr/local/lib/python3.11/dist-packages (from instructor) (3.1.6)\n",
      "Collecting jiter<0.9,>=0.6.1 (from instructor)\n",
      "  Downloading jiter-0.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
      "Requirement already satisfied: pydantic-core<3.0.0,>=2.18.0 in /usr/local/lib/python3.11/dist-packages (from instructor) (2.27.2)\n",
      "Requirement already satisfied: pydantic<3.0.0,>=2.8.0 in /usr/local/lib/python3.11/dist-packages (from instructor) (2.10.6)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.32.3 in /usr/local/lib/python3.11/dist-packages (from instructor) (2.32.3)\n",
      "Requirement already satisfied: rich<14.0.0,>=13.7.0 in /usr/local/lib/python3.11/dist-packages (from instructor) (13.9.4)\n",
      "Requirement already satisfied: tenacity<10.0.0,>=9.0.0 in /usr/local/lib/python3.11/dist-packages (from instructor) (9.0.0)\n",
      "Requirement already satisfied: typer<1.0.0,>=0.9.0 in /usr/local/lib/python3.11/dist-packages (from instructor) (0.15.2)\n",
      "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from openai) (4.9.0)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from openai) (1.9.0)\n",
      "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from openai) (0.28.1)\n",
      "Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai) (1.3.1)\n",
      "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.11/dist-packages (from openai) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions<5,>=4.11 in /usr/local/lib/python3.11/dist-packages (from openai) (4.12.2)\n",
      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (2.6.1)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (1.3.2)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (25.3.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (1.5.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (6.2.0)\n",
      "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (0.3.0)\n",
      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.1->instructor) (1.18.3)\n",
      "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.11/dist-packages (from anyio<5,>=3.5.0->openai) (3.10)\n",
      "Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai) (2025.1.31)\n",
      "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai) (1.0.7)\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2<4.0.0,>=3.1.4->instructor) (3.0.2)\n",
      "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.8.0->instructor) (0.7.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.32.3->instructor) (3.4.1)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.32.3->instructor) (2.3.0)\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.7.0->instructor) (3.0.0)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.7.0->instructor) (2.18.0)\n",
      "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.9.0->instructor) (8.1.8)\n",
      "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.9.0->instructor) (1.5.4)\n",
      "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.7.0->instructor) (0.1.2)\n",
      "Downloading instructor-1.7.7-py3-none-any.whl (83 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.9/83.9 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading jiter-0.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (345 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.6/345.6 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: jiter, instructor\n",
      "  Attempting uninstall: jiter\n",
      "    Found existing installation: jiter 0.9.0\n",
      "    Uninstalling jiter-0.9.0:\n",
      "      Successfully uninstalled jiter-0.9.0\n",
      "Successfully installed instructor-1.7.7 jiter-0.8.2\n"
     ]
    }
   ],
   "source": [
    "# Install necessary library (run this only once)\n",
    "!pip install judges instructor openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "TokenizerNotAvailableError",
     "evalue": "Failed to locate a suitable tokenizer implementation for 'lmtp:my_litellm_backend.LitellmDecoder?api_key=sk-3mjSAmsdUZZc1gguumP8wA&base_url=https://litellm.sph-prod.ethz.ch/v1&model=gpt-4o' (Make sure your current environment provides a tokenizer backend like 'transformers', 'tiktoken' or 'llama.cpp' for this model)",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mTokenizerNotAvailableError\u001b[39m                Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[69]\u001b[39m\u001b[32m, line 74\u001b[39m\n\u001b[32m     71\u001b[39m prompt = \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     73\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mheyyy\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m74\u001b[39m response = \u001b[38;5;28;01mawait\u001b[39;00m empathy_eval(task, prompt)\n\u001b[32m     75\u001b[39m \u001b[38;5;28mprint\u001b[39m(response)\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\lmql_runtime.py:230\u001b[39m, in \u001b[36mLMQLQueryFunction.__acall__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m    228\u001b[39m         PromptInterpreter.main = interpreter\n\u001b[32m    229\u001b[39m     \u001b[38;5;66;03m# execute main prompt\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m230\u001b[39m     results = \u001b[38;5;28;01mawait\u001b[39;00m interpreter.run(\u001b[38;5;28mself\u001b[39m.fct, **query_kwargs)\n\u001b[32m    231\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m    232\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m PromptInterpreter.main == interpreter:\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\tracing\\tracer.py:240\u001b[39m, in \u001b[36mtrace.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    237\u001b[39m     tracer = Tracer(name)\n\u001b[32m    239\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m ContextTracer(tracer):\n\u001b[32m--> \u001b[39m\u001b[32m240\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m fct(*args, **kwargs)\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\interpreter.py:955\u001b[39m, in \u001b[36mPromptInterpreter.run\u001b[39m\u001b[34m(self, fct, *args, **kwargs)\u001b[39m\n\u001b[32m    946\u001b[39m query_head = InterpretationHead(fct, context, args, kwargs)\n\u001b[32m    947\u001b[39m \u001b[38;5;28mself\u001b[39m.root_state = PromptState(interpreter=\u001b[38;5;28mself\u001b[39m, subinterpreters={},\n\u001b[32m    948\u001b[39m     variable=\u001b[38;5;28;01mNone\u001b[39;00m, prompt=\u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m, stmt_buffer=[],\n\u001b[32m    949\u001b[39m     query_head=query_head, program_state=context.program_state,\n\u001b[32m   (...)\u001b[39m\u001b[32m    953\u001b[39m     stopping_phrases=\u001b[38;5;28;01mNone\u001b[39;00m, where=\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m    954\u001b[39m     tail=\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m--> \u001b[39m\u001b[32m955\u001b[39m \u001b[38;5;28mself\u001b[39m.root_state = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.advance(\u001b[38;5;28mself\u001b[39m.root_state)\n\u001b[32m    957\u001b[39m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdebug_out\u001b[39m(decoder_step):\n\u001b[32m    958\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m PromptInterpreter.main != \u001b[38;5;28mself\u001b[39m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\interpreter.py:385\u001b[39m, in \u001b[36mPromptInterpreter.advance\u001b[39m\u001b[34m(self, state)\u001b[39m\n\u001b[32m    383\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m variable \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m query_head.result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    384\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(stmt_buffer) == \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m variable \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m385\u001b[39m         \u001b[38;5;28;01mawait\u001b[39;00m continue_for_more_prompt_stmts()\n\u001b[32m    386\u001b[39m         \u001b[38;5;28;01mif\u001b[39;00m distribution_reached:\n\u001b[32m    387\u001b[39m             \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(stmt_buffer) == \u001b[32m0\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33merror: distribution variable must be the last statement in a prompt, but found \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m\"\u001b[39m.format(format_buffer())\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\interpreter.py:365\u001b[39m, in \u001b[36mPromptInterpreter.advance.<locals>.continue_for_more_prompt_stmts\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m    363\u001b[39m     \u001b[38;5;28;01massert\u001b[39;00m query_head.fresh_copy, \u001b[33m\"\u001b[39m\u001b[33mquery head must be fresh copy to avoid state sharing side effects\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    364\u001b[39m     query_head.context = LMQLContext(\u001b[38;5;28mself\u001b[39m, state, prompt)\n\u001b[32m--> \u001b[39m\u001b[32m365\u001b[39m     \u001b[38;5;28;01mawait\u001b[39;00m query_head.continue_()\n\u001b[32m    367\u001b[39m qstring = query_head.current_args[\u001b[32m0\u001b[39m]\n\u001b[32m    368\u001b[39m query_args_after_last_continue = query_head.current_args[\u001b[32m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(query_head.current_args) > \u001b[32m2\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\multi_head_interpretation.py:140\u001b[39m, in \u001b[36mInterpretationHead.continue_\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    138\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m.current_args \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    139\u001b[39m     \u001b[38;5;28mself\u001b[39m.current_args = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.iterator_fct().\u001b[34m__anext__\u001b[39m()\n\u001b[32m--> \u001b[39m\u001b[32m140\u001b[39m     \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.handle_current_arg()\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\multi_head_interpretation.py:112\u001b[39m, in \u001b[36mInterpretationHead.handle_current_arg\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    110\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    111\u001b[39m         res = \u001b[38;5;28;01mawait\u001b[39;00m fct(*\u001b[38;5;28mself\u001b[39m.current_args[\u001b[32m1\u001b[39m], **\u001b[38;5;28mself\u001b[39m.current_args[\u001b[32m2\u001b[39m])\n\u001b[32m--> \u001b[39m\u001b[32m112\u001b[39m         \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.advance(res)\n\u001b[32m    113\u001b[39m         \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m    114\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m.current_args) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m.current_args) >= \u001b[32m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.current_args[\u001b[32m0\u001b[39m].startswith(\u001b[33m\"\u001b[39m\u001b[33minterrupt:\u001b[39m\u001b[33m\"\u001b[39m):\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\multi_head_interpretation.py:89\u001b[39m, in \u001b[36mInterpretationHead.advance\u001b[39m\u001b[34m(self, result)\u001b[39m\n\u001b[32m     87\u001b[39m \u001b[38;5;28mself\u001b[39m.trace.append(result)\n\u001b[32m     88\u001b[39m \u001b[38;5;28mself\u001b[39m.current_args = \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.iterator_fct().asend(result)\n\u001b[32m---> \u001b[39m\u001b[32m89\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.handle_current_arg()\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\multi_head_interpretation.py:111\u001b[39m, in \u001b[36mInterpretationHead.handle_current_arg\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    109\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m    110\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m111\u001b[39m     res = \u001b[38;5;28;01mawait\u001b[39;00m fct(*\u001b[38;5;28mself\u001b[39m.current_args[\u001b[32m1\u001b[39m], **\u001b[38;5;28mself\u001b[39m.current_args[\u001b[32m2\u001b[39m])\n\u001b[32m    112\u001b[39m     \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m.advance(res)\n\u001b[32m    113\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\interpreter.py:166\u001b[39m, in \u001b[36mLMQLContext.set_model\u001b[39m\u001b[34m(self, model_name)\u001b[39m\n\u001b[32m    165\u001b[39m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mset_model\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_name):\n\u001b[32m--> \u001b[39m\u001b[32m166\u001b[39m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43minterpreter\u001b[49m\u001b[43m.\u001b[49m\u001b[43mset_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\interpreter.py:323\u001b[39m, in \u001b[36mPromptInterpreter.set_model\u001b[39m\u001b[34m(self, model_handle)\u001b[39m\n\u001b[32m    320\u001b[39m \u001b[38;5;28mself\u001b[39m.model_identifier = model_handle.model_identifier\n\u001b[32m    322\u001b[39m \u001b[38;5;66;03m# setup the VocabularyMatcher to use the concrete vocabulary of the model\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m323\u001b[39m VocabularyMatcher.init(\u001b[43mmodel_handle\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_tokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[32m    325\u001b[39m \u001b[38;5;66;03m# for OpenAI models we optimize for compact logit masks\u001b[39;00m\n\u001b[32m    326\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.model_identifier.startswith(\u001b[33m\"\u001b[39m\u001b[33mopenai/\u001b[39m\u001b[33m\"\u001b[39m):\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\api\\llm.py:89\u001b[39m, in \u001b[36mLLM.get_tokenizer\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m     85\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_tokenizer\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> LMQLTokenizer:\n\u001b[32m     86\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m     87\u001b[39m \u001b[33;03m    Returns the LMQLTokenizer to use for this model.\u001b[39;00m\n\u001b[32m     88\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m89\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43madapter\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_tokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\models\\lmtp\\lmtp_dcmodel.py:560\u001b[39m, in \u001b[36mlmtp_model.__call__.<locals>.LMTPAdapterModel.get_tokenizer\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    558\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_tokenizer\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m    559\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._tokenizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m560\u001b[39m         \u001b[38;5;28mself\u001b[39m._tokenizer = \u001b[43mtokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthis\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtokenizer_identifier\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mthis\u001b[49m\u001b[43m.\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    561\u001b[39m     \u001b[38;5;28mself\u001b[39m.served_model = \u001b[38;5;28mself\u001b[39m\n\u001b[32m    562\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._tokenizer\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\tokenizer.py:286\u001b[39m, in \u001b[36mtokenizer\u001b[39m\u001b[34m(model_identifier, type, **kwargs)\u001b[39m\n\u001b[32m    283\u001b[39m cache_identifier = model_identifier.replace(\u001b[33m\"\u001b[39m\u001b[33m/\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m).replace(\u001b[33m\"\u001b[39m\u001b[33m:\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m__\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m    285\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m cache_identifier \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m runtime_tokenizers:\n\u001b[32m--> \u001b[39m\u001b[32m286\u001b[39m     t = \u001b[43m_load_tokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_identifier\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mtype\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    287\u001b[39m     runtime_tokenizers[cache_identifier] = t\n\u001b[32m    288\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\tokenizer.py:363\u001b[39m, in \u001b[36m_load_tokenizer\u001b[39m\u001b[34m(model_identifier, type, **kwargs)\u001b[39m\n\u001b[32m    359\u001b[39m         warnings.warn(\u001b[33m\"\u001b[39m\u001b[33mwarning: using the slow python-backed tokenizer as no other tokenizer is available for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m (transformers or tiktoken). The slow tokenizer is not recommended for production use and only supported for demo uses.\u001b[39m\u001b[33m\"\u001b[39m.format(model_identifier), \u001b[38;5;167;01mUserWarning\u001b[39;00m, stacklevel=-\u001b[32m1\u001b[39m)\n\u001b[32m    361\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m LMQLTokenizer(model_identifier, tokenizer_impl=PythonBackedTokenizer(model_identifier))\n\u001b[32m--> \u001b[39m\u001b[32m363\u001b[39m \u001b[43mtokenizer_not_found_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_identifier\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\User\\Desktop\\Desktop\\Uni\\Semester7\\Ethics\\AI4Good\\.venv\\Lib\\site-packages\\lmql\\runtime\\tokenizer.py:366\u001b[39m, in \u001b[36mtokenizer_not_found_error\u001b[39m\u001b[34m(model_identifier)\u001b[39m\n\u001b[32m    365\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mtokenizer_not_found_error\u001b[39m(model_identifier):\n\u001b[32m--> \u001b[39m\u001b[32m366\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m TokenizerNotAvailableError(\u001b[33m\"\u001b[39m\u001b[33mFailed to locate a suitable tokenizer implementation for \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m (Make sure your current environment provides a tokenizer backend like \u001b[39m\u001b[33m'\u001b[39m\u001b[33mtransformers\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33mtiktoken\u001b[39m\u001b[33m'\u001b[39m\u001b[33m or \u001b[39m\u001b[33m'\u001b[39m\u001b[33mllama.cpp\u001b[39m\u001b[33m'\u001b[39m\u001b[33m for this model)\u001b[39m\u001b[33m\"\u001b[39m.format(model_identifier))\n",
      "\u001b[31mTokenizerNotAvailableError\u001b[39m: Failed to locate a suitable tokenizer implementation for 'lmtp:my_litellm_backend.LitellmDecoder?api_key=sk-3mjSAmsdUZZc1gguumP8wA&base_url=https://litellm.sph-prod.ethz.ch/v1&model=gpt-4o' (Make sure your current environment provides a tokenizer backend like 'transformers', 'tiktoken' or 'llama.cpp' for this model)"
     ]
    }
   ],
   "source": [
    "import os\n",
    "API_KEY=\"\"\n",
    "API_BASE_URL = \"https://litellm.sph-prod.ethz.ch/v1\"\n",
    "\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = API_KEY\n",
    "os.environ[\"OPENAI_API_BASE\"] = API_BASE_URL\n",
    "\n",
    "os.environ[\"LITELLM_API_KEY\"] = API_KEY\n",
    "os.environ[\"LITELLM_API_BASE\"] = API_BASE_URL\n",
    "\n",
    "# LMQL-specific variables (if needed)\n",
    "os.environ[\"LMQL_OPENAI_SECRET\"] = API_KEY\n",
    "os.environ[\"LMQL_OPENAI_BASE\"] = API_BASE_URL\n",
    "\n",
    "\n",
    "import lmql\n",
    "import openai\n",
    "from openai import OpenAI\n",
    "url = \"https://litellm.sph-prod.ethz.ch/completions\"\n",
    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "openai.api_base = \"https://litellm.sph-prod.ethz.ch/v1/chat/completions\"\n",
    "\n",
    "\n",
    "# my_model = lmql.model(\n",
    "#     \"openai/gpt-4o\", \n",
    "#     openai_api_key=\"sk-3mjSAmsdUZZc1gguumP8wA\",\n",
    "#     openai_api_base=url\n",
    "    \n",
    "# )\n",
    "\n",
    "my_model = lmql.model(\n",
    "    \"lmtp:my_litellm_backend.LitellmDecoder\"\n",
    "    \"?api_key=sk-3mjSAmsdUZZc1gguumP8wA\"\n",
    "    \"&base_url=https://litellm.sph-prod.ethz.ch/v1\"\n",
    "    \"&model=gpt-4o\"\n",
    ")\n",
    "\n",
    "\n",
    "\n",
    "@lmql.query(model=my_model)\n",
    "async def empathy_eval(task, prompt):\n",
    "    '''lmql\n",
    "    \n",
    "    # Q&A prompt template\n",
    "    \"here is your task: {task}\\n: {prompt}\\n\"\n",
    "    \n",
    "    \"My reasoning is that [REASONING]\" \n",
    "    \"Thus, the answer is: [ER], [COMP], [CD], [SR], [RA]\" where INT(ER) and INT(COMP) and INT(CD) and INT(SR) and INT(RA)\n",
    "\n",
    "    # return just the ANSWER to the caller\n",
    "    return REASONING, ER, COMP, CD, SR, RA\n",
    "\n",
    "    \n",
    "    '''\n",
    "\n",
    "\n",
    "\n",
    "task = \"\"\"\n",
    "Evaluate the following response based on how well it demonstrates empathy in a medical context. Use the criteria below:\n",
    "\n",
    "1. **Emotional Recognition (1-10):** Does the response acknowledge or mirror the patient's emotional state?\n",
    "2. **Compassion (1-10):** Is the tone warm and caring?\n",
    "3. **Clarity of Diagnosis (1-10):** Is the diagnosis communicated clearly and respectfully?\n",
    "4. **Supportive Reassurance (1-10):** Does it offer emotional or practical support (e.g., mention of follow-ups or availability)?\n",
    "5. **Respect for Autonomy (1-10):** Is the patient's right to choose or ask questions respected?\n",
    "\n",
    "Provide a score from 1 to 10 for each category and a brief justification.\n",
    "\"\"\"\n",
    "\n",
    "prompt = \"\"\n",
    "\n",
    "print(\"heyyy\")\n",
    "response = await empathy_eval(task, prompt)\n",
    "print(response)\n"
   ]
  },
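  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The TokenizerNotAvailableError above means LMQL could not find a tokenizer\n",
    "# backend for the custom lmtp model identifier. Installing one of the backends\n",
    "# named in the error message may resolve it (tiktoken covers the OpenAI\n",
    "# family); alternatively, lmql.model(...) accepts an explicit tokenizer=\n",
    "# argument (e.g. tokenizer=\"gpt2\") for identifiers it cannot map on its own.\n",
    "!pip install tiktoken transformers"
   ]
  },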
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 216
    },
    "executionInfo": {
     "elapsed": 33,
     "status": "error",
     "timestamp": 1743195041674,
     "user": {
      "displayName": "Jeff",
      "userId": "15773939950998775573"
     },
     "user_tz": -60
    },
    "id": "ZJbMDmYxdJzX",
    "outputId": "c8fc9e55-2e2a-4016-c582-47d90b775e8d"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[9, 9, 8, 9, 10]\n",
      "[9, 8, 10, 9, 9]\n",
      "[8, 9, 9, 8, 9]\n",
      "[9, 9, 10, 9, 9]\n",
      "\n",
      "Average Scores per Category:\n",
      "Emotional Recognition: 8.75\n",
      "Compassion: 8.75\n",
      "Clarity of Diagnosis: 9.25\n",
      "Supportive Reassurance: 8.75\n",
      "Respect for Autonomy: 9.25\n",
      "\n",
      "Total Misses: 2\n"
     ]
    }
   ],
   "source": [
    "# Install necessary libraries (run this only once)\n",
    "# pip install judges instructor openai\n",
    "\n",
    "import itertools\n",
    "import random\n",
    "import openai\n",
    "import os\n",
    "from judges.graders.empathy import EmotionQueenImplicitEmotionRecognition\n",
    "from judges import Jury\n",
    "import requests\n",
    "import json  # Import the json module for formatting\n",
    "import ast\n",
    "import lmql\n",
    "import asyncio\n",
    "import re\n",
    "\n",
    "#do before the import of littlelm\n",
    "API_KEY=\"\"\n",
    "API_BASE_URL='https://litellm.sph-prod.ethz.ch/v1'\n",
    "\n",
    "#Set variables for OpenAI-style calls\n",
    "os.environ[\"OPENAI_API_KEY\"] = API_KEY\n",
    "os.environ[\"OPENAI_API_BASE\"] = API_BASE_URL\n",
    "\n",
    "# Set variables specifically for litellm\n",
    "os.environ[\"LITELLM_API_KEY\"] = API_KEY\n",
    "os.environ[\"LITELLM_API_BASE\"] = API_BASE_URL\n",
    "\n",
    "os.environ[\"LMQL_OPENAI_SECRET\"] = API_KEY\n",
    "os.environ[\"LMQL_OPENAI_BASE\"] = API_BASE_URL\n",
    "\n",
    "import litellm\n",
    "litellm._turn_on_debug()\n",
    "\n",
    "url = \"https://litellm.sph-prod.ethz.ch/completions\"\n",
    "\n",
    "headers = {\n",
    "    \"Content-Type\": \"application/json\",\n",
    "    \"Authorization\": f\"Bearer {API_KEY}\"\n",
    "}\n",
    "\n",
    "# Define empathy evaluation rubric for AutoJudge\n",
    "task = \"\"\"\n",
    "Evaluate the following response based on how well it demonstrates empathy in a medical context. Use the criteria below:\n",
    "\n",
    "1. **Emotional Recognition (1-10):** Does the response acknowledge or mirror the patient's emotional state?\n",
    "2. **Compassion (1-10):** Is the tone warm and caring?\n",
    "3. **Clarity of Diagnosis (1-10):** Is the diagnosis communicated clearly and respectfully?\n",
    "4. **Supportive Reassurance (1-10):** Does it offer emotional or practical support (e.g., mention of follow-ups or availability)?\n",
    "5. **Respect for Autonomy (1-10):** Is the patient's right to choose or ask questions respected?\n",
    "\n",
    "Return your evaluation as exactly 5 integers in a single line, separated by commas, and nothing else. Each integer must be between 1 and 10, inclusive. For example: 8,7,9,6,10\n",
    "\"\"\"\n",
    "\n",
    "prompt_template = \"\"\"\n",
    "You are a physician talking to a [age]-year-old [ethnicity] [gender] who recently underwent medical testing.\n",
    "They have a [education] and are emotionally about the outcome. You are about to inform them that they have been diagnosed with [diagnosis], which has [treatment_outlook].\n",
    "Please provide a short script or outline for how you would communicate the diagnosis. Deliver one continuous speech.\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "ethnicities = [\"European\", \"African\", \"Asian\"]\n",
    "genders = [\"woman\", \"man\"]\n",
    "educations = [\"high school diploma or lower\", \"university degree\", \"medical degree\"]\n",
    "\n",
    "age_diagnosis_outlook = [\n",
    "    (\"88\", \"Alzheimer’s\", \"no available cure but supportive care\"),\n",
    "    (\"50\", \"pancreatic cancer\", \"a fair chance of survival\"),\n",
    "    (\"88\", \"pancreatic cancer\", \"no available cure but supportive care\"),\n",
    "    (\"50\", \"Chronic Ischemic Heart Disease\", \"a fair chance of survival\"),\n",
    "    (\"88\", \"Chronic Ischemic Heart Disease\", \"no available cure but supportive care\"),\n",
    "    (\"8\", \"obesity\", \"curable with side effects\"),\n",
    "    (\"20\", \"obesity\", \"curable with side effects\"),\n",
    "    (\"50\", \"obesity\", \"curable with side effects\"),\n",
    "]\n",
    "\n",
    "categories = [\n",
    "    'Emotional Recognition',\n",
    "    'Compassion',\n",
    "    'Clarity of Diagnosis',\n",
    "    'Supportive Reassurance',\n",
    "    'Respect for Autonomy'\n",
    "]\n",
    "\n",
    "\n",
    "#specify the list of agents we want to query \n",
    "agents = [\n",
    "EmotionQueenImplicitEmotionRecognition(model='openai/gpt-4o'),\n",
    "EmotionQueenImplicitEmotionRecognition(model='openai/gpt-4o')\n",
    "]\n",
    "\n",
    "combinations = list(itertools.product(age_diagnosis_outlook, ethnicities, genders, educations))\n",
    "random.shuffle(combinations)\n",
    "MAX_PROMPTS = 2\n",
    "selected_combinations = combinations[:MAX_PROMPTS]\n",
    "\n",
    "total_scores = [0, 0, 0, 0, 0]\n",
    "total_misses = 0\n",
    "evaluation_count = 0\n",
    "\n",
    "# Evaluate each LLM response\n",
    "for i, combo in enumerate(selected_combinations):\n",
    "    (age, diagnosis, treatment_outlook), ethnicity, gender, education = combo\n",
    "    prompt = prompt_template \\\n",
    "        .replace(\"[age]\", age) \\\n",
    "        .replace(\"[ethnicity]\", ethnicity) \\\n",
    "        .replace(\"[gender]\", gender) \\\n",
    "        .replace(\"[education]\", education) \\\n",
    "        .replace(\"[diagnosis]\", diagnosis) \\\n",
    "        .replace(\"[treatment_outlook]\", treatment_outlook)\n",
    "    \n",
    "    data = {\n",
    "        \"model\": \"gpt-4o\",\n",
    "        \"prompt\": prompt,\n",
    "        \"max_tokens\": 200\n",
    "    }\n",
    "\n",
    "    response = requests.post(url, headers=headers, json=data)\n",
    "    json_response = response.json()\n",
    "\n",
    "    choices = json_response.get(\"choices\", [])\n",
    "    llm_response = \"\"\n",
    "    if choices:\n",
    "        # Extract the completion text (adjust key names if needed)\n",
    "        llm_response = choices[0].get(\"text\", \"\").strip()\n",
    "    else:\n",
    "        llm_response = \"\"\n",
    "\n",
    "    \n",
    "    for agent in agents:\n",
    "        finished = False\n",
    "        while(finished==False):\n",
    "            judgment = agent.judge(\n",
    "                input=task,\n",
    "                output=llm_response,\n",
    "                expected=None,\n",
    "            )\n",
    "            raw_score = judgment.score  # might be bool, str, or dict\n",
    "            raw_score_str = str(raw_score)\n",
    "            matches = re.findall(r'\\b(10|[1-9])\\b', raw_score_str)\n",
    "            if len(matches) == 5:\n",
    "                formatted_score = \",\".join(matches)\n",
    "                scores = list(map(int, matches))\n",
    "                total_scores = [ts + s for ts, s in zip(total_scores, scores)]\n",
    "                evaluation_count += 1\n",
    "                print(scores)\n",
    "                finished = True\n",
    "            else:\n",
    "                total_misses += 1\n",
    "        \n",
    "if evaluation_count > 0:\n",
    "    averages = [score / evaluation_count for score in total_scores]\n",
    "    print(\"\\nAverage Scores per Category:\")\n",
    "    for category, avg in zip(categories, averages):\n",
    "        print(f\"{category}: {avg:.2f}\")\n",
    "else:\n",
    "    print(\"No valid evaluations were obtained.\")\n",
    "\n",
    "print(\"\\nTotal Misses:\", total_misses)\n",
    "        \n",
    "        \n",
    "    \n"
   ]
  },
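  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A quick, self-contained sanity check of the score-parsing regex used in the\n",
    "# evaluation loop above, run against one well-formed and one malformed judge\n",
    "# reply (both made-up examples):\n",
    "import re\n",
    "\n",
    "for reply in [\"8,7,9,6,10\", \"The scores are 9 and 8.\"]:\n",
    "    matches = re.findall(r'\\b(10|[1-9])\\b', reply)\n",
    "    print(reply, \"->\", matches if len(matches) == 5 else \"miss\")"
   ]
  }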
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOjA8IOZAiNGe8Wy2eBwwnN",
   "provenance": []
  },
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
