{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "from getpass import getpass\n",
    "\n",
    "from openai import OpenAI\n",
    "\n",
    "# Judge model name and the OpenAI-compatible endpoint it is requested from.\n",
    "model = \"Qwen/Qwen3-235B-A22B-Instruct-2507\"\n",
    "base_url = \"https://api-inference.modelscope.cn/v1\"\n",
    "\n",
    "# Never hardcode credentials in a notebook: read the key from the environment and\n",
    "# fall back to an interactive prompt so the secret is never saved in the file.\n",
    "# NOTE(review): the key that used to be committed here should be revoked.\n",
    "openai_api_key = os.environ.get(\"MODELSCOPE_API_KEY\") or getpass(\"ModelScope API key: \")\n",
    "\n",
    "# base_url has to be passed explicitly; without it the client falls back to its\n",
    "# default endpoint instead of the one configured above.\n",
    "client = OpenAI(\n",
    "    api_key=openai_api_key,\n",
    "    base_url=base_url,\n",
    ")\n",
    "\n",
    "def chat_completion_request_openai(prompt):\n",
    "    \"\"\"Send `prompt` as a single user message and return the reply text.\n",
    "\n",
    "    The request goes through the module-level `client` using `model` and a\n",
    "    temperature of 1.0.\n",
    "\n",
    "    Returns:\n",
    "        The message content of the first choice, or None when the response\n",
    "        carries no choices.\n",
    "    \"\"\"\n",
    "    response = client.chat.completions.create(\n",
    "        model=model,\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "        temperature=1.0,\n",
    "    )\n",
    "\n",
    "    # Guard clause: nothing to extract when the API returned no choices\n",
    "    if not response.choices:\n",
    "        return None\n",
    "    return response.choices[0].message.content\n",
    "\n",
    "# Load the failure-mode definitions once; the context manager closes the file handle.\n",
    "with open(\"taxonomy_definitions_examples/definitions.txt\", \"r\") as definitions_file:\n",
    "    definitions = definitions_file.read()\n",
    "\n",
    "def openai_evaluator(trace, definitions=definitions, examples=''):\n",
    "    \"\"\"Ask the judge model to annotate one multi-agent trace with failure modes.\n",
    "\n",
    "    The prompt contains the instructions, the 14-item answer template, an example\n",
    "    answer that uses exactly those 14 codes, the trace, the definitions and the\n",
    "    examples. Every prompt segment ends with a newline so that template items,\n",
    "    the trace and the following sections do not run into each other.\n",
    "\n",
    "    Args:\n",
    "        trace: Text of the multi-agent system trace to analyse.\n",
    "        definitions: Failure-mode definitions appended to the prompt\n",
    "            (defaults to the module-level `definitions` text).\n",
    "        examples: Optional failure-mode examples appended after the definitions.\n",
    "\n",
    "    Returns:\n",
    "        Whatever `chat_completion_request_openai` returns for the assembled prompt.\n",
    "    \"\"\"\n",
    "    prompt = (\n",
    "        \"Below I will provide a multiagent system trace. provide me an analysis of the failure modes and inefficiencies as I will say below. \\n\"\n",
    "        \"In the traces, analyze the system behaviour.\\n\"\n",
    "        \"There are several failure modes in multiagent systems I identified. I will provide them below. Tell me if you encounter any of them, as a binary yes or no. \\n\"\n",
    "        \"Also, give me a one sentence (be brief) summary of the problems with the inefficiencies or failure modes in the trace. Only mark a failure mode if you can provide an example of it in the trace, and specify that in your summary at the end\\n\"\n",
    "        \"Also tell me whether the task is successfully completed or not, as a binary yes or no.\\n\"\n",
    "        \"At the very end, I provide you with the definitions of the failure modes and inefficiencies. After the definitions, I will provide you with examples of the failure modes and inefficiencies for you to understand them better.\\n\"\n",
    "        \"Tell me if you encounter any of them between the @@ symbols as I will say below, as a binary yes or no.\\n\"\n",
    "        \"Here are the things you should answer. Start after the @@ sign and end before the next @@ sign (do not include the @@ symbols in your answer):\\n\"\n",
    "        \"*** begin of things you should answer *** @@\\n\"\n",
    "        \"A. Freeform text summary of the problems with the inefficiencies or failure modes in the trace: <summary>\\n\"\n",
    "        \"B. Whether the task is successfully completed or not: <yes or no>\\n\"\n",
    "        \"C. Whether you encounter any of the failure modes or inefficiencies:\\n\"\n",
    "        \"1.1 Disobey Task Specification: <yes or no>\\n\"\n",
    "        \"1.2 Disobey Role Specification: <yes or no>\\n\"\n",
    "        \"1.3 Step Repetition: <yes or no>\\n\"\n",
    "        \"1.4 Loss of Conversation History: <yes or no>\\n\"\n",
    "        \"1.5 Unaware of Termination Conditions: <yes or no>\\n\"\n",
    "        \"2.1 Conversation Reset: <yes or no>\\n\"\n",
    "        \"2.2 Fail to Ask for Clarification: <yes or no>\\n\"\n",
    "        \"2.3 Task Derailment: <yes or no>\\n\"\n",
    "        \"2.4 Information Withholding: <yes or no>\\n\"\n",
    "        \"2.5 Ignored Other Agent's Input: <yes or no>\\n\"\n",
    "        \"2.6 Action-Reasoning Mismatch: <yes or no>\\n\"\n",
    "        \"3.1 Premature Termination: <yes or no>\\n\"\n",
    "        \"3.2 No or Incorrect Verification: <yes or no>\\n\"\n",
    "        \"3.3 Weak Verification: <yes or no>\\n\"\n",
    "        \"@@*** end of your answer ***\\n\"\n",
    "        \"An example answer is: \\n\"\n",
    "        \"A. The task is not completed due to disobeying role specification as agents went rogue and started to chat with each other instead of completing the task. Agents derailed and verifier is not strong enough to detect it.\\n\"\n",
    "        \"B. no \\n\"\n",
    "        \"C. \\n\"\n",
    "        # The example lists exactly the 14 codes of the template above\n",
    "        # (codes 1.6 and 2.7 do not exist in this taxonomy).\n",
    "        \"1.1 no \\n\"\n",
    "        \"1.2 no \\n\"\n",
    "        \"1.3 no \\n\"\n",
    "        \"1.4 no \\n\"\n",
    "        \"1.5 no \\n\"\n",
    "        \"2.1 no \\n\"\n",
    "        \"2.2 no \\n\"\n",
    "        \"2.3 yes \\n\"\n",
    "        \"2.4 no \\n\"\n",
    "        \"2.5 no \\n\"\n",
    "        \"2.6 yes \\n\"\n",
    "        \"3.1 no \\n\"\n",
    "        \"3.2 yes \\n\"\n",
    "        \"3.3 no \\n\"\n",
    "        \"Here is the trace: \\n\"\n",
    "        f\"{trace}\\n\"\n",
    "        \"Also, here are the explanations (definitions) of the failure modes and inefficiencies: \\n\"\n",
    "        f\"{definitions} \\n\"\n",
    "        \"Here are some examples of the failure modes and inefficiencies: \\n\"\n",
    "        f\"{examples}\"\n",
    "    )\n",
    "    return chat_completion_request_openai(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the failure-mode examples once; the context manager closes the file handle.\n",
    "with open(\"taxonomy_definitions_examples/examples.txt\", \"r\") as examples_file:\n",
    "    examples = examples_file.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LLM As a Judge Experiments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'traces/AppWorld/50e1ac9_2.txt'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m trace1 \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtraces/AppWorld/50e1ac9_1.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mread()\n\u001b[0;32m----> 2\u001b[0m trace2 \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtraces/AppWorld/50e1ac9_2.txt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m      4\u001b[0m full_trace_list \u001b[38;5;241m=\u001b[39m [trace1, trace2]\n",
      "File \u001b[0;32m~/anaconda3/envs/my-oasis-xst/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m    317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m    318\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    319\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    320\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    321\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    322\u001b[0m     )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'traces/AppWorld/50e1ac9_2.txt'"
     ]
    }
   ],
   "source": [
    "# Paths of the trace files to evaluate; add more paths here to judge more traces.\n",
    "trace_paths = [\n",
    "    \"traces/AppWorld/50e1ac9_1.txt\",\n",
    "]\n",
    "\n",
    "# Every entry of full_trace_list is read from trace_paths, so the list can never\n",
    "# reference a trace variable that was not loaded.\n",
    "full_trace_list = []\n",
    "for trace_path in trace_paths:\n",
    "    # Context manager closes each file handle after reading\n",
    "    with open(trace_path, \"r\") as trace_file:\n",
    "        full_trace_list.append(trace_file.read())\n",
    "\n",
    "# Kept so that code referring to the first trace by name keeps working\n",
    "trace1 = full_trace_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accumulates the judge's raw responses; filled in by the evaluation loop below.\n",
    "openai_results = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import os\n",
    "import tiktoken\n",
    "\n",
    "dirname = 'saved_results'\n",
    "os.makedirs(dirname, exist_ok=True)\n",
    "\n",
    "# Maximum combined length (in characters) of one trace plus the examples text.\n",
    "# NOTE(review): presumably derived from the provider's input-size limit - confirm.\n",
    "MAX_TRACE_AND_EXAMPLES_CHARS = 1048570\n",
    "\n",
    "\n",
    "def save_results_snapshot(path):\n",
    "    \"\"\"Pickle the current `openai_results` list to `path`.\"\"\"\n",
    "    with open(path, 'wb') as f:\n",
    "        pickle.dump(openai_results, f)\n",
    "\n",
    "\n",
    "# Characters left for the trace once the examples are accounted for (never negative,\n",
    "# so an oversized examples text cannot turn the slice below into a negative index).\n",
    "trace_budget = max(MAX_TRACE_AND_EXAMPLES_CHARS - len(examples), 0)\n",
    "\n",
    "for i, full_trace in enumerate(full_trace_list):\n",
    "    # Truncate a local copy only; full_trace_list itself is left untouched\n",
    "    trace_for_prompt = full_trace[:trace_budget]\n",
    "\n",
    "    succeeded = True\n",
    "    try:\n",
    "        openai_evaluation = openai_evaluator(trace_for_prompt, examples=examples)\n",
    "    except Exception as e:\n",
    "        print(f\"Error on evaluation {i+1}: {str(e)}\")\n",
    "        # A None placeholder keeps openai_results index-aligned with full_trace_list,\n",
    "        # which the per-source analysis relies on.\n",
    "        openai_evaluation = None\n",
    "        succeeded = False\n",
    "\n",
    "    openai_results.append(openai_evaluation)\n",
    "\n",
    "    # Save the current results after each evaluation (also after a failure)\n",
    "    save_results_snapshot(f'{dirname}/o1_results_checkpoint.pkl')\n",
    "\n",
    "    # Optional: Save a backup copy every 10 evaluations\n",
    "    if (i + 1) % 10 == 0:\n",
    "        save_results_snapshot(f'{dirname}/o1_results_backup_{i+1}.pkl')\n",
    "\n",
    "    if succeeded:\n",
    "        print(f\"Completed and saved evaluation {i+1}/{len(full_trace_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias of the judge responses under the name used by the parsing cell below.\n",
    "o1_results = openai_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_responses(responses):\n",
    "    \"\"\"\n",
    "    Parse the LLM responses to extract yes/no answers for each failure mode.\n",
    "\n",
    "    For every response the answer of each mode is located with increasingly\n",
    "    permissive patterns (see `_extract_answer`). Mode codes are regex-escaped and\n",
    "    answers are matched as whole words, so echoed mode names such as\n",
    "    \"Ignored Other Agent's Input\" or \"No or Incorrect Verification\" are not\n",
    "    mistaken for a \"no\" answer.\n",
    "\n",
    "    Args:\n",
    "        responses: List of LLM responses evaluating traces. Entries that are not\n",
    "            strings (for example None) are reported and recorded as 'no' for\n",
    "            every mode.\n",
    "\n",
    "    Returns:\n",
    "        Dictionary mapping failure mode codes to lists of binary values\n",
    "        (0 for no, 1 for yes), one value per response. A mode that cannot be\n",
    "        found in a response is reported with a warning and recorded as 0.\n",
    "    \"\"\"\n",
    "    import re\n",
    "\n",
    "    mode_codes = [\n",
    "        '1.1', '1.2', '1.3', '1.4', '1.5',\n",
    "        '2.1', '2.2', '2.3', '2.4', '2.5', '2.6',\n",
    "        '3.1', '3.2', '3.3',\n",
    "    ]\n",
    "    failure_modes = {mode: [] for mode in mode_codes}\n",
    "\n",
    "    # MULTILINE lets `$` anchor at line ends; DOTALL is only needed by the last-resort pattern\n",
    "    flags = re.IGNORECASE | re.MULTILINE | re.DOTALL\n",
    "\n",
    "    def _extract_answer(text, mode):\n",
    "        \"\"\"Return 1 (yes) or 0 (no) for `mode`, or None when no answer is found.\"\"\"\n",
    "        # Escape the dot and refuse codes embedded in longer numbers (\"11.1\", \"2.1.1\", \"1.10\")\n",
    "        code = rf\"(?<!\\d)(?<!\\d\\.){re.escape(mode)}(?!\\d)(?!\\.\\d)\"\n",
    "        patterns = [\n",
    "            # \"1.1 <name>: yes\" - the answer is the first word after the colon\n",
    "            rf\"{code}[^\\n:]*:\\W*(yes|no)\\b\",\n",
    "            # \"1.1 <name> yes\" - the answer is the last word on the code's line\n",
    "            rf\"{code}[^\\n]*?\\b(yes|no)\\b[^\\w\\n]*$\",\n",
    "            # \"1.1 yes\", or the answer on the following line - answer directly follows the code\n",
    "            rf\"{code}\\W*(yes|no)\\b\",\n",
    "            # Last resort: first standalone yes/no anywhere after the code\n",
    "            rf\"{code}.*?\\b(yes|no)\\b\",\n",
    "        ]\n",
    "        for pattern in patterns:\n",
    "            match = re.search(pattern, text, flags)\n",
    "            if match:\n",
    "                return 1 if match.group(1).lower() == 'yes' else 0\n",
    "        return None\n",
    "\n",
    "    for i, response in enumerate(responses):\n",
    "        if not isinstance(response, str):\n",
    "            # Keep every list aligned with the responses: default to 'no' for all modes\n",
    "            print(f\"Error processing response {i}: expected text, got {type(response).__name__}\")\n",
    "            for mode in mode_codes:\n",
    "                failure_modes[mode].append(0)\n",
    "            continue\n",
    "\n",
    "        # Clean up the response - remove @@ markers if present\n",
    "        cleaned_response = response.strip()\n",
    "        if cleaned_response.startswith('@@'):\n",
    "            cleaned_response = cleaned_response[2:]\n",
    "        if cleaned_response.endswith('@@'):\n",
    "            cleaned_response = cleaned_response[:-2]\n",
    "\n",
    "        for mode in mode_codes:\n",
    "            value = _extract_answer(cleaned_response, mode)\n",
    "            if value is None:\n",
    "                # If all attempts fail, default to 'no'\n",
    "                print(f\"Warning: Could not find mode {mode} in response {i}\")\n",
    "                value = 0\n",
    "            failure_modes[mode].append(value)\n",
    "\n",
    "    return failure_modes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import regex for pattern matching\n",
    "import re\n",
    "\n",
    "# Process the OpenAI evaluation results\n",
    "failure_mode_results_o1 = parse_responses(o1_results)\n",
    "\n",
    "# Print the first few entries of each failure mode to verify\n",
    "for mode, values in failure_mode_results_o1.items():\n",
    "    yes_count = sum(values)\n",
    "    # Guard against an empty result list so the summary cannot divide by zero\n",
    "    yes_percent = round(yes_count / len(values) * 100, 2) if values else 0.0\n",
    "    print(f\"{mode}: {values[:5]} (total yes: {yes_count}/{len(values)}, {yes_percent}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the sizes of each source (consecutive runs of results, in order)\n",
    "source_sizes = [30,30,30]\n",
    "\n",
    "# Calculate the starting index for each source as the running sum of the preceding sizes\n",
    "source_indices = [0]\n",
    "for size in source_sizes[:-1]:\n",
    "    source_indices.append(source_indices[-1] + size)\n",
    "\n",
    "# Dictionary to store average scores for each failure mode across sources\n",
    "average_scores_by_source = {}\n",
    "\n",
    "# Index of the final source, whatever the number of sources configured above\n",
    "last_source = len(source_sizes) - 1\n",
    "\n",
    "# Calculate average scores for each failure mode across the sources\n",
    "for mode, values in failure_mode_results_o1.items():\n",
    "    source_averages = []\n",
    "\n",
    "    # Calculate average for each source\n",
    "    for i, (start_idx, size) in enumerate(zip(source_indices, source_sizes)):\n",
    "        # The last source absorbs any results beyond the configured sizes\n",
    "        end_idx = len(values) if i == last_source else start_idx + size\n",
    "        source_values = values[start_idx:end_idx]\n",
    "\n",
    "        # Check if source_values is not empty to avoid division by zero\n",
    "        if len(source_values) > 0:\n",
    "            avg_score = sum(source_values) / len(source_values)\n",
    "        else:\n",
    "            print(f\"Warning: Source {i} has no values for mode {mode}\")\n",
    "            avg_score = 0\n",
    "\n",
    "        source_averages.append(avg_score)\n",
    "\n",
    "    average_scores_by_source[mode] = source_averages\n",
    "\n",
    "    # Print the average scores for each source\n",
    "    print(f\"{mode}: {[round(score * 100, 2) for score in source_averages]}%\")\n",
    "\n",
    "# Print a sample of the dictionary structure\n",
    "print(\"\\nSample of average_scores_by_source dictionary:\")\n",
    "sample_key = list(average_scores_by_source.keys())[0]\n",
    "print(f\"{sample_key}: {average_scores_by_source[sample_key]}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "my-oasis-xst",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
