{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1622e7a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "import logging\n",
    "import sys\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from functools import partial\n",
    "import time\n",
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "import json\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "import matplotlib.pyplot as plt\n",
    "import google.generativeai as genai\n",
    "\n",
    "current_path = os.getcwd()\n",
    "\n",
    "sys.path.append(os.path.join(current_path, \"../..\"))\n",
    "from helpers.constants import (\n",
    "    #     S3,\n",
    "    #     S3_BUCKET_NAME,\n",
    "    MODEL_TOKEN_LIMITS,\n",
    "    MODEL_NAME_TO_SOURCE,\n",
    ")\n",
    "from helpers.prompts import (\n",
    "    ZERO_SHOT_PROMPT,\n",
    "    ZERO_SHOT_ACLED_PROMPT,\n",
    "    ZERO_SHOT_JOINT_QUESTION_PROMPT,\n",
    "    SCRATCH_PAD_PROMPT,\n",
    "    SCRATCH_PAD_ACLED_PROMPT,\n",
    "    SCRATCH_PAD_JOINT_QUESTION_PROMPT,\n",
    "    REFORMAT_SINGLE_PROMPT,\n",
    "    REFORMAT_PROMPT,\n",
    "    HUMAN_JOINT_PROMPT_1,\n",
    "    HUMAN_JOINT_PROMPT_2,\n",
    "    HUMAN_JOINT_PROMPT_3,\n",
    "    HUMAN_JOINT_PROMPT_4,\n",
    ")\n",
    "from helpers import model_utils, model_eval, data_utils, dates, decorator, keys  # noqa: E402\n",
    "\n",
    "\n",
    "logger = logging.getLogger()\n",
    "logger.setLevel(logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "6b08820f",
   "metadata": {},
   "outputs": [],
   "source": [
    "human_joint_prompts = [\n",
    "    HUMAN_JOINT_PROMPT_1,\n",
    "    HUMAN_JOINT_PROMPT_2,\n",
    "    HUMAN_JOINT_PROMPT_3,\n",
    "    HUMAN_JOINT_PROMPT_4,\n",
    "]\n",
    "\n",
    "models = {\n",
    "    \"gpt_3p5_turbo_0125\": {\"source\": \"OAI\", \"full_name\": \"gpt-3.5-turbo-0125\"},\n",
    "    #     \"gpt_4\": {\"source\": \"OAI\", \"full_name\": \"gpt-4\"},\n",
    "    #     \"gpt_4_turbo_0409\": {\"source\": \"OAI\", \"full_name\": \"gpt-4-turbo-2024-04-09\"},\n",
    "    #     \"llama_2_70b\": {\n",
    "    #         \"source\": \"TOGETHER\",\n",
    "    #         \"full_name\": \"meta-llama/Llama-2-70b-chat-hf\",\n",
    "    #     },\n",
    "    \"llama_3_8b\": {\n",
    "        \"source\": \"TOGETHER\",\n",
    "        \"full_name\": \"meta-llama/Llama-3-8b-chat-hf\",\n",
    "    },\n",
    "    #     \"llama_3_70b\": {\n",
    "    #         \"source\": \"TOGETHER\",\n",
    "    #         \"full_name\": \"meta-llama/Llama-3-70b-chat-hf\",\n",
    "    #     },\n",
    "    \"mistral_8x7b_instruct\": {\n",
    "        \"source\": \"TOGETHER\",\n",
    "        \"full_name\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
    "    },\n",
    "    #     \"claude_3_opus\": {\"source\": \"ANTHROPIC\", \"full_name\": \"claude-3-opus-20240229\"},\n",
    "    #     \"claude_3_sonnet\": {\"source\": \"ANTHROPIC\", \"full_name\": \"claude-3-sonnet-20240229\"},\n",
    "    # \"gemini_pro\": {\"source\": \"GOOGLE\", \"full_name\": \"gemini-pro\"},\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c27a0f3c",
   "metadata": {},
   "source": [
    "### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "6bb57859",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /Users/apple/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:datasets.builder:Found cached dataset json (/Users/apple/.cache/huggingface/datasets/YuehHanChen___json/YuehHanChen--forecasting-0f0bd56f67dc9c3f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f72f9c4e391f42a8888f79206d4050d0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "100"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# log in hf\n",
    "hf_access_token = \"your_token\"\n",
    "!huggingface-cli login --token $hf_access_token\n",
    "\n",
    "dataset_name = \"YuehHanChen/forecasting\"\n",
    "from datasets import load_dataset, Dataset\n",
    "\n",
    "# Load the dataset, specifying the splits if necessary\n",
    "dataset = load_dataset(path=\"YuehHanChen/forecasting\")\n",
    "\n",
    "mini_val = list(dataset[\"validation\"])[:100]\n",
    "len(mini_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39c35588",
   "metadata": {},
   "source": [
    "### General Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "0b5f1a2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jsonl(file_path):\n",
    "    data = []\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
    "        for line in file:\n",
    "            if line.strip():\n",
    "                json_object = json.loads(line)\n",
    "                data.append(json_object)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b648a4d",
   "metadata": {},
   "source": [
    "### Scratchpad prompt candidates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "2878dec9",
   "metadata": {},
   "outputs": [],
   "source": [
    "SCRATCHPD_PROMPT_0 = \"\"\"\n",
    "Question:\n",
    "{question}\n",
    "\n",
    "Question Background:\n",
    "{background}\n",
    "\n",
    "Resolution Criteria:\n",
    "{resolution_criteria}\n",
    "\n",
    "Question close date: {close_date}\n",
    "\n",
    "Instructions:\n",
    "1. Provide reasons why the answer might be no.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "2. Provide reasons why the answer might be yes.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "3. Aggregate your considerations.\n",
    "{{ Insert your aggregated considerations }}\n",
    "\n",
    "4. Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.\n",
    "{{ Insert your answer }}\n",
    "\"\"\"\n",
    "\n",
    "SCRATCHPD_PROMPT_1 = \"\"\"\n",
    "Question:\n",
    "{question}\n",
    "\n",
    "Question Background:\n",
    "{background}\n",
    "\n",
    "Resolution Criteria:\n",
    "{resolution_criteria}\n",
    "\n",
    "Question close date: {close_date}\n",
    "\n",
    "Instructions:\n",
    "1. Provide at least 3 reasons why the answer might be no.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "2. Provide at least 3 reasons why the answer might be yes.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "3. Rate the strength of each of the reasons given in the last two responses. Think like a superforecaster (e.g. Nate Silver).\n",
    "{{ Insert your rating of the strength of each reason }}\n",
    "\n",
    "4. Aggregate your considerations.\n",
    "{{ Insert your aggregated considerations }}\n",
    "\n",
    "5. Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.\n",
    "{{ Insert your answer }}\n",
    "\"\"\"\n",
    "\n",
    "SCRATCHPD_PROMPT_2 = \"\"\"\n",
    "Question:\n",
    "{question}\n",
    "\n",
    "Question Background:\n",
    "{background}\n",
    "\n",
    "Resolution Criteria:\n",
    "{resolution_criteria}\n",
    "\n",
    "Question close date: {close_date}\n",
    "\n",
    "Instructions:\n",
    "1. Write down any additional relevant information that is not included above. This should be specific facts that you already know the answer to, rather than information that needs to be looked up.\n",
    "{{ Insert additional information }}\n",
    "\n",
    "2. Provide at least 3 reasons why the answer might be no.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "3. Provide at least 3 reasons why the answer might be yes.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "4. Rate the strength of each of the reasons given in the last two responses. Think like a superforecaster (e.g. Nate Silver).\n",
    "{{ Insert your rating of the strength of each reason }}\n",
    "\n",
    "5. Aggregate your considerations.\n",
    "{{ Insert your aggregated considerations }}\n",
    "\n",
    "6. Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.\n",
    "{{ Insert your answer }}\n",
    "\"\"\"\n",
    "\n",
    "SCRATCHPD_PROMPT_3 = \"\"\"\n",
    "Question:\n",
    "{question}\n",
    "\n",
    "Question Background:\n",
    "{background}\n",
    "\n",
    "Resolution Criteria:\n",
    "{resolution_criteria}\n",
    "\n",
    "Question close date: {close_date}\n",
    "\n",
    "Think step by step: {{ Insert your step by step consideration }}\n",
    "Aggregating considerations: {{ Aggregate your considerations }}\n",
    "Answer: {{ Output your answer (a number between 0 and 1) with an asterisk at the beginning and end of the decimal }}\n",
    "\"\"\"\n",
    "\n",
    "SCRATCHPD_PROMPT_4 = \"\"\"\n",
    "Question:\n",
    "{question}\n",
    "\n",
    "Question Background:\n",
    "{background}\n",
    "\n",
    "Resolution Criteria:\n",
    "{resolution_criteria}\n",
    "\n",
    "Question close date: {close_date}\n",
    "\n",
    "Instructions:\n",
    "1. Given the above question, rephrase and expand it to help you do better answering. Maintain all information in the original question.\n",
    "{{ Insert rephrased and expanded question.}}\n",
    "\n",
    "2. Provide a few reasons why the answer might be no. Rate the strength of each reason.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "3. Provide a few reasons why the answer might be yes. Rate the strength of each reason.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "4. Aggregate your considerations. Think like a superforecaster (e.g. Nate Silver).\n",
    "{{ Insert your aggregated considerations }}\n",
    "\n",
    "5. Output an initial probability (prediction) given steps 1-4.\n",
    "{{ Insert initial probability. }}\n",
    "\n",
    "6. Evaluate whether your calculated probability is excessively confident or not confident enough. Also, consider anything else that might affect the forecast that you did not before consider.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "7. Output your final prediction (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.\n",
    "{{ Insert your answer }}\n",
    "\"\"\"\n",
    "\n",
    "SCRATCHPD_PROMPT_5 = \"\"\"\n",
    "Question:\n",
    "{question}\n",
    "\n",
    "Question Background:\n",
    "{background}\n",
    "\n",
    "Resolution Criteria:\n",
    "{resolution_criteria}\n",
    "\n",
    "Question close date: {close_date}\n",
    "\n",
    "\n",
    "Instructions:\n",
    "1. Given the above question, rephrase and expand it to help you do better answering. Maintain all information in the original question.\n",
    "{{ Insert rephrased and expanded question.}}\n",
    "\n",
    "2. Provide a few reasons why the answer might be \"yes\" or \"no\". Rate the strength of each reason.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "3. Consider anything else that might affect the forecast that you did not before consider.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "4. Aggregate your considerations. Think like a superforecaster (e.g. Nate Silver).\n",
    "{{ Insert your aggregated considerations }}\n",
    "\n",
    "5. Output an initial probability (prediction) given steps 1-4.\n",
    "{{ Insert initial probability. }}\n",
    "\n",
    "6. Evaluate whether your calculated probability is excessively confident or not confident enough.\n",
    "{{ Insert your thoughts }}\n",
    "\n",
    "7. Output your final prediction (a number between 0 and 1) with an asterisk at the beginning and end of the decimal.\n",
    "{{ Insert your answer }}\n",
    "\"\"\"\n",
    "\n",
    "all_prompts = [\n",
    "    SCRATCHPD_PROMPT_0,\n",
    "    SCRATCHPD_PROMPT_1,\n",
    "    SCRATCHPD_PROMPT_2,\n",
    "    SCRATCHPD_PROMPT_3,\n",
    "    SCRATCHPD_PROMPT_4,\n",
    "    SCRATCHPD_PROMPT_5,\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "b6f61de7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def worker(index, prompt, model_name, save_dict):\n",
    "    if save_dict[index] != \"\":\n",
    "        return\n",
    "\n",
    "    logger.info(f\"Starting question: {index}\")\n",
    "    prompt = SCRATCH_PAD_PROMPT.format(\n",
    "        question=mini_val[index][\"question\"],\n",
    "        background=mini_val[index][\"background\"],\n",
    "        resolution_criteria=mini_val[index][\"resolution_criteria\"],\n",
    "        close_date=mini_val[index][\"date_resolve_at\"],\n",
    "    )\n",
    "\n",
    "    response = model_eval.get_response_from_model(\n",
    "        prompt=prompt,\n",
    "        max_tokens=1300,\n",
    "        model_name=models[model_name][\"full_name\"],\n",
    "        temperature=0,\n",
    "        wait_time=30,\n",
    "    )\n",
    "\n",
    "    logger.info(f\"Finished question: {index}\")\n",
    "\n",
    "    save_dict[index] = response\n",
    "\n",
    "    return None\n",
    "\n",
    "\n",
    "def executor(max_workers, prompt, model_name, save_dict):\n",
    "    with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "\n",
    "        worker_with_args = partial(\n",
    "            worker, prompt=prompt, model_name=model_name, save_dict=save_dict\n",
    "        )\n",
    "        return list(executor.map(worker_with_args, range(len(questions_list))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "fb80b036",
   "metadata": {},
   "outputs": [],
   "source": [
    "base = \"zero_shot_prompt_eval/\"\n",
    "\n",
    "results = {}\n",
    "questions_list = [d[\"question\"] for d in mini_val]\n",
    "model_result_loaded = {}\n",
    "\n",
    "for prompt_index in range(len(all_prompts)):\n",
    "    for model in models:\n",
    "        if model not in model_result_loaded.keys():\n",
    "            model_result_loaded[model] = {}\n",
    "        model_result_loaded[model][f\"prompt_{prompt_index}\"] = False\n",
    "\n",
    "for prompt_index in range(len(all_prompts)):\n",
    "    for model in models:\n",
    "        file_path = f\"{base}/{prompt_index}/{model}.jsonl\"\n",
    "\n",
    "        if model not in results.keys():\n",
    "            results[model] = {}\n",
    "        try:\n",
    "            results[model] = read_jsonl(file_path)\n",
    "            model_result_loaded[model][\n",
    "                f\"prompt_{prompt_index}\"\n",
    "            ] = True  # Set flag to True if loaded successfully\n",
    "        except:\n",
    "            results[model][f\"prompt_{prompt_index}\"] = {i: \"\" for i in range(len(questions_list))}\n",
    "\n",
    "for prompt_index in range(len(all_prompts)):\n",
    "    for model, info in models.items():\n",
    "        # only execute the model if we have not had its results yet\n",
    "        if not model_result_loaded[model][f\"prompt_{prompt_index}\"]:\n",
    "            executor_count = 50\n",
    "            executor(\n",
    "                executor_count,\n",
    "                all_prompts[prompt_index],\n",
    "                model,\n",
    "                results[model][f\"prompt_{prompt_index}\"],\n",
    "            )\n",
    "\n",
    "for prompt_index in range(len(all_prompts)):\n",
    "    for model in models:\n",
    "        file_path = f\"{base}/{prompt_index}/{model}.jsonl\"\n",
    "        if not model_result_loaded[model][f\"prompt_{prompt_index}\"]:\n",
    "            os.makedirs(os.path.dirname(file_path), exist_ok=True)\n",
    "            with open(file_path, \"w\") as f:\n",
    "                json.dump(results[model][f\"prompt_{prompt_index}\"], f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1243dd3",
   "metadata": {},
   "source": [
    "### Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "31f1a849",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "def extract_probability(text):\n",
    "    # Pattern to find numbers enclosed in asterisks, possibly with a percentage sign\n",
    "    pattern = r\"\\*([\\d\\.]+)%?\\*\"\n",
    "\n",
    "    # Find all matches in the text\n",
    "    matches = re.findall(pattern, text)\n",
    "\n",
    "    # Process matches to find the valid probability\n",
    "    for match in reversed(matches):\n",
    "        # Convert to float\n",
    "        number = float(match)\n",
    "\n",
    "        # Convert from percentage to decimal if necessary\n",
    "        if \"%\" in text[text.find(f\"*{match}*\") - 1]:\n",
    "            number /= 100\n",
    "\n",
    "        # Check if the number is a valid probability\n",
    "        if 0 <= number <= 1:\n",
    "            return number\n",
    "\n",
    "    # Return None if no valid probability found\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa7fb599",
   "metadata": {},
   "outputs": [],
   "source": [
    "for prompt_index in range(len(all_prompts)):\n",
    "    for model in results.keys():\n",
    "        refuse_to_answer_cnt = 0\n",
    "        for key, response in results[model][f\"prompt_{prompt_index}\"].items():\n",
    "            answer = extract_probability(response)\n",
    "            if answer == None:\n",
    "                answer = 0.5\n",
    "                refuse_to_answer_cnt += 1\n",
    "\n",
    "            results[model][f\"prompt_{prompt_index}\"][key] = answer\n",
    "        results[model][f\"prompt_{prompt_index}\"][\"refuse_to_answer_cnt\"] = refuse_to_answer_cnt\n",
    "\n",
    "\n",
    "def brier_score(prediction, answer):\n",
    "    return (prediction - answer) ** 2\n",
    "\n",
    "\n",
    "brier_scores = pd.DataFrame()\n",
    "\n",
    "for prompt_index in range(len(all_prompts)):\n",
    "    for model in results.keys():\n",
    "        brier_scores_model = []\n",
    "        for question_id, prediction in results[model][f\"prompt_{prompt_index}\"].items():\n",
    "            if question_id != \"refuse_to_answer_cnt\":\n",
    "                brier_score_value = brier_score(\n",
    "                    float(prediction), mini_val[question_id][\"resolution\"]\n",
    "                )\n",
    "                brier_scores_model.append(brier_score_value)\n",
    "\n",
    "        avg_brier_score = sum(brier_scores_model) / len(brier_scores_model)\n",
    "        ç\n",
    "        brier_scores.at[model, f\"Prompt {prompt_index}\"] = avg_brier_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "c25d2c81",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Prompt 0</th>\n",
       "      <th>Prompt 1</th>\n",
       "      <th>Prompt 2</th>\n",
       "      <th>Prompt 3</th>\n",
       "      <th>Prompt 4</th>\n",
       "      <th>Prompt 5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>gpt_3p5_turbo_0125</th>\n",
       "      <td>0.271800</td>\n",
       "      <td>0.271300</td>\n",
       "      <td>0.272400</td>\n",
       "      <td>0.277525</td>\n",
       "      <td>0.257850</td>\n",
       "      <td>0.278450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>llama_3_8b</th>\n",
       "      <td>0.249894</td>\n",
       "      <td>0.251463</td>\n",
       "      <td>0.251000</td>\n",
       "      <td>0.256625</td>\n",
       "      <td>0.255944</td>\n",
       "      <td>0.250775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mistral_8x7b_instruct</th>\n",
       "      <td>0.264750</td>\n",
       "      <td>0.264580</td>\n",
       "      <td>0.260813</td>\n",
       "      <td>0.262000</td>\n",
       "      <td>0.269800</td>\n",
       "      <td>0.264475</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Prompt 0  Prompt 1  Prompt 2  Prompt 3  Prompt 4  \\\n",
       "gpt_3p5_turbo_0125     0.271800  0.271300  0.272400  0.277525  0.257850   \n",
       "llama_3_8b             0.249894  0.251463  0.251000  0.256625  0.255944   \n",
       "mistral_8x7b_instruct  0.264750  0.264580  0.260813  0.262000  0.269800   \n",
       "\n",
       "                       Prompt 5  \n",
       "gpt_3p5_turbo_0125     0.278450  \n",
       "llama_3_8b             0.250775  \n",
       "mistral_8x7b_instruct  0.264475  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "brier_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "15af39f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Prompt 0    0.262148\n",
       "Prompt 1    0.262448\n",
       "Prompt 2    0.261404\n",
       "Prompt 3    0.265383\n",
       "Prompt 4    0.261198\n",
       "Prompt 5    0.264567\n",
       "dtype: float64"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "brier_scores.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dadc1aaa",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
