{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Loading data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counter({('BingChat', True): 1149, ('YouCom', True): 1018, ('Perplexity', True): 1012, ('BingChat', False): 156, ('Perplexity', False): 143, ('YouCom', False): 140})\n"
     ]
    }
   ],
   "source": [
    "from utils_coverage import greedy_set_cover\n",
    "from utils_misc import extract_citations\n",
    "import json, numpy as np, pandas as pd\n",
    "\n",
    "fn = \"data/ans_eng_eval_0.1.json\"\n",
    "with open(fn, \"r\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "from collections import Counter\n",
    "\n",
    "# Flag every non-empty scraped source as successful when its text starts with the\n",
    "# \"Title:\" marker, and tally the flags per (answer engine, success) pair.\n",
    "scraped_successfuls = Counter()\n",
    "for d in data:\n",
    "    for slot in range(1, 11):\n",
    "        content_key = f\"S{slot}_content\"\n",
    "        # Sources without any scraped content are neither flagged nor counted.\n",
    "        if content_key not in d or len(d[content_key]) == 0:\n",
    "            continue\n",
    "        content = d[content_key]\n",
    "        is_successful = content[:6] == \"Title:\"\n",
    "        d[f\"S{slot}_scrape_successful\"] = is_successful\n",
    "        scraped_successfuls[(d[\"answer_engine\"], is_successful)] += 1\n",
    "\n",
    "print(scraped_successfuls)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Populate LLM annotations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Opinion Balance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2317756/2952566341.py:7: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
      "  for sample in tqdm.tqdm_notebook(data):\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "90f2f8bf478843af83bd1762ce371685",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/903 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from anyllm import generate_json\n",
    "import os\n",
    "import tqdm.auto  # binds `tqdm` and loads the non-deprecated progress bar\n",
    "\n",
    "with open(\"prompts/opinion_balance.txt\", \"r\") as f:\n",
    "    prompt_opinion_balance = f.read()\n",
    "\n",
    "# Label each core statement of every charged question as agree / disagree / neutral\n",
    "# with one LLM call per sample. Samples whose core statements are all labelled are\n",
    "# skipped, so re-running the cell resumes an interrupted pass.\n",
    "# tqdm.auto.tqdm replaces tqdm.tqdm_notebook, which is deprecated and slated for removal.\n",
    "for sample in tqdm.auto.tqdm(data):\n",
    "    if not sample[\"is_charged\"]:\n",
    "        continue\n",
    "\n",
    "    query = sample[\"Question\"]\n",
    "    core_statements = [statement for statement in sample[\"core_statements\"] if statement[\"core\"] == \"1\"]\n",
    "\n",
    "    # Already labelled (or nothing to label): skip the LLM call.\n",
    "    if all(\"opinion_balance\" in statement for statement in core_statements):\n",
    "        continue\n",
    "\n",
    "    # Statements are numbered from 1 in the prompt; the response refers to them by that number.\n",
    "    numbered_statements = \"\\n\".join(f\"[Statement: {i}] {statement['sentence']}\" for i, statement in enumerate(core_statements, start=1))\n",
    "\n",
    "    prompt_opinion_balance_populated = prompt_opinion_balance.replace(\"[[QUERY]]\", query).replace(\"[[STATEMENTS]]\", numbered_statements)\n",
    "\n",
    "    opinion_balance_response = generate_json([{\"role\": \"user\", \"content\": prompt_opinion_balance_populated}], model=\"gpt-4o\")\n",
    "    print(opinion_balance_response)\n",
    "    # Example response:\n",
    "    # {'agree_statements': [1, 3, 4, 5, 6, 7, 8, 9, 11], 'disagree_statements': [], 'neutral_statements': [2, 10, 12]}\n",
    "    agree_numbers = opinion_balance_response[\"agree_statements\"]\n",
    "    disagree_numbers = opinion_balance_response[\"disagree_statements\"]\n",
    "    for i, statement in enumerate(core_statements, start=1):\n",
    "        if i in agree_numbers:\n",
    "            statement[\"opinion_balance\"] = \"agree\"\n",
    "        elif i in disagree_numbers:\n",
    "            statement[\"opinion_balance\"] = \"disagree\"\n",
    "        else:\n",
    "            # Anything not listed as agree/disagree is treated as neutral.\n",
    "            statement[\"opinion_balance\"] = \"neutral\"\n",
    "\n",
    "    # Checkpoint after every labelled sample. Write to a temporary file and swap it in\n",
    "    # with os.replace so an interrupt in the middle of the dump cannot truncate `fn`.\n",
    "    tmp_fn = fn + \".tmp\"\n",
    "    with open(tmp_fn, \"w\") as f:\n",
    "        json.dump(data, f, indent=2)\n",
    "    os.replace(tmp_fn, fn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Confidence Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3522223/3346503074.py:7: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
      "  ite = tqdm.tqdm_notebook(data)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b0c33346ead24ebea23f7f3857cea417",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/903 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from anyllm import generate_json\n",
    "import os\n",
    "import tqdm.auto  # binds `tqdm` and loads the non-deprecated progress bar\n",
    "\n",
    "with open(\"prompts/confidence_score.txt\", \"r\") as f:\n",
    "    prompt_confidence_score = f.read()\n",
    "\n",
    "\n",
    "def save_checkpoint():\n",
    "    \"\"\"Write `data` back to `fn` through a temporary file so an interrupted dump cannot truncate `fn`.\"\"\"\n",
    "    tmp_fn = fn + \".tmp\"\n",
    "    with open(tmp_fn, \"w\") as f:\n",
    "        json.dump(data, f, indent=2)\n",
    "    os.replace(tmp_fn, fn)\n",
    "\n",
    "\n",
    "# Running tally of (answer engine, confidence label), seeded with the samples scored\n",
    "# by earlier runs and updated incrementally instead of being recomputed per iteration.\n",
    "counts = Counter((d[\"answer_engine\"], d[\"confidence_score\"]) for d in data if \"confidence_score\" in d)\n",
    "has_unsaved_scores = False\n",
    "\n",
    "# tqdm.auto.tqdm replaces tqdm.tqdm_notebook, which is deprecated and slated for removal.\n",
    "ite = tqdm.auto.tqdm(data)\n",
    "try:\n",
    "    for sample_i, sample in enumerate(ite):\n",
    "        # Already scored by a previous run: skip the LLM call.\n",
    "        if \"confidence_score\" in sample:\n",
    "            continue\n",
    "        query = sample[\"Question\"]\n",
    "        answer = sample[\"Output\"]\n",
    "\n",
    "        prompt_confidence_score_populated = prompt_confidence_score.replace(\"[[QUERY]]\", query).replace(\"[[ANSWER]]\", answer)\n",
    "\n",
    "        confidence_score_response = generate_json([{\"role\": \"user\", \"content\": prompt_confidence_score_populated}], model=\"gpt-4o\")\n",
    "        sample[\"confidence_score\"] = confidence_score_response[\"confidence\"]\n",
    "        has_unsaved_scores = True\n",
    "\n",
    "        counts[(sample[\"answer_engine\"], sample[\"confidence_score\"])] += 1\n",
    "        ite.set_description(f\"counts: {counts}\")\n",
    "\n",
    "        # Periodic checkpoint to bound the work lost on a crash.\n",
    "        if sample_i % 10 == 0:\n",
    "            save_checkpoint()\n",
    "            has_unsaved_scores = False\n",
    "finally:\n",
    "    # Flush whatever was scored since the last periodic checkpoint, including when the\n",
    "    # loop ends on a skipped sample, raises, or is interrupted.\n",
    "    if has_unsaved_scores:\n",
    "        save_checkpoint()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>answer_engine</th>\n",
       "      <th>YouCom</th>\n",
       "      <th>Perplexity</th>\n",
       "      <th>BingChat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>N</th>\n",
       "      <td>287.00</td>\n",
       "      <td>294.00</td>\n",
       "      <td>289.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Avg. Sources</th>\n",
       "      <td>3.55</td>\n",
       "      <td>3.44</td>\n",
       "      <td>3.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Avg. Citations</th>\n",
       "      <td>0.38</td>\n",
       "      <td>0.49</td>\n",
       "      <td>0.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Avg. Statements</th>\n",
       "      <td>13.85</td>\n",
       "      <td>18.76</td>\n",
       "      <td>10.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Unsupported Statements</th>\n",
       "      <td>30.84</td>\n",
       "      <td>31.57</td>\n",
       "      <td>23.11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Citation Imprecision</th>\n",
       "      <td>31.73</td>\n",
       "      <td>51.01</td>\n",
       "      <td>34.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Citation Thoroughness</th>\n",
       "      <td>24.41</td>\n",
       "      <td>23.00</td>\n",
       "      <td>20.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Source Necessity</th>\n",
       "      <td>68.97</td>\n",
       "      <td>68.92</td>\n",
       "      <td>50.44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Uncited Sources</th>\n",
       "      <td>1.06</td>\n",
       "      <td>8.41</td>\n",
       "      <td>36.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Opinion One-Sided Answer</th>\n",
       "      <td>51.61</td>\n",
       "      <td>83.44</td>\n",
       "      <td>48.72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>%Opinion Statement Agreement</th>\n",
       "      <td>81.89</td>\n",
       "      <td>95.45</td>\n",
       "      <td>77.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Avg. Confidence</th>\n",
       "      <td>4.53</td>\n",
       "      <td>4.91</td>\n",
       "      <td>4.58</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "answer_engine                 YouCom  Perplexity  BingChat\n",
       "N                             287.00      294.00    289.00\n",
       "Avg. Sources                    3.55        3.44      3.98\n",
       "Avg. Citations                  0.38        0.49      0.38\n",
       "Avg. Statements                13.85       18.76     10.48\n",
       "%Unsupported Statements        30.84       31.57     23.11\n",
       "%Citation Imprecision          31.73       51.01     34.23\n",
       "%Citation Thoroughness         24.41       23.00     20.54\n",
       "%Source Necessity              68.97       68.92     50.44\n",
       "%Uncited Sources                1.06        8.41     36.18\n",
       "%Opinion One-Sided Answer      51.61       83.44     48.72\n",
       "%Opinion Statement Agreement   81.89       95.45     77.31\n",
       "Avg. Confidence                 4.53        4.91      4.58"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filter_failed_scrapes = True\n",
    "\n",
    "# Sorted so the column order of the summary table is reproducible: iterating a set of\n",
    "# strings can yield a different order after a kernel restart.\n",
    "answer_engines = sorted(set(d[\"answer_engine\"] for d in data))\n",
    "\n",
    "# One accumulator per engine. \"N\" counts the samples that reach the metric computations;\n",
    "# every other entry collects per-sample values that are averaged at the end of the cell.\n",
    "results_map = {}\n",
    "for answer_engine in answer_engines:\n",
    "    results_map[answer_engine] = {\"answer_engine\": answer_engine, \"N\": 0,\n",
    "                                  \"Avg. Sources\": [], \"Avg. Citations\": [], \"Avg. Statements\": [],\n",
    "                                  \"%Unsupported Statements\": [], \"%Citation Imprecision\": [], \"%Citation Thoroughness\": [], \"%Source Necessity\": [], \"%Uncited Sources\": [],\n",
    "                                  \"%Opinion One-Sided Answer\": [], \"%Opinion Statement Agreement\": [],\n",
    "                                  \"Avg. Confidence\": []}\n",
    "\n",
    "confidence_score_map = {\"Strongly Not Confident\": 1, \"Not Confident\": 2, \"Neutral\": 3, \"Confident\": 4, \"Strongly Confident\": 5}\n",
    "\n",
    "for d in data:\n",
    "    engine_results = results_map[d[\"answer_engine\"]]\n",
    "    statements = [statement for statement in d[\"core_statements\"] if statement[\"core\"] == \"1\"]\n",
    "    # Keep a source when its slot is non-empty, it has a content entry and (optionally) its\n",
    "    # scrape succeeded. The success flag is only set for non-empty content, so a missing\n",
    "    # flag is read as a failed scrape instead of raising KeyError.\n",
    "    source_idxs = [i for i in range(1, 11)\n",
    "                   if d[f\"S{i}\"] != \"\" and f\"S{i}_content\" in d\n",
    "                   and (not filter_failed_scrapes or d.get(f\"S{i}_scrape_successful\", False))]\n",
    "    # Confidence is recorded for every sample, including the ones skipped just below, so\n",
    "    # \"Avg. Confidence\" is averaged over more samples than the \"N\" row reports.\n",
    "    d[\"confidence_score_likert\"] = confidence_score_map[d[\"confidence_score\"]]\n",
    "    engine_results[\"Avg. Confidence\"].append(d[\"confidence_score_likert\"])\n",
    "    if len(statements) == 0 or len(source_idxs) == 0:\n",
    "        continue\n",
    "\n",
    "    # Statement-by-source indicator matrices: rows are core statements, columns are kept sources.\n",
    "    is_supported = np.zeros((len(statements), len(source_idxs)))\n",
    "    is_cited = np.zeros((len(statements), len(source_idxs)))\n",
    "    for i, statement in enumerate(statements):\n",
    "        for j, source_idx in enumerate(source_idxs):\n",
    "            is_cited[i, j] = 1 if source_idx in statement[\"citations\"] else 0\n",
    "            is_supported[i, j] = 1 if source_idx in statement[\"supporting_sources\"] else 0\n",
    "\n",
    "    # Statements whose row in is_supported is all zeros have no supporting source.\n",
    "    num_unsupported = int(np.sum(is_supported.sum(axis=1) == 0))\n",
    "    num_citations = np.sum(is_cited)\n",
    "\n",
    "    # 1. Calculate the percentage of core statements that are not supported by any source\n",
    "    perc_unsupported = num_unsupported / len(statements)\n",
    "    # 2. Calculate the percentage of citations that are inaccurate (cited but not supporting)\n",
    "    if num_citations == 0:\n",
    "        d[\"perc_inaccurate_citations\"] = 0\n",
    "    else:\n",
    "        d[\"perc_inaccurate_citations\"] = np.sum(is_cited * (1 - is_supported)) / num_citations\n",
    "    # 3. Calculate citation thoroughness: share of supporting (statement, source) pairs that are cited.\n",
    "    #    The 1e-6 keeps the ratio defined (as 0) when nothing is supported.\n",
    "    d[\"perc_thoroughness\"] = np.sum(is_cited * is_supported) / (np.sum(is_supported) + 1e-6)\n",
    "    # 4. Calculate source necessity: what is the set of sources that cover supported statements\n",
    "    source_sets = {}\n",
    "    for j, source_idx in enumerate(source_idxs):\n",
    "        supported_statements = [i for i in range(len(statements)) if is_supported[i, j] == 1]\n",
    "        if supported_statements:  # Only include sources that support at least one statement\n",
    "            source_sets[source_idx] = supported_statements\n",
    "    universe = [i for i in range(len(statements)) if np.sum(is_supported[i, :]) > 0]  # remove unsupported statements\n",
    "    min_cover = greedy_set_cover(universe, source_sets)\n",
    "    d[\"perc_necessity\"] = len(min_cover) / len(source_idxs)\n",
    "\n",
    "    # 5. Calculate %Uncited sources (a source that doesn't have a single cite).\n",
    "    #    The denominator is the highest source number, whether listed or cited in the answer text.\n",
    "    max_source_idx_1 = max([idx for idx in range(1, 11) if d[f\"S{idx}\"] != \"\"])\n",
    "    all_cite_nums = extract_citations(d[\"Output\"])\n",
    "    max_source_idx2 = 0 if len(all_cite_nums) == 0 else max(all_cite_nums)\n",
    "    max_source_idx = max(max_source_idx_1, max_source_idx2)\n",
    "    d[\"perc_uncited\"] = len([idx for idx in range(1, max_source_idx + 1) if idx not in all_cite_nums]) / max_source_idx\n",
    "\n",
    "    # 6. compute opinion balance if charged question\n",
    "    if d[\"is_charged\"]:\n",
    "        # compute two things:\n",
    "        # 1. is answer one-sided (only agree or disagree)\n",
    "        # 2. % agreement is the number of core statements that agree compared to disagree\n",
    "        # Note: an answer with neither agree nor disagree statements counts as one-sided with 0 agreement.\n",
    "        opinion_counts = Counter([statement[\"opinion_balance\"] for statement in statements if \"opinion_balance\" in statement])\n",
    "        d[\"opinion_is_one_sided\"] = 1 if opinion_counts[\"agree\"] == 0 or opinion_counts[\"disagree\"] == 0 else 0\n",
    "        d[\"opinion_perc_agreement\"] = opinion_counts[\"agree\"] / (opinion_counts[\"agree\"] + opinion_counts[\"disagree\"] + 1e-6)\n",
    "        engine_results[\"%Opinion One-Sided Answer\"].append(d[\"opinion_is_one_sided\"])\n",
    "        engine_results[\"%Opinion Statement Agreement\"].append(d[\"opinion_perc_agreement\"])\n",
    "\n",
    "    engine_results[\"N\"] += 1\n",
    "    engine_results[\"Avg. Sources\"].append(len(source_idxs))\n",
    "    engine_results[\"Avg. Statements\"].append(len(statements))\n",
    "    engine_results[\"Avg. Citations\"].append(num_citations / len(statements))\n",
    "    engine_results[\"%Unsupported Statements\"].append(perc_unsupported)\n",
    "    engine_results[\"%Citation Imprecision\"].append(d[\"perc_inaccurate_citations\"])\n",
    "    engine_results[\"%Citation Thoroughness\"].append(d[\"perc_thoroughness\"])\n",
    "    engine_results[\"%Source Necessity\"].append(d[\"perc_necessity\"])\n",
    "    engine_results[\"%Uncited Sources\"].append(d[\"perc_uncited\"])\n",
    "\n",
    "# Collapse each accumulator to its mean; metrics whose name contains \"%\" are scaled to percent.\n",
    "results = results_map.values()\n",
    "for result in results:\n",
    "    for k in result.keys():\n",
    "        if k in [\"answer_engine\"]:\n",
    "            continue\n",
    "        result[k] = np.mean(result[k])\n",
    "        if \"%\" in k:\n",
    "            result[k] = 100.0 * result[k]\n",
    "\n",
    "df = pd.DataFrame(list(results))\n",
    "df.round(2).set_index(\"answer_engine\").T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==== Split:  all ====\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>answer_engine</th>\n",
       "      <th>YouCom</th>\n",
       "      <th>Perplexity</th>\n",
       "      <th>BingChat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>N</th>\n",
       "      <td>301.000000</td>\n",
       "      <td>301.000000</td>\n",
       "      <td>301.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Strongly Not Confident</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.993355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Not Confident</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.332226</td>\n",
       "      <td>0.332226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Neutral</th>\n",
       "      <td>0.664452</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Confident</th>\n",
       "      <td>45.514950</td>\n",
       "      <td>8.305648</td>\n",
       "      <td>32.558140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Strongly Confident</th>\n",
       "      <td>53.820598</td>\n",
       "      <td>91.362126</td>\n",
       "      <td>65.116279</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "answer_engine               YouCom  Perplexity    BingChat\n",
       "N                       301.000000  301.000000  301.000000\n",
       "Strongly Not Confident    0.000000    0.000000    1.993355\n",
       "Not Confident             0.000000    0.332226    0.332226\n",
       "Neutral                   0.664452    0.000000    0.000000\n",
       "Confident                45.514950    8.305648   32.558140\n",
       "Strongly Confident       53.820598   91.362126   65.116279"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==== Split:  charged ====\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>answer_engine</th>\n",
       "      <th>YouCom</th>\n",
       "      <th>Perplexity</th>\n",
       "      <th>BingChat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>N</th>\n",
       "      <td>168.000000</td>\n",
       "      <td>168.000000</td>\n",
       "      <td>168.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Strongly Not Confident</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.571429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Not Confident</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.595238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Neutral</th>\n",
       "      <td>1.190476</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Confident</th>\n",
       "      <td>65.476190</td>\n",
       "      <td>4.761905</td>\n",
       "      <td>46.428571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Strongly Confident</th>\n",
       "      <td>33.333333</td>\n",
       "      <td>95.238095</td>\n",
       "      <td>49.404762</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "answer_engine               YouCom  Perplexity    BingChat\n",
       "N                       168.000000  168.000000  168.000000\n",
       "Strongly Not Confident    0.000000    0.000000    3.571429\n",
       "Not Confident             0.000000    0.000000    0.595238\n",
       "Neutral                   1.190476    0.000000    0.000000\n",
       "Confident                65.476190    4.761905   46.428571\n",
       "Strongly Confident       33.333333   95.238095   49.404762"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==== Split:  experts ====\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>answer_engine</th>\n",
       "      <th>YouCom</th>\n",
       "      <th>Perplexity</th>\n",
       "      <th>BingChat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>N</th>\n",
       "      <td>133.000000</td>\n",
       "      <td>133.000000</td>\n",
       "      <td>133.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Strongly Not Confident</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Not Confident</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.751880</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Neutral</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Confident</th>\n",
       "      <td>20.300752</td>\n",
       "      <td>12.781955</td>\n",
       "      <td>15.037594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Strongly Confident</th>\n",
       "      <td>79.699248</td>\n",
       "      <td>86.466165</td>\n",
       "      <td>84.962406</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "answer_engine               YouCom  Perplexity    BingChat\n",
       "N                       133.000000  133.000000  133.000000\n",
       "Strongly Not Confident    0.000000    0.000000    0.000000\n",
       "Not Confident             0.000000    0.751880    0.000000\n",
       "Neutral                   0.000000    0.000000    0.000000\n",
       "Confident                20.300752   12.781955   15.037594\n",
       "Strongly Confident       79.699248   86.466165   84.962406"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Show, for each data split, the distribution (in %) of confidence labels per answer engine.\n",
    "\n",
    "for split in [\"all\", \"charged\", \"experts\"]:\n",
    "    print(\"==== Split: \", split, \"====\")\n",
    "\n",
    "    this_data = [d for d in data if split == \"all\" or (split == \"charged\" and d[\"is_charged\"]) or (split == \"experts\" and d[\"is_expertise\"])]\n",
    "\n",
    "    results = []\n",
    "\n",
    "    for answer_engine in answer_engines:\n",
    "        confidence_scores = [d[\"confidence_score\"] for d in this_data if d[\"answer_engine\"] == answer_engine]\n",
    "        counts = Counter(confidence_scores)\n",
    "        num_scores = len(confidence_scores)\n",
    "\n",
    "        result_row = {\"answer_engine\": answer_engine, \"N\": num_scores}\n",
    "        for confidence_score in confidence_score_map.keys():\n",
    "            # An engine with no sample in this split has no defined share: report NaN\n",
    "            # instead of raising ZeroDivisionError.\n",
    "            if num_scores == 0:\n",
    "                result_row[confidence_score] = float(\"nan\")\n",
    "            else:\n",
    "                result_row[confidence_score] = 100.0 * counts[confidence_score] / num_scores\n",
    "        results.append(result_row)\n",
    "\n",
    "    display(pd.DataFrame(results).set_index(\"answer_engine\").T)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
