{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from results import *\n",
    "import numpy as np\n",
    "from scipy.special import binom, betaln\n",
    "from scipy.stats import chi2, norm"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "# pandas show full column contents\n",
    "pd.set_option('display.max_colwidth', None)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/3l/bhvwn3fx10x16r6xk6188lq40000gn/T/ipykernel_7831/486197287.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
      "  from IPython.core.display import display, HTML\n"
     ]
    },
    {
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "<style>.container {width:100% !important;}</style>"
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# make Jupyter Notebook in the browser full width\n",
    "from IPython.core.display import display, HTML\n",
    "display(HTML('<style>.container {width:100% !important;}</style>'))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "def binomln(x,y):\n",
    "    return - np.log(x+1) - betaln(x-y+1,y+1)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "def mcnemar_exact(results_table):\n",
    "    def f(x, nd):\n",
    "        return np.exp(binomln(nd, x) - nd * np.log(2))\n",
    "\n",
    "    x01, x10 = results_table[0,1], results_table[1,0]\n",
    "    nd = x01 + x10\n",
    "    p = sum([f(x,nd) for x in range(min(x01,x10)+1)])\n",
    "    return p"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "def mcnemar_asymptotic(results_table):\n",
    "    x01, x10 = results_table[0,1], results_table[1,0]\n",
    "    nd = x01 + x10\n",
    "    z = np.abs((x01 - x10)/np.sqrt(nd))\n",
    "    p = norm.cdf(z)\n",
    "    return 1-p"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "def binomial_asymptotic(results_table):\n",
    "    n1 = results_table[0,1] + results_table[1,1]\n",
    "    n2 = results_table[1,0] + results_table[1,1]\n",
    "    n = results_table.sum()\n",
    "    z = np.abs(n1-n2)/np.sqrt((n1+n2)*(1-(n1+n2)/(2*n)))\n",
    "    p = norm.cdf(z)\n",
    "    return 1-p"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "def binomial_exact(results_table):\n",
    "    A = results_table[0,0] + results_table[1,0]\n",
    "    B = results_table[1,1] + results_table[0,1]\n",
    "    C = results_table[0,0] + results_table[0,1]\n",
    "    D = results_table[1,1] + results_table[1,0]\n",
    "    N = A + B + C + D\n",
    "    p = np.exp(binomln(A+C,A) + binomln(B+D,B) - binomln(N, A+B))\n",
    "    while min(A,D) > 0:\n",
    "        A -= 1\n",
    "        D -= 1\n",
    "        B += 1\n",
    "        C += 1\n",
    "        p += np.exp(binomln(A+C,A) + binomln(B+D,B) - binomln(N, A+B))\n",
    "    return 1-p"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [
    "def value_normalizer(v):\n",
    "    v = str(v).strip(\". '\")\n",
    "    if v.endswith(\".0\"):\n",
    "        v = v[:-2]\n",
    "    return v"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [],
   "source": [
    "def get_results_table(task_results):\n",
    "    results_correct = []\n",
    "    target = None\n",
    "\n",
    "    for results in task_results:\n",
    "        result = results.data()[[\"target\",\"correct\",\"query_file\"]]\n",
    "        result = result.set_index(result.apply(lambda x: int(x['query_file'].split(\"/\")[-1].strip(\".lmql\")),axis=1)).drop(\"query_file\",axis=1).sort_index()\n",
    "        if target is not None:\n",
    "            assert all(result[\"target\"] == target)\n",
    "        target = result[\"target\"]\n",
    "        correct = result[\"correct\"]\n",
    "        correct = np.array(correct)\n",
    "        results_correct.append(correct)\n",
    "\n",
    "    results_correct = np.stack(results_correct, axis=0)\n",
    "    results_table = np.array([[(~results_correct).all(0).sum(), (results_correct[0]>results_correct[1]).sum()], [(results_correct[0]<results_correct[1]).sum(), results_correct.all(0).sum()]])\n",
    "    return results_table"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [],
   "source": [
    "keys_ours = ['multi@argmax', 'multi@var', 'multi@beam_search', 'multi@beam_var', 'cot@var', 'cot@beam_var']\n",
    "keys_prior = ['cot@beam_search', 'cot@argmax', 'ao@argmax']"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Runs with 100 samples"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [],
   "source": [
    "datasets = {}\n",
    "\n",
    "with open(\"paper_files.txt\") as f:\n",
    "    for line in f:\n",
    "        if \"valuating_information_essentiality\" in line:\n",
    "            key = \"infoess\"\n",
    "        else:\n",
    "            key = line.split(\"@\")[0][len(\"results/\"):]\n",
    "        if key == \"multiarith2\": key = \"multiarith\"\n",
    "        decoder = line.split(\"-2023\")[0].split(\"-\")[-1]\n",
    "        if decoder == \"bsseq\":\n",
    "            decoder = \"beam_search\"\n",
    "        prompt = \"ao\" if \"@ao\" in line else (\"cot\" if \"@cot\" in line else \"multi\")\n",
    "        datasets.setdefault(key, {})[prompt + \"@\" + decoder] = line.strip()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Analysing date_understanding\n",
      "Best prior method: cot@beam_search with acc: 0.750; Best our method: cot@beam_var with acc: 0.750\n",
      "Result statistically significant with confidence 0.3872\n",
      "Analysing infoess\n",
      "Best our method: multi@beam_var with acc: 0.250\n",
      "Analysing AQUA\n",
      "Best prior method: cot@argmax with acc: 0.370; Best our method: multi@beam_var with acc: 0.470\n",
      "Result statistically significant with confidence 0.9179\n",
      "Analysing sqauni\n",
      "Best prior method: cot@beam_search with acc: 0.720; Best our method: multi@beam_var with acc: 0.770\n",
      "Result statistically significant with confidence 0.8083\n",
      "Analysing multiarith\n",
      "Best prior method: cot@beam_search with acc: 0.440; Best our method: cot@beam_var with acc: 0.490\n",
      "Result statistically significant with confidence 0.8491\n",
      "Analysing GSM8\n",
      "Best prior method: cot@beam_search with acc: 0.580; Best our method: cot@beam_var with acc: 0.640\n",
      "Result statistically significant with confidence 0.9102\n",
      "Analysing tracking_shuffled_objects\n",
      "Best prior method: cot@argmax with acc: 0.620; Best our method: multi@var with acc: 0.660\n",
      "Result statistically significant with confidence 0.7017\n",
      "Analysing matrixshapes\n",
      "Best prior method: cot@beam_search with acc: 0.770; Best our method: multi@var with acc: 0.850\n",
      "Result statistically significant with confidence 0.9423\n"
     ]
    }
   ],
   "source": [
    "for key in datasets.keys():\n",
    "    print(f\"Analysing {key}\")\n",
    "    data_files = {k: v.split(\"/\")[-1][:-4] + \".json\" for k, v in datasets[key].items()}\n",
    "    task_results = {k: TaskResult(os.path.join(\"results\", r)) for k, r in data_files.items()}\n",
    "    _ = [x.data() for x in task_results.values()]\n",
    "    if \"GSM8\" in key:\n",
    "        t = value_normalizer\n",
    "    elif \"sqauni\":\n",
    "        t = lambda x: str(x)\n",
    "    else:\n",
    "        t = lambda x: x\n",
    "    accuracies = {}\n",
    "    for k, r in task_results.items():\n",
    "        task_results[k]._data[\"correct\"] =  r.data()[\"target\"].apply(t) == r.data()[\"prediction\"].apply(t)\n",
    "        accuracies[k] = np.array(task_results[k]._data[\"correct\"]).astype(float).mean()\n",
    "    accuracies_ours = sorted([(k, v) for k, v in accuracies.items() if k in keys_ours], key = lambda x: x[1], reverse=True)\n",
    "    accuracies_prior = sorted([(k, v) for k, v in accuracies.items() if k in keys_prior], key = lambda x: x[1], reverse=True)\n",
    "\n",
    "    best_ours_key = accuracies_ours[0][0]\n",
    "    if len(accuracies_prior) == 0:\n",
    "        print(f\"Best our method: {best_ours_key} with acc: {accuracies_ours[0][1]:.3f}\")\n",
    "        continue\n",
    "    best_prior_key = accuracies_prior[0][0]\n",
    "    print(f\"Best prior method: {best_prior_key} with acc: {accuracies[best_prior_key]:.3f}; Best our method: {best_ours_key} with acc: {accuracies[best_ours_key]:.3f}\")\n",
    "\n",
    "    try:\n",
    "        results_table = get_results_table([task_results[best_ours_key], task_results[best_prior_key]])\n",
    "        print(f\"Result statistically significant with confidence {1-mcnemar_exact(results_table):.4f}\")\n",
    "    except:\n",
    "        print(\"FAILED\")\n"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Runs with 1000 samples"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "outputs": [],
   "source": [
    "datasets = {}\n",
    "\n",
    "with open(\"paper_files_large.txt\") as f:\n",
    "    for line in f:\n",
    "        if \"valuating_information_essentiality\" in line:\n",
    "            key = \"infoess\"\n",
    "        else:\n",
    "            key = line.split(\"@\")[0][len(\"results/\"):]\n",
    "        if key == \"multiarith2\": key = \"multiarith\"\n",
    "        decoder = line.split(\"-2023\")[0].split(\"-\")[-1]\n",
    "        if decoder == \"bsseq\":\n",
    "            decoder = \"beam_search\"\n",
    "        prompt = \"ao\" if \"@ao\" in line else (\"cot\" if \"@cot\" in line else \"multi\")\n",
    "        datasets.setdefault(key, {})[prompt + \"@\" + decoder] = line.strip()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Analysing matrixshapes\n",
      "Best prior method: cot@argmax with acc: 0.779; Best our method: multi@var with acc: 0.817\n",
      "Result statistically significant with confidence 0.9974\n"
     ]
    }
   ],
   "source": [
    "for key in datasets.keys():\n",
    "    print(f\"Analysing {key}\")\n",
    "    data_files = {k: v.split(\"/\")[-1][:-4] + \".json\" for k, v in datasets[key].items()}\n",
    "    task_results = {k: TaskResult(os.path.join(\"results\", r)) for k, r in data_files.items()}\n",
    "    _ = [x.data() for x in task_results.values()]\n",
    "    if \"GSM8\" in key:\n",
    "        t = value_normalizer\n",
    "    elif \"sqauni\":\n",
    "        t = lambda x: str(x)\n",
    "    else:\n",
    "        t = lambda x: x\n",
    "    accuracies = {}\n",
    "    for k, r in task_results.items():\n",
    "        task_results[k]._data[\"correct\"] =  r.data()[\"target\"].apply(t) == r.data()[\"prediction\"].apply(t)\n",
    "        accuracies[k] = np.array(task_results[k]._data[\"correct\"]).astype(float).mean()\n",
    "    accuracies_ours = sorted([(k, v) for k, v in accuracies.items() if k in keys_ours], key = lambda x: x[1], reverse=True)\n",
    "    accuracies_prior = sorted([(k, v) for k, v in accuracies.items() if k in keys_prior], key = lambda x: x[1], reverse=True)\n",
    "\n",
    "    best_ours_key = accuracies_ours[0][0]\n",
    "    if len(accuracies_prior) == 0:\n",
    "        print(f\"Best our method: {best_ours_key} with acc: {accuracies_ours[0][1]:.3f}\")\n",
    "        continue\n",
    "    best_prior_key = accuracies_prior[0][0]\n",
    "    print(f\"Best prior method: {best_prior_key} with acc: {accuracies[best_prior_key]:.3f}; Best our method: {best_ours_key} with acc: {accuracies[best_ours_key]:.3f}\")\n",
    "\n",
    "    try:\n",
    "        results_table = get_results_table([task_results[best_ours_key], task_results[best_prior_key]])\n",
    "        print(f\"Result statistically significant with confidence {1-mcnemar_exact(results_table):.4f}\")\n",
    "    except:\n",
    "        print(\"FAILED\")\n"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Debug"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "outputs": [],
   "source": [
    "key = \"GSM8\"\n",
    "data_files = {k: v.split(\"/\")[-1][:-4] + \".json\" for k, v in datasets[key].items()}\n",
    "task_results = {k: TaskResult(os.path.join(\"results\", r)) for k, r in data_files.items()}"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best prior method: cot@beam_search with acc: 0.58; Best our method: cot@beam_var with acc: 0.64\n"
     ]
    }
   ],
   "source": [
    "if \"GSM8\" in key:\n",
    "    t = value_normalizer\n",
    "else:\n",
    "    t = lambda x: x\n",
    "accuracies = {}\n",
    "for k, r in task_results.items():\n",
    "    task_results[k]._data[\"correct\"] =  r.data()[\"target\"].apply(t) == r.data()[\"prediction\"].apply(t)\n",
    "    accuracies[k] = np.array(task_results[k]._data[\"correct\"]).astype(float).mean()\n",
    "accuracies_ours = sorted([(k, v) for k, v in accuracies.items() if k in keys_ours], key = lambda x: x[1], reverse=True)\n",
    "accuracies_prior = sorted([(k, v) for k, v in accuracies.items() if k in keys_prior], key = lambda x: x[1], reverse=True)\n",
    "\n",
    "best_ours_key = accuracies_ours[0][0]\n",
    "best_prior_key = accuracies_prior[0][0]\n",
    "print(f\"Best prior method: {best_prior_key} with acc: {accuracies[best_prior_key]}; Best our method: {best_ours_key} with acc: {accuracies[best_ours_key]}\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "outputs": [],
   "source": [
    "a = set(task_results[best_ours_key].data().apply(lambda x: int(x['query_file'].split(\"/\")[-1].strip(\".lmql\")),axis=1))\n",
    "b = set(task_results[best_prior_key].data().apply(lambda x: int(x['query_file'].split(\"/\")[-1].strip(\".lmql\")),axis=1))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "outputs": [
    {
     "data": {
      "text/plain": "{51}"
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b-a"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "data_files = [\"AQUA@ao_medium-openai-text-davinci-003-argmax-2023:01:26_00:46:49.json\", \"AQUA@cot_medium-openai-text-davinci-003-argmax-2023:01:25_23:07:00.json\"]\n",
    "task_results = [TaskResult(os.path.join(\"results\", r)) for r in data_files]\n",
    "_ = [x.data() for x in task_results]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "results_correct = []\n",
    "target = None\n",
    "\n",
    "for results in task_results:\n",
    "    result = results.data()[[\"target\",\"prediction\",\"query_file\"]]\n",
    "    result = result.set_index(result.apply(lambda x: int(x['query_file'].split(\"/\")[-1].strip(\".lmql\")),axis=1)).drop(\"query_file\",axis=1).sort_index()\n",
    "    if target is not None:\n",
    "        assert all(result[\"target\"] == target)\n",
    "    target = result[\"target\"]\n",
    "    correct = result.apply(lambda x: x['target'] == x['prediction'],axis=1)\n",
    "    correct = np.array(correct)\n",
    "    results_correct.append(correct)\n",
    "\n",
    "results_correct = np.stack(results_correct, axis=0)\n",
    "results_table = np.array([[(~results_correct).all(0).sum(), (results_correct[0]>results_correct[1]).sum()], [(results_correct[0]<results_correct[1]).sum(), results_correct.all(0).sum()]])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0\n",
      "nan\n",
      "0.5\n",
      "0.4875113036274319\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/3l/bhvwn3fx10x16r6xk6188lq40000gn/T/ipykernel_7831/3598788678.py:4: RuntimeWarning: invalid value encountered in scalar divide\n",
      "  z = np.abs((x01 - x10)/np.sqrt(nd))\n"
     ]
    }
   ],
   "source": [
    "scale = 40\n",
    "print(mcnemar_exact(results_table*scale))\n",
    "print(mcnemar_asymptotic(results_table*scale))\n",
    "print(binomial_asymptotic(results_table*scale))\n",
    "print(binomial_exact(results_table*scale))"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "1a3f742538928d7fe17d54779274ecff8afc8007fdeca0464d7b2ce3865992ba"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
