{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import numpy as np \n",
    "import pandas as pd \n",
    "from tqdm import tqdm \n",
    "\n",
    "from math500.math_utils import * \n",
    "from math500.parser import *\n",
    "from math500.grader import * \n",
    "\n",
    "from utils import * "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tasks = ['math500']\n",
    "shots = [\"few\"]\n",
    "models = ['gpt-4o-mini']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_value(likelihood_file, baseline_few_file, scored_file, selected_k): \n",
    "    if not os.path.exists(baseline_few_file):\n",
    "        print(f\"File not found: {baseline_few_file}\")\n",
    "        return None\n",
    "    else:\n",
    "        few_res = []\n",
    "        with open(baseline_few_file, 'r', encoding='utf-8') as f:\n",
    "            for i, line in enumerate(f, start=1):\n",
    "                try:\n",
    "                    few_res.append(json.loads(line))\n",
    "                except json.JSONDecodeError as e:\n",
    "                    print(baseline_few_file)\n",
    "                    print(f\"Error parsing line {i}: {e}\")\n",
    "                    print(line[:100])\n",
    "    \n",
    "    if not os.path.exists(likelihood_file):\n",
    "        print(f\"Error: {likelihood_file} not found.\")\n",
    "        return None\n",
    "\n",
    "    with open(likelihood_file, \"r\") as f:\n",
    "        likelihoods = json.load(f)\n",
    "\n",
    "    with open(scored_file, 'r', encoding='utf-8') as f:\n",
    "        scored_entries = [json.loads(line) for line in f]\n",
    "    \n",
    "    problem_groups = list(zip(*likelihoods))\n",
    "    \n",
    "    num_examples = len(few_res)           \n",
    "\n",
    "    aggregated_results = {\n",
    "        \"forward_score\": [[] for _ in range(num_examples)],\n",
    "        \"backward_score\": [[] for _ in range(num_examples)],\n",
    "        \"is_correct\": [[] for _ in range(num_examples)]\n",
    "    }\n",
    "\n",
    "    for i, (few, problem_likelihoods) in enumerate(zip(few_res, problem_groups)):\n",
    "        k_set = set(selected_k[i]) \n",
    "        problem_list = list(problem_likelihoods)\n",
    "\n",
    "        for cl in problem_list:\n",
    "            calculate_diffs(cl, k_set)\n",
    "\n",
    "        few_results = few['results']       \n",
    "    \n",
    "        for j, few_result in enumerate(few_results):\n",
    "            is_correct_val = scored_entries[i]['is_correct'][j]\n",
    "\n",
    "            ce_list = few_result[\"ce_losses\"]   \n",
    "            ce_mean = np.mean(ce_list)\n",
    "            \n",
    "            replace_ce_val = problem_list[j]['replace_ce_diff']\n",
    "\n",
    "            aggregated_results[\"forward_score\"][i].append(ce_mean)\n",
    "            aggregated_results[\"backward_score\"][i].append(replace_ce_val)\n",
    "            aggregated_results[\"is_correct\"][i].append(is_correct_val)\n",
    "\n",
    "    for key in aggregated_results:\n",
    "        aggregated_results[key] = np.array(aggregated_results[key])\n",
    "\n",
    "    return aggregated_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = {}\n",
    "\n",
    "for task in tasks:\n",
    "    if task not in results:\n",
    "        results[task] = {}\n",
    "    for model in models:\n",
    "        likelihood_file = f\"likelihood/{task}/{model}/few/all_likelihoods.json\"\n",
    "        baseline_few_file = f\"baselines/baseline/{task}/{model}/{task}_few_few.jsonl\" \n",
    "        scored_file = f\"result/{task}/{model}/{task}_few_scored.jsonl\"\n",
    "\n",
    "        embedding_file_path = f\"embedding/{task}/{task}_k.jsonl\" \n",
    "        selected_k = load_topk_jsonl(embedding_file_path, 500)\n",
    "\n",
    "        result = get_value(likelihood_file, baseline_few_file, scored_file, selected_k)\n",
    "        \n",
    "        results[task][model] = result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_temps(check):\n",
    "    backward_score  = check['backward_score']\n",
    "    forward_score      = check['forward_score']\n",
    "    \n",
    "    referi = forward_score - backward_score\n",
    "\n",
    "    return {\n",
    "        'referi': referi,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "===== MODEL: gpt-4o-mini =====\n",
      ">>> BEST only:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>math500</th>\n",
       "      <th>Avg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>referi_BEST</th>\n",
       "      <td>77.8 (77.8)</td>\n",
       "      <td>77.8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 math500   Avg\n",
       "referi_BEST  77.8 (77.8)  77.8"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "methods = ['referi']\n",
    "\n",
    "ordered_benchmarks = [\n",
    "    \"math500\"\n",
    "]\n",
    "\n",
    "for model in models:\n",
    "    print(f\"===== MODEL: {model} =====\")\n",
    "\n",
    "    rows = [m + \"_BEST\" for m in methods]\n",
    "    df_   = pd.DataFrame(index=rows,\n",
    "                           columns=ordered_benchmarks + ['Avg'],\n",
    "                           dtype=object)\n",
    "\n",
    "    for bm in ordered_benchmarks:\n",
    "        res = results.get(bm, {}).get(model)\n",
    "        if res is None:\n",
    "            df_[bm] = np.nan\n",
    "            continue\n",
    "\n",
    "        is_corr = res['is_correct']\n",
    "        N       = is_corr.shape[0]\n",
    "\n",
    "        for m in methods:            \n",
    "            temps = compute_temps(res)\n",
    "            arr   = temps[m]\n",
    "            chosen = np.argmin(arr, axis=-1)\n",
    "    \n",
    "            score = 100 * is_corr[np.arange(N), chosen].mean()\n",
    "            df_.at[f\"{m}_BEST\", bm] = f\"{score:.1f} ({score:.1f})\"\n",
    "\n",
    "    def compute_avg(df):\n",
    "        for idx in df.index:\n",
    "            vals = []\n",
    "            for bm in ordered_benchmarks:\n",
    "                cell = df.at[idx, bm]\n",
    "                if pd.isna(cell): continue\n",
    "                try:\n",
    "                    vals.append(float(cell.split()[0]))\n",
    "                except:\n",
    "                    pass\n",
    "            df.at[idx, 'Avg'] = round(np.mean(vals), 1) if vals else np.nan\n",
    "\n",
    "    compute_avg(df_)\n",
    "\n",
    "    print(\">>> BEST only:\")\n",
    "    display(df_)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "proj2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
