{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "341a7bab-7048-4489-ac2b-5b0de88f8dd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# sample_idx, exp, test_id, fluency, relevance, logical_soundness, normal\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aae6b386-7445-472f-ace4-6823aac9fcea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory scanned for rating files; every *.json file contributes at most one row.\n",
    "RESULT_DIR = Path(\"/data/vtt/human_evaluation\")\n",
    "# Sample indices left out of the analysis (the reason is not recorded in this notebook).\n",
    "BAD_SAMPLE_IDX = [1336, 695, 23, 929]\n",
    "# Explicit schema so `df` keeps these columns even when no file loads.\n",
    "RESULT_COLUMNS = [\n",
    "    \"sample_idx\", \"exp\", \"test_id\",\n",
    "    \"fluency\", \"relevance\", \"logical_soundness\", \"normal\",\n",
    "]\n",
    "\n",
    "# Sorted so the load order and any skip messages are reproducible across runs.\n",
    "files = sorted(RESULT_DIR.glob(\"*.json\"))\n",
    "results = []\n",
    "for file in files:\n",
    "    with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    try:\n",
    "        if data[\"sample_idx\"] in BAD_SAMPLE_IDX:\n",
    "            continue\n",
    "        results.append({\n",
    "            \"sample_idx\": data[\"sample_idx\"],\n",
    "            \"exp\": data[\"exp\"],\n",
    "            # test_id is the integer after the last underscore of the file name.\n",
    "            \"test_id\": int(file.stem.split(\"_\")[-1]),\n",
    "            \"fluency\": int(data[\"fluency\"]),\n",
    "            \"relevance\": int(data[\"relevance\"]),\n",
    "            \"logical_soundness\": int(data[\"logical_soundness\"]),\n",
    "            \"normal\": data[\"normal\"]\n",
    "        })\n",
    "    except (KeyError, TypeError, ValueError) as err:\n",
    "        # Best effort: name the malformed file and why it was skipped, then move on.\n",
    "        # Only data-shape errors are caught, so interrupts and real bugs still surface.\n",
    "        print(file, repr(err))\n",
    "df = pd.DataFrame(results, columns=RESULT_COLUMNS)\n",
    "df = df.sort_values(by=[\"sample_idx\", \"exp\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86aa4915-23ff-4b5c-8fce-d92e686d329c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c0d6a07-0607-45d4-8771-c19edd02d658",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sub-frame of `df` per experiment label, keyed by that label.\n",
    "exps = df[\"exp\"].unique()\n",
    "results_dict = {name: df[df[\"exp\"] == name] for name in exps}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1500c133-f4c4-492e-93ea-ca0c6469b1a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two-sample t-test on the logical_soundness ratings: ttnet_base vs. ttnet.\n",
    "stats.ttest_ind(\n",
    "    results_dict[\"ttnet_base\"][\"logical_soundness\"],\n",
    "    results_dict[\"ttnet\"][\"logical_soundness\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fcb1c4f-f985-460a-9247-fafdef5d5ef0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rating columns averaged for the table; restricting the reduction to them keeps\n",
    "# the string column `exp` out of `.mean()`, which newer pandas refuses to average.\n",
    "RATING_COLUMNS = [\"fluency\", \"relevance\", \"logical_soundness\"]\n",
    "\n",
    "df_ttnet = df[df.exp==\"ttnet\"]\n",
    "\n",
    "\n",
    "def significant(df_exp, key):\n",
    "    \"\"\"Return a LaTeX dagger if column `key` of `df_exp` differs from ttnet.\n",
    "\n",
    "    Args:\n",
    "        df_exp: Ratings of one experiment (a slice of `df`).\n",
    "        key: Name of the rating column to compare.\n",
    "\n",
    "    Returns:\n",
    "        The LaTeX dagger marker when the two-sample t-test against the ttnet\n",
    "        ratings gives p < 0.05, otherwise an empty string.\n",
    "    \"\"\"\n",
    "    _, p_value = stats.ttest_ind(df_ttnet[key], df_exp[key])\n",
    "    # Raw string keeps the backslash without relying on an invalid escape sequence.\n",
    "    return r\"$\\dagger$\" if p_value < 0.05 else \"\"\n",
    "\n",
    "\n",
    "results_table = []\n",
    "for exp in exps:\n",
    "    df_exp = df[df.exp==exp]\n",
    "    mean = df_exp[RATING_COLUMNS].mean()\n",
    "    # To report spread instead of the marker, format df_exp[RATING_COLUMNS].std()\n",
    "    # next to each mean.\n",
    "    results_table.append({\n",
    "        \"exp\": exp,\n",
    "        \"fluency\": f\"{mean['fluency']:.2f}{significant(df_exp, 'fluency')}\",\n",
    "        \"relevance\": f\"{mean['relevance']:.2f}{significant(df_exp, 'relevance')}\",\n",
    "        \"LS\": f\"{mean['logical_soundness']:.2f}{significant(df_exp, 'logical_soundness')}\",\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b94c5361-5eb0-405d-8104-3f1b5fd1e4b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print() shows the LaTeX source as plain text rather than as a quoted string repr.\n",
    "print(\n",
    "    pd.DataFrame(results_table)\n",
    "    .style\n",
    "    .to_latex()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17d6039b-7cfe-4372-be27-5f764edcede7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column variance for ttnet; numeric_only=True skips the string column `exp`\n",
    "# (and any other non-numeric column), which pandas >= 2.0 would otherwise reject.\n",
    "df[df.exp==\"ttnet\"].var(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5dd95a7e-f45c-4a84-b33a-0b8a5a2b1e81",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column mean for ttnet_base; numeric_only=True skips the string column `exp`\n",
    "# (and any other non-numeric column), which pandas >= 2.0 would otherwise reject.\n",
    "df[df.exp==\"ttnet_base\"].mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b634360-8a6c-4dc1-ab6c-a07b668095eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column mean for glacnet; numeric_only=True skips the string column `exp`\n",
    "# (and any other non-numeric column), which pandas >= 2.0 would otherwise reject.\n",
    "df[df.exp==\"glacnet\"].mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44e01553-d46e-4bd6-8a00-c2409d290db8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column mean for densecap; numeric_only=True skips the string column `exp`\n",
    "# (and any other non-numeric column), which pandas >= 2.0 would otherwise reject.\n",
    "df[df.exp==\"densecap\"].mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d028c3ad-a998-485b-ab89-9a5328280ae4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column mean for cst; numeric_only=True skips the string column `exp`\n",
    "# (and any other non-numeric column), which pandas >= 2.0 would otherwise reject.\n",
    "df[df.exp==\"cst\"].mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9c768de-95dc-4d30-a1a0-0d48f932b6db",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
