{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b6284c0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e62fa581",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ed670494",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8707ea6d",
   "metadata": {},
   "source": [
    "# Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8d3fcde5",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = \"data.05_21_16_53\"\n",
    "\n",
    "\n",
    "def read_json_lines_to_df(file_path):\n",
    "    df = pd.read_json(open(file_path, \"r\", encoding=\"utf8\"), lines=True)\n",
    "    return df\n",
    "\n",
    "\n",
    "def get_sample_df(data_path=data_path, task=\"text_summarization\"):\n",
    "    df1 = read_json_lines_to_df(f\"{data_path}/{task}.txt\")\n",
    "    df2 = read_json_lines_to_df(f\"{data_path}/{task}_result.txt\")\n",
    "    df3 = read_json_lines_to_df(f\"{data_path}/{task}_ppl.txt\")\n",
    "    df4 = read_json_lines_to_df(f\"{data_path}/{task}_score.txt\")\n",
    "    # merge by id, watermark_process column\n",
    "    df = pd.merge(df1, df2, on=[\"id\", \"watermark_processor\"])\n",
    "    df = pd.merge(df, df3, on=[\"id\", \"watermark_processor\"])\n",
    "    # df4 misses some rows, so we use left join\n",
    "    df = pd.merge(df, df4, on=[\"id\", \"watermark_processor\"], how=\"left\")\n",
    "    return df\n",
    "\n",
    "\n",
    "def get_bootstrap_df(data_path=data_path, task=\"machine_translation\"):\n",
    "    df1 = read_json_lines_to_df(f\"{data_path}/{task}_bleu.txt\")\n",
    "    return df1\n",
    "\n",
    "\n",
    "def extract_watermark_info(df, return_wp_list=False):\n",
    "    show_wp = [\n",
    "        \"No Watermark\",\n",
    "        \"$\\delta$-reweight\",\n",
    "        \"$\\gamma$-reweight\",\n",
    "        \"$\\delta$-reweight (woh)\",\n",
    "        \"$\\gamma$-reweight (woh)\",\n",
    "    ]\n",
    "    john_wps_set = set()\n",
    "\n",
    "    def map_wp_str(wp_str):\n",
    "        if \"Delta\" in wp_str or \"Gamma\" in wp_str:\n",
    "            woh = \", True)\" in wp_str\n",
    "            if \"Delta\" in wp_str and not woh:\n",
    "                return show_wp[1]\n",
    "            elif \"Delta\" in wp_str and woh:\n",
    "                return show_wp[3]\n",
    "            elif \"Gamma\" in wp_str and not woh:\n",
    "                return show_wp[2]\n",
    "            elif \"Gamma\" in wp_str and woh:\n",
    "                return show_wp[4]\n",
    "        elif \"John\" in wp_str:\n",
    "            import re\n",
    "\n",
    "            delta = re.findall(r\"delta=(\\d+\\.?\\d*)\", wp_str)[0]\n",
    "            n = \"Soft\" + f\"($\\delta$={delta})\"\n",
    "            john_wps_set.add(n)\n",
    "            return n\n",
    "        if wp_str == \"None\":\n",
    "            return show_wp[0]\n",
    "        else:\n",
    "            raise ValueError(\"Unknown watermark: {}\".format(wp_str))\n",
    "\n",
    "    df = df.assign(show_wp_name=df[\"watermark_processor\"].apply(map_wp_str))\n",
    "    john_wps = sorted(list(john_wps_set))\n",
    "    show_wp = show_wp + john_wps\n",
    "    if return_wp_list:\n",
    "        return df, show_wp\n",
    "    else:\n",
    "        return df\n",
    "\n",
    "\n",
    "def sample_df_2_stat(df, bootstrap=False, show_wp=None):\n",
    "    sdf = df.melt(\n",
    "        id_vars=[\"show_wp_name\"],\n",
    "        value_vars=[c for c in df.columns if df[c].dtype == np.float64],\n",
    "        var_name=\"score\",\n",
    "        value_name=\"value\",\n",
    "    )\n",
    "    sdf = sdf.groupby([\"show_wp_name\", \"score\"]).agg([\"mean\", \"std\", \"count\"])\n",
    "\n",
    "    def format_fn(x):\n",
    "        mean = x[\"mean\"]\n",
    "        if not bootstrap:\n",
    "            std = x[\"std\"] / np.sqrt(x[\"count\"])\n",
    "        else:\n",
    "            std = x[\"std\"]\n",
    "        if not np.isfinite(std):\n",
    "            return f\"{mean:.2f}±{std:.2f}\"\n",
    "        useful_digits = np.max(-int(np.floor(np.log10(std / 3))), 0)\n",
    "        fmt_str = f\"{{:.{useful_digits}f}}±{{:.{useful_digits}f}}\"\n",
    "        return fmt_str.format(mean, std)\n",
    "\n",
    "    sdf = sdf[\"value\"].apply(format_fn, axis=1).unstack()\n",
    "    if show_wp:\n",
    "        sdf = sdf.loc[show_wp]\n",
    "    return sdf\n",
    "\n",
    "\n",
    "def merge_stat_df(df1, df2):\n",
    "    df = pd.merge(df1, df2, left_index=True, right_index=True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2ce5adad",
   "metadata": {},
   "outputs": [],
   "source": [
    "tsdf, show_wp = extract_watermark_info(get_sample_df(), return_wp_list=True)\n",
    "mtdf = extract_watermark_info(get_sample_df(task=\"machine_translation\"))\n",
    "mtbsdf = extract_watermark_info(get_bootstrap_df())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "4df5587c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_len_info(df):\n",
    "    df = df.assign(\n",
    "        output_words=df[\"display_output\"].apply(lambda x: len(x.split(\" \"))),\n",
    "        output_chars=df[\"display_output\"].apply(lambda x: len(x)),\n",
    "    )\n",
    "    return df\n",
    "\n",
    "\n",
    "def filter_wh_score(df):\n",
    "    df = df[df[\"best_sum_score\"].notna()]\n",
    "    df = df[~df.show_wp_name.str.contains(\"woh\")]\n",
    "    return df\n",
    "\n",
    "def filter_score(df):\n",
    "    df = df[df[\"best_sum_score\"].notna()]\n",
    "    return df\n",
    "\n",
    "def filter_noout(df):\n",
    "    df = df[df['display_output']!='']\n",
    "    return df\n",
    "\n",
    "def merge_tasks(dfs: dict):\n",
    "    # add new column called task, and use key as task name\n",
    "    for k, v in dfs.items():\n",
    "        v[\"task\"] = k\n",
    "    df = pd.concat(dfs.values())\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9e0cf1af",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df = merge_tasks(\n",
    "    {\n",
    "        \"Text summarization\": extract_len_info(filter_score(tsdf)),\n",
    "        \"Machine translation\": extract_len_info(filter_score(mtdf)),\n",
    "    }\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "918e4514",
   "metadata": {},
   "source": [
    "verify woh has larger score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ae790ceb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>variable</th>\n",
       "      <th>best_score</th>\n",
       "      <th>entropy</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>task</th>\n",
       "      <th>show_wp_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">Text summarization</th>\n",
       "      <th>$\\delta$-reweight</th>\n",
       "      <td>0.8784±0.0015</td>\n",
       "      <td>0.9694±0.0009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>$\\delta$-reweight (woh)</th>\n",
       "      <td>0.9340±0.0015</td>\n",
       "      <td>0.9721±0.0009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>$\\gamma$-reweight</th>\n",
       "      <td>0.2207±0.0004</td>\n",
       "      <td>0.9695±0.0009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>$\\gamma$-reweight (woh)</th>\n",
       "      <td>0.2408±0.0004</td>\n",
       "      <td>0.9677±0.0009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">Machine translation</th>\n",
       "      <th>$\\delta$-reweight</th>\n",
       "      <td>0.4192±0.0043</td>\n",
       "      <td>0.5260±0.0024</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>$\\delta$-reweight (woh)</th>\n",
       "      <td>0.4517±0.0043</td>\n",
       "      <td>0.5249±0.0024</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>$\\gamma$-reweight</th>\n",
       "      <td>0.1056±0.0011</td>\n",
       "      <td>0.5271±0.0024</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>$\\gamma$-reweight (woh)</th>\n",
       "      <td>0.1192±0.0011</td>\n",
       "      <td>0.5281±0.0024</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "variable                                        best_score        entropy\n",
       "task                show_wp_name                                         \n",
       "Text summarization  $\\delta$-reweight        0.8784±0.0015  0.9694±0.0009\n",
       "                    $\\delta$-reweight (woh)  0.9340±0.0015  0.9721±0.0009\n",
       "                    $\\gamma$-reweight        0.2207±0.0004  0.9695±0.0009\n",
       "                    $\\gamma$-reweight (woh)  0.2408±0.0004  0.9677±0.0009\n",
       "Machine translation $\\delta$-reweight        0.4192±0.0043  0.5260±0.0024\n",
       "                    $\\delta$-reweight (woh)  0.4517±0.0043  0.5249±0.0024\n",
       "                    $\\gamma$-reweight        0.1056±0.0011  0.5271±0.0024\n",
       "                    $\\gamma$-reweight (woh)  0.1192±0.0011  0.5281±0.0024"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df[[\"task\", \"show_wp_name\", \"best_score\", \"entropy\"]].explode(\n",
    "    [\"best_score\", \"entropy\"]\n",
    ").astype({\"best_score\": float, \"entropy\": float}).melt(\n",
    "    id_vars=[\"task\", \"show_wp_name\"], value_vars=[\"best_score\", \"entropy\"]\n",
    ").groupby([\"task\", \"show_wp_name\", \"variable\"]).agg(\n",
    "    [\"mean\", \"std\", \"count\"]\n",
    ")['value'].apply(\n",
    "    lambda x: f\"{x['mean']:.4f}±{x['std']/np.sqrt(x['count']):.4f}\", axis=1\n",
    ").unstack().loc[[\"Text summarization\", \"Machine translation\"]]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e85b6b88",
   "metadata": {},
   "source": [
    "# additional performance table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "cc5952bc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lllll}\n",
      "\\toprule\n",
      "score & bertscore.precision & bertscore.recall &         rouge2 &         rougeL \\\\\n",
      "show\\_wp\\_name            &                     &                  &                &                \\\\\n",
      "\\midrule\n",
      "No Watermark            &       0.3180±0.0009 &    0.3361±0.0010 &  0.1388±0.0008 &  0.2445±0.0008 \\\\\n",
      "\\$\\textbackslash delta\\$-reweight       &       0.3180±0.0009 &    0.3365±0.0010 &  0.1392±0.0008 &  0.2451±0.0008 \\\\\n",
      "\\$\\textbackslash gamma\\$-reweight       &       0.3180±0.0009 &    0.3360±0.0010 &  0.1397±0.0008 &  0.2451±0.0008 \\\\\n",
      "\\$\\textbackslash delta\\$-reweight (woh) &       0.3185±0.0009 &    0.3370±0.0010 &  0.1398±0.0008 &  0.2455±0.0008 \\\\\n",
      "\\$\\textbackslash gamma\\$-reweight (woh) &       0.3178±0.0009 &    0.3361±0.0010 &  0.1393±0.0008 &  0.2447±0.0008 \\\\\n",
      "Soft(\\$\\textbackslash delta\\$=0.0)      &       0.3180±0.0009 &    0.3361±0.0010 &  0.1388±0.0008 &  0.2445±0.0008 \\\\\n",
      "Soft(\\$\\textbackslash delta\\$=1.0)      &       0.3092±0.0009 &    0.3382±0.0009 &  0.1344±0.0007 &  0.2400±0.0007 \\\\\n",
      "Soft(\\$\\textbackslash delta\\$=2.0)      &       0.2908±0.0008 &    0.3339±0.0009 &  0.1238±0.0007 &  0.2293±0.0007 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3513664/3198015382.py:1: FutureWarning: In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.\n",
      "  print(sample_df_2_stat(tsdf[['show_wp_name','bertscore.precision','bertscore.recall','rouge2','rougeL']], show_wp=show_wp).to_latex())\n"
     ]
    }
   ],
   "source": [
    "print(sample_df_2_stat(tsdf[['show_wp_name','bertscore.precision','bertscore.recall','rouge2','rougeL']], show_wp=show_wp).to_latex())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "3cb0c6c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{llll}\n",
      "\\toprule\n",
      "score & bertscore.precision & bertscore.recall &          ppl \\\\\n",
      "show\\_wp\\_name            &                     &                  &              \\\\\n",
      "\\midrule\n",
      "No Watermark            &         0.546±0.003 &      0.575±0.003 &    2.31±0.07 \\\\\n",
      "\\$\\textbackslash delta\\$-reweight       &         0.550±0.003 &      0.579±0.003 &    2.20±0.05 \\\\\n",
      "\\$\\textbackslash gamma\\$-reweight       &         0.549±0.003 &      0.577±0.003 &    2.24±0.04 \\\\\n",
      "\\$\\textbackslash delta\\$-reweight (woh) &         0.555±0.003 &      0.583±0.003 &  2.114±0.020 \\\\\n",
      "\\$\\textbackslash gamma\\$-reweight (woh) &         0.549±0.003 &      0.577±0.003 &    2.24±0.04 \\\\\n",
      "Soft(\\$\\textbackslash delta\\$=0.0)      &         0.546±0.003 &      0.575±0.003 &    2.31±0.07 \\\\\n",
      "Soft(\\$\\textbackslash delta\\$=1.0)      &         0.537±0.003 &      0.568±0.003 &    2.43±0.07 \\\\\n",
      "Soft(\\$\\textbackslash delta\\$=2.0)      &         0.523±0.003 &      0.555±0.003 &    2.81±0.07 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3513664/3115302146.py:1: FutureWarning: In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.\n",
      "  print(sample_df_2_stat(filter_noout(mtdf)[['show_wp_name','bertscore.precision','bertscore.recall','ppl']], show_wp=show_wp).to_latex())\n"
     ]
    }
   ],
   "source": [
    "print(sample_df_2_stat(filter_noout(mtdf)[['show_wp_name','bertscore.precision','bertscore.recall','ppl']], show_wp=show_wp).to_latex())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e490d27",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "4292d1429c89080e4a98e8c5b192a0c44fbb5e481943ce812f923de47b1f7601"
  },
  "kernelspec": {
   "display_name": "env: (machinelearning)",
   "language": "python",
   "name": "machinelearning"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
