{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9870cf02-64f7-4794-b9f1-c97e9a1a3ef4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "tools_path = os.path.join('..', 'tools')\n",
    "sys.path.append(tools_path)\n",
    "import file_tools\n",
    "\n",
    "import os\n",
    "_DF_DIR = os.path.join('.', 'df_files')\n",
    "_FINAL_DFS_DIR = os.path.join('.', 'final_dfs')\n",
    "file_tools.ensure_dir(_FINAL_DFS_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a778190d-b537-419a-b566-a6b77f3a30a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6f94cb6-ed43-4b6f-b9e2-b6ac22ed6e08",
   "metadata": {},
   "outputs": [],
   "source": [
    "dfs = {}\n",
    "for f in file_tools.list_files(_DF_DIR, '*.parquet'):\n",
    "    dbf = os.path.join(_DF_DIR, f)\n",
    "    df = pd.read_parquet(dbf, engine='pyarrow')\n",
    "    basename = file_tools.get_filebasename(f)\n",
    "    dfs[basename] = df\n",
    "    print(basename, len(df), df['method_name'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c0b546e-5f7f-4f4a-9d13-4d5c94475fbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# some local regressors runs were test run incomplete to prepare for cluster runs\n",
    "mask_results = ((dfs['results']['method_name'] != \"SVR\") &\n",
    "        (dfs['results']['method_name'] != \"MLP\") &\n",
    "        (dfs['results']['method_name'] != \"RandomForest\"))\n",
    "\n",
    "result_df = dfs['results'][mask_results]\n",
    "print(result_df['method_name'].unique())\n",
    "\n",
    "# LinReg on cluster is a duplicate, we could keep cluster one or local run one\n",
    "# we remove cluste one\n",
    "mask_results_cluster = ((dfs['results_cluster']['method_name'] != \"LinearRegression\"))\n",
    "\n",
    "results_cluster_df = dfs['results_cluster'][mask_results_cluster]\n",
    "print(results_cluster_df['method_name'].unique())\n",
    "\n",
    "results_best_cluster_df = dfs['results_best_cluster']\n",
    "\n",
    "\n",
    "## SVR EEGNet\n",
    "mask_results_bestsvreegnet = ((dfs['results_bestsvreegnet']['method_name'] == \"BestSVR\"))\n",
    "results_bestsvreegnet_df = dfs['results_bestsvreegnet'][mask_results_bestsvreegnet]\n",
    "print(results_bestsvreegnet_df['method_name'].unique())\n",
    "\n",
    "\n",
    "final_result_df = pd.concat([result_df, results_cluster_df, results_best_cluster_df, results_bestsvreegnet_df], axis=0)\n",
    "print(final_result_df['method_name'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58826f1c-4647-4a2e-8e96-61845a6cb765",
   "metadata": {},
   "outputs": [],
   "source": [
    "results_filename = os.path.join(_FINAL_DFS_DIR, 'results.parquet')\n",
    "final_result_df.to_parquet(results_filename, engine='pyarrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9259cc25-a858-4ae3-99b3-a21400e55e45",
   "metadata": {},
   "outputs": [],
   "source": [
    "results_ablation_filename = os.path.join(_FINAL_DFS_DIR, 'results_ablation.parquet')\n",
    "dfs['results_ablation'].to_parquet(results_ablation_filename, engine='pyarrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c58fd8e3-01a2-40d9-80ed-6c3b15486716",
   "metadata": {},
   "outputs": [],
   "source": [
    "results_dimensionality_eeg_filename = os.path.join(_FINAL_DFS_DIR, 'results_dimensionality_eeg.parquet')\n",
    "dfs['results_dimensionality_eeg'].to_parquet(results_dimensionality_eeg_filename, engine='pyarrow')\n",
    "\n",
    "results_dimensionality_face_filename = os.path.join(_FINAL_DFS_DIR, 'results_dimensionality_face.parquet')\n",
    "dfs['results_dimensionality_face'].to_parquet(results_dimensionality_face_filename, engine='pyarrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19c56ad9-23c7-481e-ba50-0468854dcc20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for optim, we cut it all to 1000 iterations\n",
    "CUT_THRESHOLD = 999\n",
    "\n",
    "df_optim = dfs['optim']\n",
    "\n",
    "mask = ((df_optim['number'] <= CUT_THRESHOLD))\n",
    "df_optim = df_optim[mask]\n",
    "\n",
    "from collections import Counter\n",
    "element_counts = Counter(df_optim['number'])\n",
    "counts = list(element_counts.values())\n",
    "assert len(counts) == 1000\n",
    "assert all(count == counts[0] for count in counts)\n",
    "\n",
    "optim_filename = os.path.join(_FINAL_DFS_DIR, 'optim.parquet')\n",
    "df_optim.to_parquet(optim_filename, engine='pyarrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1413d1b0-2610-4876-b7c6-3f68f4e00dcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# optim_ablation is a a different beast as we will pick the 0 ablation from the optim results above\n",
    "# and cut it all again to 1000 iteration\n",
    "\n",
    "CUT_THRESHOLD = 999\n",
    "\n",
    "df_optim_ablation = dfs['optim_ablation']\n",
    "\n",
    "# trim to 1000 iterations\n",
    "mask = ((df_optim_ablation['number'] <= CUT_THRESHOLD))\n",
    "df_optim_ablation = df_optim_ablation[mask]\n",
    "\n",
    "df_optim_ablation['method_name']\n",
    "\n",
    "# remove all 0 runs \n",
    "mask = ((df_optim_ablation['ablation_distance'] != 0))\n",
    "df_optim_ablation = df_optim_ablation[mask]\n",
    "# replace them with df_optim which are the same for both ablation and control run_type\n",
    "df_optim_copy = df_optim.copy()\n",
    "df_optim_copy['ablation_distance'] = 0\n",
    "df_optim_copy['run_type'] = 'control'\n",
    "df_optim_ablation = pd.concat([df_optim_ablation, df_optim_copy], ignore_index=True)\n",
    "\n",
    "df_optim_copy = df_optim.copy()\n",
    "df_optim_copy['ablation_distance'] = 0\n",
    "df_optim_copy['run_type'] = 'ablation'\n",
    "df_optim_ablation = pd.concat([df_optim_ablation, df_optim_copy], ignore_index=True)\n",
    "\n",
    "# mask = ((df_optim_ablation['ablation_distance'] != 0) &\n",
    "#         (df['method_name'] != \"BestMLP\") &\n",
    "#         (df['eeg_name'] == \"EEG_Raw\") & \n",
    "#         (df['test_name'] == \"random\"))\n",
    "\n",
    "\n",
    "# sanity check\n",
    "from collections import Counter\n",
    "element_counts = Counter(df_optim_ablation['number'])\n",
    "counts = list(element_counts.values())\n",
    "assert len(counts) == 1000\n",
    "assert all(count == counts[0] for count in counts)\n",
    "\n",
    "optim_ablation_filename = os.path.join(_FINAL_DFS_DIR, 'optim_ablation.parquet')\n",
    "df_optim_ablation.to_parquet(optim_ablation_filename, engine='pyarrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4be24c1e-1d7f-420d-8da6-be95156ece0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_optim_ablation['ablation_distance'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e14b5e46-0915-4e88-b935-70e6a39fa0b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_optim_ablation['run_type'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c93186e5-8825-4f26-94f7-ea00edc52baf",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_optim_ablation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d0506aa-af65-49f0-bff6-12ffa181cae7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# grouped_df = dfs['results_best_again'].groupby(['method_name', 'eeg_name'])\n",
    "# grouped_df = dfs['results_clustersvr'].groupby(['method_name', 'eeg_name'])\n",
    "grouped_df = dfs['results_best_cluster'].groupby(['method_name', 'eeg_name'])\n",
    "\n",
    "\n",
    "# Calculate mean and std for each subtable\n",
    "compiled_result = grouped_df.agg({\n",
    "    'pearsonr_statistic': ['mean', 'std', 'sem']\n",
    "})\n",
    "\n",
    "# Flatten the column names\n",
    "compiled_result.columns = ['_'.join(col).strip() for col in compiled_result.columns.values]\n",
    "\n",
    "# Reset the index to display method_name and training_size as columns\n",
    "compiled_result = compiled_result.reset_index()\n",
    "compiled_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9411e7c0-9649-4a38-9c49-c5a8cb65a84e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
