{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing\n",
    "\n",
    "For each experiment, we download the results of every model run (all jobs/epochs) from Google Sheets and combine them into one large dataframe. The sole purpose of this notebook is to save these results to our local drive as CSV files.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import gspread\n",
    "from google.oauth2.service_account import Credentials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to the service-account JSON key. Set GOOGLE_SERVICE_ACCOUNT_FILE to\n",
    "# point at a different key file; the default is the original location.\n",
    "SERVICE_ACCOUNT_FILE = os.environ.get(\n",
    "    \"GOOGLE_SERVICE_ACCOUNT_FILE\",\n",
    "    \"../velvety-tube-450516-r5-2dfa430c056c.json\",\n",
    ")\n",
    "\n",
    "# OAuth scopes requested for the service account (Google Sheets + Google Drive APIs)\n",
    "SCOPES = [\n",
    "    \"https://www.googleapis.com/auth/spreadsheets\",\n",
    "    \"https://www.googleapis.com/auth/drive.file\",  # Per-file Drive access (files created or opened by this app)\n",
    "    \"https://www.googleapis.com/auth/drive.readonly\",  # Read-only access to Drive files\n",
    "]\n",
    "\n",
    "# Authenticate with Google\n",
    "creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)\n",
    "client = gspread.authorize(creds)\n",
    "\n",
    "# Open the spreadsheet and map each worksheet title to its position, so that\n",
    "# sheets can later be fetched by index.\n",
    "SPREADSHEET_NAME = \"Matheus_counterexamples\"\n",
    "spreadsheet = client.open(SPREADSHEET_NAME)\n",
    "sheet_names = [sheet.title for sheet in spreadsheet.worksheets()]\n",
    "name_to_idx = {name: i for i, name in enumerate(sheet_names)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Table of experiments\n",
    "\n",
    "| counterexample | nbits | num_samples | num_experiments | (description)  | Done?     |\n",
    "|----------      |-------|----------   |----------       |----------|---------- |\n",
    "| 000110000      | 14    | 50000       | 20 (RNN,SAN)    |           |  **yes** |\n",
    "| 000110000      | 14  | 10000       | 20              |           |  **yes**  |\n",
    "|000110000        | 14  | 10000        | Data          | v2: three different (seed, n_val) combos, to see whether we get robust outcomes under higher-variance validation scores. However, we saw very divergent behavior and realized that the training sets had also been generated with different seeds, so see v3.          |           |\n",
    "| 000110000      | 14  | 10000         | 20              | v3: Same as v2, but training set held fixed.          |  **yes**  |"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Early Matheus experiments\n",
    "\n",
    "List of (n_val, seed) pairs: (40000, 2484), (30000, 2469), (20000, 2488)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RNN_counterexample_v3_012345678_nbits14_ntr10000_nval30000_bf20_seed2469\n"
     ]
    }
   ],
   "source": [
    "### early matheus experiments\n",
    "\n",
    "def exp_template_0(model_type, num_samples):\n",
    "    \"\"\"Return the experiment name for the original (unversioned) runs.\n",
    "\n",
    "    The counterexample signature '000110000', nbits = 14 and the trailing\n",
    "    `bf20_seed1234` tag are fixed; only the model type and the number of\n",
    "    samples vary.\n",
    "    \"\"\"\n",
    "    signature, bit_count = '000110000', 14\n",
    "    pieces = (\n",
    "        f\"{model_type}_counterexample{signature}\",\n",
    "        f\"nbits{bit_count}\",\n",
    "        f\"n{num_samples}\",\n",
    "        \"bf20\",\n",
    "        \"seed1234\",\n",
    "    )\n",
    "    return \"_\".join(pieces)\n",
    "\n",
    "\n",
    "def _versioned_exp_name(version, model_type, n_val, seed):\n",
    "    \"\"\"Shared builder for the versioned (v2, v3) experiment names.\n",
    "\n",
    "    n_train = 10000 and nbits = 14 are fixed for these runs.\n",
    "\n",
    "    NOTE(review): the experiments table lists these runs under counterexample\n",
    "    000110000, but the names embed the literal tag `012345678`. The tag is\n",
    "    reproduced verbatim here -- confirm what it denotes.\n",
    "    \"\"\"\n",
    "    n_train = 10000\n",
    "    nbits = 14\n",
    "    return f\"{model_type}_counterexample_{version}_012345678_nbits{nbits}_ntr{n_train}_nval{n_val}_bf20_seed{seed}\"\n",
    "\n",
    "\n",
    "def exp_template_1(model_type, n_val, seed):\n",
    "    \"\"\"Return the experiment name for a v2 run.\n",
    "\n",
    "    Args:\n",
    "        model_type: model prefix used in the name (e.g. \"RNN\" or \"SAN\").\n",
    "        n_val: value placed after `nval` in the name.\n",
    "        seed: value placed after `seed` in the name.\n",
    "    \"\"\"\n",
    "    return _versioned_exp_name(\"v2\", model_type, n_val, seed)\n",
    "\n",
    "\n",
    "def exp_template_v3(model_type, n_val, seed):\n",
    "    \"\"\"Return the experiment name for a v3 run (same layout as v2, tagged `v3`).\n",
    "\n",
    "    Args:\n",
    "        model_type: model prefix used in the name (e.g. \"RNN\" or \"SAN\").\n",
    "        n_val: value placed after `nval` in the name.\n",
    "        seed: value placed after `seed` in the name.\n",
    "    \"\"\"\n",
    "    return _versioned_exp_name(\"v3\", model_type, n_val, seed)\n",
    "\n",
    "\n",
    "# Select which experiment to download by editing `experiment`.\n",
    "experiment = 6\n",
    "# NOTE: despite the name, each tuple is ordered (n_val, seed).\n",
    "seed_nval_list = [(40000, 2484), (30000, 2469), (20000, 2488)]\n",
    "\n",
    "# NOTE(review): the experiments table lists 20 runs, while range(19) below\n",
    "# covers model ids 0..18 -- confirm that model 19 is meant to be left out.\n",
    "if experiment == 1:\n",
    "    nbits = 14\n",
    "    num_samples = 10000\n",
    "    model_type = \"RNN\"  # or \"SAN\"\n",
    "    experiment_name = exp_template_0(model_type, num_samples)\n",
    "    # RNN runs use model ids 10..24; any other model type uses 0..19.\n",
    "    model_range = range(10, 25) if model_type == \"RNN\" else range(20)\n",
    "elif experiment == 2:  # 5/25 experiments\n",
    "    model_type = \"SAN\"  # or \"RNN\"\n",
    "    experiment_name = exp_template_1(model_type, 30000, 2469)\n",
    "    model_range = list(range(19))\n",
    "elif experiment == 3:  # 5/25 experiments\n",
    "    model_type = \"SAN\"  # or \"RNN\"\n",
    "    experiment_name = exp_template_1(model_type, 40000, 2484)\n",
    "    model_range = list(range(19))\n",
    "elif experiment == 4:  # 5/25 experiments\n",
    "    model_type = \"SAN\"  # or \"RNN\"\n",
    "    experiment_name = exp_template_1(model_type, 20000, 2488)\n",
    "    model_range = list(range(19))\n",
    "elif experiment == 6:  # 6/2 experiments\n",
    "    model_type = \"RNN\"  # or \"SAN\"\n",
    "    experiment_name = exp_template_v3(model_type, 30000, 2469)\n",
    "    model_range = list(range(19))\n",
    "else:\n",
    "    # Without this guard an unknown id would leave experiment_name and\n",
    "    # model_range undefined, or silently reuse values from an earlier run.\n",
    "    raise ValueError(f\"Unknown experiment id {experiment!r}; expected one of 1, 2, 3, 4, 6.\")\n",
    "print(experiment_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve every worksheet name up front so that a missing sheet fails before\n",
    "# any download starts, with a message naming all sheets that were not found.\n",
    "targets = [(model, f\"{experiment_name}_model_{model}\") for model in model_range]\n",
    "missing = [target_name for _, target_name in targets if target_name not in name_to_idx]\n",
    "if missing:\n",
    "    raise KeyError(f\"Worksheets not found in {SPREADSHEET_NAME!r}: {missing}\")\n",
    "\n",
    "df_list = []\n",
    "for model, target_name in targets:\n",
    "    # ----- fetch data -----\n",
    "    ws = spreadsheet.get_worksheet(name_to_idx[target_name])\n",
    "    df_temp = pd.DataFrame(ws.get_all_records())\n",
    "    df_temp[\"model_num\"] = model  # tag each row with the model it came from\n",
    "    df_list.append(df_temp)\n",
    "\n",
    "# Concatenate all dataframes and write one CSV per experiment\n",
    "df = pd.concat(df_list, ignore_index=True)\n",
    "os.makedirs(\"counterexamples_processed\", exist_ok=True)\n",
    "df.to_csv(f\"counterexamples_processed/{experiment_name}.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example of the worksheet-name pattern: RNN_counterexample_v2_012345678_nbits14_ntr10000_nval40000_bf20_seed2484_model_6"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
