{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 302,
   "id": "ab473954",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import h5py"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "890a5139",
   "metadata": {},
   "source": [
    "# get number of movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "ec5c2f20",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_root = \"/storage/datasets/neuroscience/ecog/data-by-subject/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "d8f4b0b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ls /storage/datasets/neuroscience/ecog/data-by-subject/sub3/data/trials/trial000/metadata.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "4bc36c69",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Subject names are the basenames of the entries found directly under data_root\n",
    "subj_paths = glob.glob(os.path.join(data_root, \"*\"))\n",
    "subjs = [os.path.basename(os.path.normpath(path)) for path in subj_paths]\n",
    "len(subjs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "id": "df2db1f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no metadata for sub1 trial003\n"
     ]
    }
   ],
   "source": [
    "def get_metadata(trial_metadata_path):\n",
    "    '''\n",
    "        Read a trial's metadata JSON file and return (stimulus-type, title).\n",
    "        Prints a message and returns (None, None) if the file cannot be opened,\n",
    "        is not valid JSON, or is missing either key.\n",
    "    '''\n",
    "    try:\n",
    "        # read the path that was passed in, not the notebook-level `trial_metadata`\n",
    "        with open(trial_metadata_path) as f:\n",
    "            metadata = json.load(f)\n",
    "        stim_type = metadata[\"stimulus-type\"]\n",
    "        title = metadata['title']\n",
    "        return stim_type, title\n",
    "    except (OSError, ValueError, KeyError, TypeError) as err:\n",
    "        # ValueError covers json.JSONDecodeError; TypeError covers a JSON document that is not a dict\n",
    "        print(f\"no metadata for {trial_metadata_path}: {err!r}\")\n",
    "        return None, None\n",
    "  \n",
    "def get_duration(trial_trigger_path):\n",
    "    '''\n",
    "        Return the trial duration computed from the trial's trigger-times CSV as\n",
    "        (start_time of the last \"end\" row - start_time of the first \"beginning\" row)/60/60.\n",
    "        NOTE(review): presumably start_time is in seconds, which makes the result hours -- confirm.\n",
    "\n",
    "        Prints a message and returns None if the file cannot be read or parsed,\n",
    "        a required column is missing, or there is no \"beginning\"/\"end\" row.\n",
    "    '''\n",
    "    try:\n",
    "        trig_df = pd.read_csv(trial_trigger_path)\n",
    "        # select the start_time column directly instead of building whole-row Series\n",
    "        begin_times = trig_df.loc[trig_df[\"type\"] == \"beginning\", \"start_time\"]\n",
    "        end_times = trig_df.loc[trig_df[\"type\"] == \"end\", \"start_time\"]\n",
    "        if begin_times.empty:\n",
    "            print(\"no beginning\")\n",
    "        if end_times.empty:\n",
    "            print(\"no end\")\n",
    "        # .iloc raises IndexError on an empty selection, which is handled below\n",
    "        start = float(begin_times.iloc[0])\n",
    "        end = float(end_times.iloc[-1])\n",
    "        return (end - start)/60/60\n",
    "    except (OSError, ValueError, KeyError, IndexError) as err:\n",
    "        # ValueError covers pandas' EmptyDataError/ParserError and non-numeric start times\n",
    "        print(f\"trouble finding duration for {trial_trigger_path}: {err!r}\")\n",
    "        return None\n",
    "    \n",
    "# Parallel lists: one entry per trial for which get_metadata returned a type and a title\n",
    "stim_types, titles, subj_list, trial_list, durations = [], [], [], [], []\n",
    "for subj in subjs:\n",
    "    subj_root = os.path.join(data_root, subj)\n",
    "    trial_pattern = os.path.join(subj_root, \"data/trials/*\")\n",
    "    for trial_root in glob.glob(trial_pattern):\n",
    "        trial = os.path.basename(os.path.normpath(trial_root))\n",
    "        trial_metadata = os.path.join(trial_root,\"metadata.json\")\n",
    "        trial_triggers_path = os.path.join(trial_root,\"trigger-times.csv\")\n",
    "        stim_type, title = get_metadata(trial_metadata)\n",
    "\n",
    "        # nothing is recorded for a trial without both a stimulus type and a title\n",
    "        if stim_type is None or title is None:\n",
    "            continue\n",
    "\n",
    "        stim_types.append(stim_type)\n",
    "        titles.append(title)\n",
    "        subj_list.append(subj)\n",
    "        trial_list.append(trial)\n",
    "        # the duration is looked up only for trials that are kept\n",
    "        duration = get_duration(trial_triggers_path)\n",
    "        durations.append(duration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9283bc5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "id": "1af95704",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'stimulus-type': 'movie',\n",
       " 'title': 'Fantastic Mr. Fox',\n",
       " 'filename': 'fantastic-mr-fox',\n",
       " 'language': 'en'}"
      ]
     },
     "execution_count": 255,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load one example metadata file explicitly so this cell also works on a fresh kernel\n",
    "# (`metadata` is otherwise only a local variable inside get_metadata).\n",
    "with open(os.path.join(data_root, \"sub3/data/trials/trial000/metadata.json\")) as f:\n",
    "    metadata = json.load(f)\n",
    "metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "id": "b9ca6771",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 256,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "glob.glob(os.path.join(data_root, \"data/trials/*\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "id": "05b9ac28",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/storage/datasets/neuroscience/ecog/data-by-subject/data/trials/*'"
      ]
     },
     "execution_count": 257,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.path.join(data_root, \"data/trials/*\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "id": "6209070e",
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_df = pd.DataFrame({\"type\":stim_types, \"title\":titles, \"subj\": subj_list, \"trial\": trial_list, \"duration\": durations})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4acc30f1",
   "metadata": {},
   "source": [
    "Exclude the Spanish version of Wreck-It Ralph (sub6, trial003)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "id": "83bd6a8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_df = movies_df[~((movies_df.trial == \"trial003\") & (movies_df.subj==\"sub6\"))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "id": "eb2dc9f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# movies_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "id": "c11c0665",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "      <th>title</th>\n",
       "      <th>subj</th>\n",
       "      <th>trial</th>\n",
       "      <th>duration</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>movie</td>\n",
       "      <td>Lord Of The Rings 2</td>\n",
       "      <td>sub3</td>\n",
       "      <td>trial002</td>\n",
       "      <td>3.851451</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>movie</td>\n",
       "      <td>Lord Of The Rings 1</td>\n",
       "      <td>sub3</td>\n",
       "      <td>trial001</td>\n",
       "      <td>2.536244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>movie</td>\n",
       "      <td>Cars 2</td>\n",
       "      <td>sub3</td>\n",
       "      <td>trial000</td>\n",
       "      <td>1.744050</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>movie</td>\n",
       "      <td>Antman</td>\n",
       "      <td>sub9</td>\n",
       "      <td>trial000</td>\n",
       "      <td>1.829651</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>movie</td>\n",
       "      <td>Sesame Street Episode 3990</td>\n",
       "      <td>sub8</td>\n",
       "      <td>trial000</td>\n",
       "      <td>1.058720</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    type                       title    subj     trial  duration\n",
       "0  movie         Lord Of The Rings 2  sub3  trial002  3.851451\n",
       "1  movie         Lord Of The Rings 1  sub3  trial001  2.536244\n",
       "2  movie                      Cars 2  sub3  trial000  1.744050\n",
       "3  movie                      Antman  sub9  trial000  1.829651\n",
       "4  movie  Sesame Street Episode 3990  sub8  trial000  1.058720"
      ]
     },
     "execution_count": 261,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "id": "2d7c997a",
   "metadata": {},
   "outputs": [],
   "source": [
    "counts = np.unique(movies_df.subj, return_counts=True)[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "id": "9da44109",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2.7, 1.7349351572897473)"
      ]
     },
     "execution_count": 263,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(counts), np.std(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "id": "be92e174",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(21,\n",
       " {'Antman',\n",
       "  'Aquaman',\n",
       "  'Avengers Infinity War',\n",
       "  'Black Panther',\n",
       "  'Cars 2',\n",
       "  'Coraline',\n",
       "  'Fantastic Mr. Fox',\n",
       "  'Guardians Of The Galaxy 2',\n",
       "  'Guardians Of the Galaxy',\n",
       "  'Lord Of The Rings 1',\n",
       "  'Lord Of The Rings 2',\n",
       "  'Megamind',\n",
       "  'Sesame Street Episode 3990',\n",
       "  'Shrek The Third',\n",
       "  'Spiderman Far From Home',\n",
       "  'Spiderman Homecoming',\n",
       "  'The Incredibles',\n",
       "  'The Martian',\n",
       "  'Thor Ragnarok',\n",
       "  'Toy Story',\n",
       "  'venom'})"
      ]
     },
     "execution_count": 264,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(movies_df.title)), set(movies_df.title)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2280fb3",
   "metadata": {},
   "source": [
    "# get movie durations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "id": "5fe3fa74",
   "metadata": {},
   "outputs": [],
   "source": [
    "trig_df = pd.read_csv(\"/storage/datasets/neuroscience/ecog/data-by-subject/sub5/data/trials/trial000/trigger-times.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "id": "9adb2897",
   "metadata": {},
   "outputs": [],
   "source": [
    "start = trig_df[trig_df.type==\"beginning\"].iloc[0][\"start_time\"].item()\n",
    "end = trig_df[trig_df.type==\"end\"].iloc[-1][\"start_time\"].item()\n",
    "duration = (end - start)/60/60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "id": "f5642f4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# trig_df[trig_df.type==\"beginning\"].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 269,
   "id": "46870ab7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "51.16058103463786"
      ]
     },
     "execution_count": 269,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(movies_df.duration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "id": "c620a146",
   "metadata": {},
   "outputs": [],
   "source": [
    "# select the duration column before summing so the string columns are not aggregated\n",
    "subj_durations = movies_df.groupby('subj')['duration'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "id": "122c9555",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5.116058103463787, 4.619623867265097)"
      ]
     },
     "execution_count": 278,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subj_durations.mean(), subj_durations.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 279,
   "id": "f66f91b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# subj_durations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4bf72c54",
   "metadata": {},
   "source": [
    "# get electrode counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 295,
   "id": "01004541",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"/storage/datasets/neuroscience/ecog/data-by-subject/sub3/data/electrode_labels.txt\", \"r\") as f:\n",
    "    lines = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 296,
   "id": "4f32e6ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['F2Ia\\n',\n",
       " 'F3aOF\\n',\n",
       " 'F3b\\n',\n",
       " 'F3c\\n',\n",
       " 'F3d\\n',\n",
       " 'P2a\\n',\n",
       " 'P2b\\n',\n",
       " 'T1aIc\\n',\n",
       " 'T1b\\n',\n",
       " 'T1cIe\\n',\n",
       " 'O1aIb\\n',\n",
       " 'O1bId\\n']"
      ]
     },
     "execution_count": 296,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "id": "6adc8043",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "depth-wm.csv            \u001b[0m\u001b[01;35mdural_ShiftDist.jpg\u001b[0m  raw_ShiftDist.fig\r\n",
      "depth-wm_ShiftDist.fig  pial.csv             \u001b[01;35mraw_ShiftDist.jpg\u001b[0m\r\n",
      "\u001b[01;35mdepth-wm_ShiftDist.jpg\u001b[0m  pial_ShiftDist.fig   wm.csv\r\n",
      "dural.csv               \u001b[01;35mpial_ShiftDist.jpg\u001b[0m   wm_ShiftDist.fig\r\n",
      "dural_ShiftDist.fig     raw.csv              \u001b[01;35mwm_ShiftDist.jpg\u001b[0m\r\n"
     ]
    }
   ],
   "source": [
    "ls \"/storage/datasets/neuroscience/ecog/data-by-subject/sub3/localization\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "id": "25d053eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_all_electrodes(subject, trial=\"trial000\", dataset_dir=\"/storage/datasets/neuroscience/ecog\"):\n",
    "    '''\n",
    "        returns the list of electrode labels for this subject, read from the `channel_labels`\n",
    "        dataset of the first header file listed in the trial's headers directory,\n",
    "        with every '*', '#' and '_' character removed from each label.\n",
    "        NOTE: the order of these labels is important. Their position corresponds with a row in data.h5\n",
    "\n",
    "        subject: subject directory name, e.g. \"sub3\"\n",
    "        trial: trial whose headers are read; defaults to \"trial000\" on the assumption\n",
    "            that every trial contains the same electrodes\n",
    "        dataset_dir: dataset root that contains data-by-subject/\n",
    "    '''\n",
    "    headers_dir = os.path.join(dataset_dir, f'data-by-subject/{subject}/data/trials/{trial}/headers')\n",
    "\n",
    "    def get_string_from_hdf5_reference(f, ref):\n",
    "        # dereference ref[0] in the file and turn each element into a character with chr\n",
    "        return ''.join(chr(i.item()) for i in f[ref[0]][:])\n",
    "\n",
    "    header_file_name = os.listdir(headers_dir)[0]\n",
    "    # the context manager closes the HDF5 file even if reading the labels fails\n",
    "    with h5py.File(os.path.join(headers_dir, header_file_name), 'r') as header_file:\n",
    "        electrode_labels = [get_string_from_hdf5_reference(header_file, ref) for ref in header_file['channel_labels']]\n",
    "\n",
    "    def strip_string(x):\n",
    "        return x.replace(\"*\",\"\").replace(\"#\",\"\").replace(\"_\",\"\")\n",
    "\n",
    "    return [strip_string(e) for e in electrode_labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 309,
   "id": "abe16e65",
   "metadata": {},
   "outputs": [],
   "source": [
    "excluded_labels = [\"DC4\", \"DC10\", \"TRIG4\"]\n",
    "all_elec_subjs, all_elecs = [], []\n",
    "for subj in subjs:\n",
    "    # keep every label except the excluded ones, preserving their order\n",
    "    electrodes = [label for label in get_all_electrodes(subj) if label not in excluded_labels]\n",
    "    all_elecs.extend(electrodes)\n",
    "    # repeat the subject name once per kept electrode so both lists stay aligned\n",
    "    all_elec_subjs.extend([subj]*len(electrodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "id": "c5b9a8ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "elec_df = pd.DataFrame({\"elec\": all_elecs, \"subj\": all_elec_subjs})\n",
    "elec_df[\"count\"] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 324,
   "id": "ea8c2110",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sum only the count column; the string `elec` column does not need aggregating\n",
    "elec_counts = list(elec_df.groupby(\"subj\")[\"count\"].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 326,
   "id": "b94dec50",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(168.8, 39.73467559254063)"
      ]
     },
     "execution_count": 326,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# elec_counts is a plain list (no .mean/.std methods), so use the numpy reductions\n",
    "np.mean(elec_counts), np.std(elec_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51b66abd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
