{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c037de70",
   "metadata": {},
   "outputs": [],
   "source": [
    "import h5py\n",
    "import hdf5plugin\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22094bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the metadata rows for CITE-seq cells of a single donor.\n",
    "metadata = pd.read_csv(\"metadata.csv\")\n",
    "cite_meta = metadata[metadata[\"technology\"] == \"citeseq\"]\n",
    "cite_one_donor_meta = cite_meta[cite_meta[\"donor\"] == 13176]\n",
    "# Alias kept because a later cell filters this frame by day.\n",
    "cite_one_donor_one_cell = cite_one_donor_meta\n",
    "# A set gives constant-time membership tests while scanning the HDF5 row labels.\n",
    "s = set(cite_one_donor_one_cell[\"cell_id\"].values.tolist())\n",
    "\n",
    "# One (values, cell_ids) tuple per input file, in the order: train, test.\n",
    "data = []\n",
    "\n",
    "# Each file <name>.h5 stores its arrays under a top-level group called <name>.\n",
    "for name in [\"train_cite_inputs\", \"test_cite_inputs\"]:\n",
    "    # The with-block closes the file as soon as the selected rows have been read;\n",
    "    # previously both handles stayed open for the lifetime of the kernel.\n",
    "    with h5py.File(f'{name}.h5', 'r') as f:\n",
    "        inds_to_extract = []\n",
    "        extracted_ids = []\n",
    "\n",
    "        # tqdm wraps the array (not the enumerate object) so it knows the total.\n",
    "        for i, elem in enumerate(tqdm(f[f'{name}/axis1'][:])):\n",
    "            elem = elem.decode(\"utf-8\")\n",
    "            if elem in s:\n",
    "                inds_to_extract.append(i)\n",
    "                extracted_ids.append(elem)\n",
    "\n",
    "        # Indices were appended in ascending order, which h5py expects for\n",
    "        # list-based selections. Indexing reads the rows into memory, so the\n",
    "        # result remains usable after the file is closed.\n",
    "        extracted_data = f[f'{name}/block0_values'][inds_to_extract]\n",
    "\n",
    "    data.append((extracted_data, extracted_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2056ec92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the train and test selections; row i of genes_counts belongs to cell_ids[i].\n",
    "cell_ids = np.array(data[0][1] + data[1][1])\n",
    "genes_counts = np.concatenate([data[0][0], data[1][0]], axis=0)\n",
    "# The next cell indexes pcas by position in cell_ids, so both must have the same length.\n",
    "assert len(cell_ids) == genes_counts.shape[0], 'cell ids and count rows are misaligned'\n",
    "\n",
    "n_components = 1000\n",
    "\n",
    "# Fix the seed: with the default svd_solver scikit-learn may select a randomized\n",
    "# SVD for large inputs, which would make the saved components differ between runs.\n",
    "pca = PCA(n_components=n_components, random_state=0)\n",
    "\n",
    "pca.fit(genes_counts)\n",
    "\n",
    "# Project the same matrix that was used for fitting.\n",
    "pcas = pca.transform(genes_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8043dae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one array of PCA rows per value of the metadata day column.\n",
    "for day in [2, 3, 4, 7]:\n",
    "\n",
    "    day_ids = set(cite_one_donor_one_cell[cite_one_donor_one_cell[\"day\"] == day][\"cell_id\"].values.tolist())\n",
    "\n",
    "    # Positions in cell_ids (row-aligned with pcas) whose id belongs to this day.\n",
    "    # The loop previously iterated over the undefined name cell_id, which raised\n",
    "    # NameError on a fresh kernel.\n",
    "    day_inds = [i for i, elem in enumerate(cell_ids) if elem in day_ids]\n",
    "\n",
    "    day_pcas = pcas[day_inds]\n",
    "\n",
    "    # np.save appends the .npy extension to the given file name.\n",
    "    np.save(f\"full_cite_pcas_{n_components}_day_{day}\", day_pcas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64f4c181",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
