{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns\n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Location of the pickled object that the following cells read as `x_to_r`.\n",
    "data_fn = '../datasets/sehstr/block_18_stop6.pkl'\n",
    "\n",
    "# NOTE(review): pickle.load can run arbitrary code while deserializing, so\n",
    "# `data_fn` should only ever point at a file from a trusted source.\n",
    "with open(data_fn, mode='rb') as f:\n",
    "  x_to_r = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the block table. The number of rows in `blocks_df` is used further\n",
    "# down as the length of each one-hot vector.\n",
    "json_fn = '../datasets/sehstr/block_18.json'\n",
    "blocks_df = pd.read_json(path_or_buf=json_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['000000', '000001', '000002', '000003', '000004', '000005', '000006', '000007', '000008', '000009']\n"
     ]
    }
   ],
   "source": [
    "# Collect the keys of `x_to_r` in iteration order and preview the first ten.\n",
    "xs = [*x_to_r.keys()]\n",
    "print(xs[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n"
     ]
    }
   ],
   "source": [
    "# Featurize\n",
    "\n",
    "# Alphabet for the one-hot encoding: the position of a symbol in this string\n",
    "# is the slot that gets set. The second part is a raw string so that its two\n",
    "# backslashes are literal characters instead of invalid escape sequences\n",
    "# (which Python only tolerates with a warning). Because the backslash occurs\n",
    "# twice, a lookup with str.index always resolves to the first occurrence.\n",
    "symbols = ('0123456789abcdefghijklmnopqrstuvwxyz'\n",
    "           r'ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~')\n",
    "\n",
    "# Length of every one-hot vector. Only symbols whose position in `symbols`\n",
    "# is below this value can be encoded.\n",
    "num_blocks = len(blocks_df)\n",
    "\n",
    "import functools\n",
    "\n",
    "@functools.cache\n",
    "def symbol_ohe(symbol):\n",
    "  \"\"\"Return the one-hot vector for a single symbol.\n",
    "\n",
    "  The vector has length `num_blocks` and holds 1.0 at the position of\n",
    "  `symbol` within `symbols`, 0.0 everywhere else.\n",
    "\n",
    "  Results are memoized, so repeated calls with the same symbol return the\n",
    "  same array object. The array is therefore marked read-only: an in-place\n",
    "  write by a caller would otherwise silently change the cached value.\n",
    "  Copy the result before modifying it.\n",
    "\n",
    "  Raises:\n",
    "    ValueError: if `symbol` does not occur in `symbols`.\n",
    "    IndexError: if the position of `symbol` is not below `num_blocks`.\n",
    "  \"\"\"\n",
    "  zs = np.zeros(num_blocks)\n",
    "  zs[symbols.index(symbol)] = 1.0\n",
    "  zs.setflags(write=False)\n",
    "  return zs\n",
    "\n",
    "print(symbol_ohe('1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
      " 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n"
     ]
    }
   ],
   "source": [
    "# Featurization\n",
    "\n",
    "def featurize(x):\n",
    "  \"\"\"Encode a sequence of symbols as one flat feature vector.\n",
    "\n",
    "  Every element of `x` is one-hot encoded with `symbol_ohe`, and the\n",
    "  resulting vectors are joined end to end in the order they appear.\n",
    "  \"\"\"\n",
    "  per_symbol = list(map(symbol_ohe, x))\n",
    "  return np.concatenate(per_symbol)\n",
    "\n",
    "print(featurize(xs[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regression targets: the values stored in `x_to_r`, in its iteration order.\n",
    "Y = np.array([*x_to_r.values()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 34012224/34012224 [01:58<00:00, 287838.93it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Write each feature vector straight into a preallocated matrix. Collecting\n",
    "# one small array per sample in a Python list and converting the list at the\n",
    "# end would keep two full copies of the features in memory at the same time.\n",
    "# The row width is taken from the first sample; a sample that featurizes to\n",
    "# a different length fails loudly on assignment.\n",
    "n_features = len(featurize(xs[0])) if xs else 0\n",
    "X = np.empty((len(xs), n_features))\n",
    "for i, x in enumerate(tqdm(xs)):\n",
    "  X[i] = featurize(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.808403188596164\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import HistGradientBoostingRegressor\n",
    "\n",
    "# Number of leading rows used for fitting and scoring. Lower it\n",
    "# (e.g. to 1_000_000) for a quicker experiment.\n",
    "N_SUBSET = len(X)\n",
    "\n",
    "# Fixed seed so that the estimator's randomized steps (per the scikit-learn\n",
    "# docs: bin subsampling and the early-stopping validation split) are the\n",
    "# same on every run.\n",
    "SEED = 0\n",
    "\n",
    "model = HistGradientBoostingRegressor(random_state=SEED)\n",
    "model.fit(X[:N_SUBSET], Y[:N_SUBSET])\n",
    "# Scored on the same rows the model was fitted on, so this is an in-sample\n",
    "# score rather than a held-out estimate.\n",
    "print(model.score(X[:N_SUBSET], Y[:N_SUBSET]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9013709040275963, 0.0)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from scipy.stats import pearsonr\n",
    "\n",
    "# The correlation is computed on the first PEARSONR_SUBSET rows only.\n",
    "PEARSONR_SUBSET = 1_000_000\n",
    "\n",
    "pearsonr(\n",
    "  model.predict(X[:PEARSONR_SUBSET]),\n",
    "  Y[:PEARSONR_SUBSET],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved to file.\n"
     ]
    }
   ],
   "source": [
    "# Serialize the fitted model to a file in the current working directory.\n",
    "with open('sehstr_gbtr.pkl', mode='wb') as f:\n",
    "  pickle.dump(model, f)\n",
    "print('Saved to file.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gfn-substructure",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "5cd6857374bd741ac5516a7526edf2288815f16491bdff1aaecce98b4a9229f8"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
