{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os.path as osp\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from joblib.parallel import Parallel, delayed\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "from molecules.parse_sdf import get_num_atoms_from_smiles\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 42\n",
    "ROOT = \".\"\n",
    "DATASET = osp.join(ROOT, \"pcqm4m-v2-geometry-normed.csv.gz\")\n",
    "VAL_RATIO = 0.05\n",
    "TEST_RATIO = 0.05\n",
    "SHUFFLE_DIR = osp.join(ROOT, \"shuffle_split_dict.pt\")\n",
    "NUMATOMS_DIR = osp.join(ROOT, \"num_atoms_split_dict.pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C\n",
       "1                  COc1cc(OC)ccc1/C=C/N(C(=O)C)C\n",
       "2                    C=CCN(C(=O)C)/C=C/c1ccccc1C\n",
       "3                    C=CCN(C(=O)C)/C=C/c1ccccc1F\n",
       "4                   C=CCN(C(=O)C)/C=C/c1ccccc1Cl\n",
       "                           ...                  \n",
       "3378521    Cc1ccc(c(c1)C)N[C@H](/C(=N\\C1CC1)/O)C\n",
       "3378522     C[C@@H](/C(=N\\C1CC1)/O)Nc1cccc(c1C)C\n",
       "3378523     C[C@H](/C(=N\\C(=N)O)/O)Nc1cccc(c1C)C\n",
       "3378524    C[C@@H](/C(=N\\C(=N)O)/O)Nc1cccc(c1C)C\n",
       "3378525              CCOc1ccccc1NC/C(=N\\C1CC1)/O\n",
       "Name: smiles, Length: 3378526, dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "smiles = pd.read_csv(DATASET, nrows=None, usecols=[\"smiles\"])[\"smiles\"]\n",
    "smiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 50%|████▉     | 1674892/3378526 [00:15<00:15, 112748.03it/s][19:44:18] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:18] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:18] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:18] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:18] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:18] WARNING: not removing hydrogen atom without neighbors\n",
      " 54%|█████▎    | 1808491/3378526 [00:16<00:15, 101366.30it/s][19:44:19] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:19] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:19] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:19] WARNING: not removing hydrogen atom without neighbors\n",
      " 58%|█████▊    | 1943535/3378526 [00:17<00:12, 117184.34it/s][19:44:20] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:20] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:20] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:20] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:20] WARNING: not removing hydrogen atom without neighbors\n",
      "[19:44:20] WARNING: not removing hydrogen atom without neighbors\n",
      " 58%|█████▊    | 1964421/3378526 [00:17<00:10, 140620.49it/s][19:44:20] Conflicting single bond directions around double bond at index 13.\n",
      "[19:44:20]   BondStereo set to STEREONONE and single bond directions set to NONE.\n",
      " 62%|██████▏   | 2098547/3378526 [00:18<00:09, 134022.51it/s][19:44:21] Conflicting single bond directions around double bond at index 11.\n",
      "[19:44:21]   BondStereo set to STEREONONE and single bond directions set to NONE.\n",
      "100%|██████████| 3378526/3378526 [00:29<00:00, 113587.54it/s]\n"
     ]
    }
   ],
   "source": [
    "num_atoms = Parallel(n_jobs=-1)(delayed(get_num_atoms_from_smiles)(s) for s in tqdm(smiles))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_shuffle_split(N, val_ratio, test_ratio):\n",
    "    \"\"\" Create a random shuffle split and saves it to disk.\n",
    "    Args:\n",
    "        N: Total size of the dataset to split.\n",
    "    \"\"\"\n",
    "    rng = np.random.default_rng(seed=SEED)\n",
    "    all_ind = rng.permutation(N)\n",
    "    train_ratio = 1 - val_ratio - test_ratio\n",
    "    val_ratio_rem = val_ratio / (val_ratio + test_ratio)\n",
    "\n",
    "    # Random shuffle split into 90/5/5.\n",
    "    train_ind, tmp_ind = all_ind[:int(train_ratio * N)], all_ind[int(train_ratio * N):]\n",
    "    val_ind = tmp_ind[:int(val_ratio_rem * len(tmp_ind))]\n",
    "    test_ind = tmp_ind[int((1-val_ratio_rem) * len(tmp_ind)):]\n",
    "    assert check_splits(N, [train_ind, val_ind, test_ind], [train_ratio, val_ratio, test_ratio])\n",
    "\n",
    "    shuffle_split = {'train': train_ind, 'val': val_ind, 'test': test_ind}\n",
    "    torch.save(shuffle_split, SHUFFLE_DIR)\n",
    "\n",
    "def create_numatoms_split(num_atoms_list, val_ratio, test_ratio):\n",
    "    \"\"\" Create split by the size of molecules, testing on the largest ones.\n",
    "    Args:\n",
    "        num_atoms_list: List with molecule size per each graph.\n",
    "    \"\"\"\n",
    "    rng = np.random.default_rng(seed=SEED)\n",
    "    all_ind = np.argsort(np.array(num_atoms_list))\n",
    "    train_ratio = 1 - val_ratio - test_ratio\n",
    "    val_ratio_rem = val_ratio / (val_ratio + test_ratio)\n",
    "\n",
    "    # Split based on mol size into 90/5/5, but shuffle the top 10% randomly\n",
    "    # before splitting to validation and test set.\n",
    "    N = len(num_atoms_list)\n",
    "    train_ind, tmp_ind = all_ind[:int(train_ratio * N)], all_ind[int(train_ratio * N):]\n",
    "    rng.shuffle(tmp_ind)\n",
    "    val_ind = tmp_ind[:int(val_ratio_rem * len(tmp_ind))]\n",
    "    test_ind = tmp_ind[int((1-val_ratio_rem) * len(tmp_ind)):]\n",
    "    assert len(train_ind) + len(val_ind) + len(test_ind) == N\n",
    "    assert check_splits(N, [train_ind, val_ind, test_ind], [train_ratio, val_ratio, test_ratio])\n",
    "\n",
    "    size_split = {'train': train_ind, 'val': val_ind, 'test': test_ind}\n",
    "    torch.save(size_split, osp.join(ROOT, NUMATOMS_DIR))\n",
    "\n",
    "def check_splits(N, splits, ratios):\n",
    "    \"\"\" Check whether splits intersect and raise error if so.\n",
    "    \"\"\"\n",
    "    assert sum([len(split) for split in splits]) == N\n",
    "    for ii, split in enumerate(splits):\n",
    "        true_ratio = len(split) / N\n",
    "        assert abs(true_ratio - ratios[ii]) < 3/N\n",
    "    for i in range(len(splits) - 1):\n",
    "        for j in range(i + 1, len(splits)):\n",
    "            n_intersect = len(set(splits[i]) & set(splits[j]))\n",
    "            if n_intersect != 0:\n",
    "                raise ValueError(\n",
    "                    f\"Splits must not have intersecting indices: \"\n",
    "                    f\"split #{i} (n = {len(splits[i])}) and \"\n",
    "                    f\"split #{j} (n = {len(splits[j])}) have \"\n",
    "                    f\"{n_intersect} intersecting indices\"\n",
    "                )\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_numatoms_split(num_atoms, VAL_RATIO, TEST_RATIO)\n",
    "create_shuffle_split(len(smiles), VAL_RATIO, TEST_RATIO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_atoms_split = torch.load(NUMATOMS_DIR)\n",
    "shuffle_split = torch.load(SHUFFLE_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.legend.Legend at 0x7f239fde5820>"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD4CAYAAADy46FuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAcKklEQVR4nO3df5BV5Z3n8fdH6AAGRGh+pENrmihlVjFBpZCs2ZQZJvzQiZgtTfXEjF1ZNp0xxDKpJCvslMFoqMJJbXTYXUy5KysaifSScaWijCBiyFYQbFwSfqjTbSTSQqBDA8EkOMJ894/7tN5ubp++NPS9Df15Vd26537PeZ773EPDh3Oec08rIjAzM+vKOeUegJmZ9W0OCjMzy+SgMDOzTA4KMzPL5KAwM7NMA8s9gNNt1KhRUVNTU+5hmJmdUbZs2fL7iBhdaN1ZFxQ1NTU0NjaWexhmZmcUSb/tal1Rp54knS9ppaRXJb0i6ZOSRkpaK6kpPY/I236+pGZJr0makVe/StK2tG6xJKX6IEkrUn2TpJq8NnXpPZok1fVoD5iZWY8VO0fxD8A/RcTHgE8ArwDzgHURMQFYl14j6VKgFrgMmAkskTQg9fMgUA9MSI+ZqT4HOBgRFwP3A/elvkYCC4CrgSnAgvxAMjOz3tdtUEg6D/g08DBARPxLRBwCZgPL0mbLgBvT8mzgiYh4JyLeAJqBKZKqgPMiYmPkvg7+aKc27X2tBKalo40ZwNqIaIuIg8Ba3g8XMzMrgWLmKD4KtAL/S9IngC3AHcDYiNgLEBF7JY1J248DXsxr35Jq76blzvX2NrtTX8ckHQYq8+sF2hTt3XffpaWlhaNHj55s0zPO4MGDqa6upqKiotxDMbOzRDFBMRC4Erg9IjZJ+gfSaaYuqEAtMuo9bfP+G0r15E5pceGFF57QoKWlhWHDhlFTU0OaFjkrRQQHDhygpaWF8ePHl3s4ZnaWKGaOogVoiYhN6fVKcsGxL51OIj3vz9v+grz21cCeVK8uUO/QRtJAYDjQltFXBxHxUERMjojJo0efeHXX0aNHqaysPKtDAkASlZWV/eLIycxKp9ugiIjfAbslXZJK04CdwCqg/SqkOuCptLwKqE1XMo0nN2m9OZ2mOiJpapp/uLVTm/a+bgKeT/MYzwLTJY1Ik9jTU+2kne0h0a6/fE4zK51iv0dxO/C4pA8AvwG+TC5kGiTNAd4EbgaIiB2SGsiFyTFgbkQcT/3cBjwCDAFWpwfkJsofk9RM7kiiNvXVJule4KW03T0R0dbDz2pmZj1QVFBExFZgcoFV07rYfiGwsEC9EZhYoH6UFDQF1i0FlhYzzmLVzHv6dHbHrkXXd7vNoUOHWL58OV/72tdOqu/rrruO5cuXc/755/dwdGZmp+as+2Z2X3Xo0CGWLFlyQlAcP36cAQMGdNEKnnnmmd4empkV0Pk/lMX8h/Bs5aAokXnz5vH6668zadIkKioqGDp0KFVVVWzdupWdO3dy4403snv3bo4ePcodd9xBfX098P4tSd5++21mzZrFpz71KX75y18ybtw4nnrqKYYMGVLmT2ZmZzvfPbZEFi1axEUXXcTWrVv5wQ9+wObNm1m4cCE7d+4EYOnSpWzZsoXGxkYWL17MgQMHTuijqamJuXPnsmPHDs4//3x++tOflvpjmFk/5COKMpkyZUqH7zosXryYJ598EoDdu3fT1NREZWVlhzbjx49n0qRJAFx11VXs2rWrVMM1s37MQVEmH/zgB99bfuGFF3juuefYuHEj5557Ltdee23B70IMGjToveUBAwbw5z//uSRjNbP+zaeeSmTYsGEcOXKk4LrDhw8zYsQIzj33XF599VVefPHFgtuZmZVDvzyiKMfVC5WVlVxzzTVMnDiRIUOGMHbs2PfWzZw5kx/96Ed8/OMf55JLLmHq1KklH5/Z2cZXLZ0+/TIoymX58uUF64MGDWL16tUF17XPQ4waNYrt27e/V//2t7992sdnZlaITz2ZmVkmB4WZmWVyUJiZWSYHhZmZZXJQmJlZJgeFmZll6p+Xx949/DT3d/j09gcMHTqUt99++7T3a2Z2snxEYWZmmfrnEUUZ3HnnnXzkIx957/dR3H333Uhiw4YNHDx4kHfffZfvf//7zJ49u8wjNTPryEcUJVJbW8uKFSvee93Q0MCXv/xlnnzySV5++WXWr1/Pt771LXK/KtzMrO/wEUWJXHHFFezfv589e/bQ2trKiBEjqKqq4pvf/CYbNmzgnHPO4a233mLfvn186EMfKvdwzcze46AooZtuuomVK1fyu9/9jtraWh5//HFaW1vZsmULFRUV1NTUFLy9uJlZOTkoSqi2tpavfOUr/P73v+fnP/85DQ0NjBkzhoqKCtavX89vf/vbcg/RzOwE/TMoeuFy1mJcdtllHDlyhHHjxlFVVcUtt9zC5z73OSZPnsykSZP42Mc+VpZxmZll6Z9BUUbbtm17b3nUqFFs3Lix4Hb+DoWZ9RW+6snMzDI5KMzMLJODwszMMjkozMwsU1FBIWmXpG2StkpqTLWRktZKakrPI/K2ny+pWdJrkmbk1a9K/TRLWixJqT5I0opU3ySpJq9NXXqPJkl1p+2Tm5lZUU7miOIzETEpIian1/OAdRExAViXXiPpUqAWuAyYCSyRNCC1eRCoByakx8xUnwMcjIiLgfuB+1JfI4EFwNXAFGBBfiCZmVnvO5XLY2cD16blZcALwJ2p/kREvAO8IakZmCJpF3BeRGwEkPQocCOwOrW5O/W1Evhv6WhjBrA2ItpSm7XkwuUnpzBuLl92+ak0P8G2um3dbnPo0CGWL1/+3k0BT8YDDzxAfX095557bk+GZ2Z2Soo9oghgjaQtkupTbWxE7AVIz2NSfRywO69tS6qNS8ud6x3aRMQx4DBQmdFXB5LqJTVKamxtbS3yI5XWoUOHWLJkSY/aPvDAA/zpT386zSMyMytOsUcU10TEHkljgLWSXs3YVgVqkVHvaZv3CxEPAQ8BTJ48uU/efnXevHm8/vrrTJo0ic9+9rOMGTOGhoYG3nnnHT7/+c/zve99jz/+8Y984QtfoKWlhePHj3PXXXexb98+9uzZw2c+8xlGjRrF+vXry/1RzHpdzbynT6jtWnR9GUZiUGRQRMSe9Lxf0pPk5gv2SaqKiL2SqoD9afMW4IK85tXAnlSvLlDPb9MiaSAwHGhL9Ws7tXmh2A/XlyxatIjt27ezdetW1qxZw8qVK9m8eTMRwQ033MCGDRtobW3lwx/+ME8/nftLcvjwYYYPH84Pf/hD1q9fz6hRo8r8KcysP+r21JOkD0oa1r4MTAe2A6uA9quQ6oCn0vIqoDZdyTSe3KT15nR66oikqWn+4dZObdr7ugl4PnK/mOFZYLqkEWkSe3qqndHWrFnDmjVruOKKK7jyyit59dVXaWpq4vLLL+e5557jzjvv5Be/+AXDh5/mX9lqZtYDxRxRjAWeTFeyDgSWR8Q/SXoJaJA0B3gTuBkgInZIagB2AseAuRFxPPV1G/AIMITcJPbqVH8YeCxNfLeRu2qKiGiTdC/wUtrunvaJ7TNZRDB//ny++tWvnrBuy5YtPPPMM8yfP5/p06fz3e9+twwjNDN7X7dBERG/AT5RoH4AmNZFm4XAwgL1RmBigfpRUtAUWLcUWNrdOPu6YcOGceTIEQBmzJjBXXfdxS233MLQoUN56623qKio4NixY4wcOZIvfelLDB06lEceeaRDW596MrNy6Jd3jy3mctbTrbKykmuuuYaJEycya9YsvvjFL/LJT34SgKFDh/LjH/+Y5uZmvvOd73DOOedQUVHBgw8+CEB9fT2zZs2iqqrKk9lmVnL9MijKZfny5R1e33HHHR1eX3TRRcyYMYPObr/9dm6//fZeHZuZWVd8ryczM8vkoDAzs0z9JihyV9ue/frL5zSz0ukXQTF48GAOHDhw1v8jGhEcOHCAwYMHl3soZnYW6ReT2dXV1bS0tNBX7wN1Og0ePJjq6uruNzQzK1K/CIqKigrGjx9f7mGYmZ2R+sWpJzMz6zkHhZmZZXJQmJlZJgeFmZllclCYmVkmB4WZmWVyUJiZWSYHhZmZZXJQmJlZJgeFmZllclCYmVkmB4WZmWVyUJiZWSYHhZmZZXJQmJlZJgeFmZllclCYmVkmB4WZmWUqOigkDZD0/yT9LL0eKWmtpKb0PCJv2/mSmiW9JmlGXv0qSdvSusWSlOqDJK1I9U2SavLa1KX3aJJUd1o+tZmZFe1kjijuAF7Jez0PWBcRE4B16TWSLgVqgcuAmcASSQNSmweBemBCesxM9TnAwYi4GLgfuC/1NRJYAFwNTAEW5AeSmZn1vqKCQlI1cD3wP/PKs4FlaXkZcGNe/YmIeCci3gCagSmSqoDzImJjRATwaKc27X2tBKalo40ZwNqIaIuIg8Ba3g8XMzMrgWKPKB4A/hPwr3m1sRGxFyA9j0n1ccDuvO1aUm1cWu5c79AmIo4Bh4HKjL46kFQvqVFSY2tra5EfyczMitFtUEj6K2B/RGwpsk8VqEVGvadt3i9EPBQRkyNi8ujRo4scppmZFaOYI4prgBsk7QKeAP5C0o+Bfel0Eul5f9q+Bbggr301sCfVqwvUO7SRNBAYDrRl9GVmZiXSbVBExPyIqI6IGnKT1M9HxJeAVUD7VUh1wFNpeRVQm65kGk9u0npzOj11RNLUNP9wa6c27X3dlN4jgGeB6ZJGpEns6almZmYlMvAU2i4CGiTNAd4EbgaIiB2SGoCdwDFgbkQcT21uAx4BhgCr0wPgYeAxSc3kjiRqU19tku4FXkrb3RMRbacwZjMzO0knFRQR8QLwQlo+AEzrYruFwMIC9UZgYoH6UVLQFFi3FFh6MuM0M7PTx9/MNjOzTA4KMzPL5KAwM7NMDgozM8vkoDAzs0wOCjMzy3Qq36MwMyuoZt7THV7vWnR9mUZip4OPKMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMnUbFJIGS9os6VeSdkj6XqqPlLRWUlN6HpHXZr6kZkmvSZqRV79K0ra0brEkpfogSStSfZOkmrw2dek9miTVndZPb2Zm3SrmiOId4C8i4hPAJGCmpKnAPGBdREwA1qXXSLoUqAUuA2YCSyQNSH09CNQDE9JjZqrPAQ5GxMXA/cB9qa+RwALgamAKsCA/kMzMrPd1GxSR83Z6WZEeAcwGlqX6MuDGtDwbeCIi3omIN4BmYIqkKuC8iNgYEQE82qlNe18rgWnpaGMGsDYi2iLiILCW98PFzMxKoKg5CkkDJG0F9pP7h3sTMDYi9gKk5zFp83HA7rzmLak2Li13rndoExHHgMNAZUZfncdXL6lRUmNra2sxH8nMzIpUVFBExPGImARUkzs6mJixuQp1kVHvaZv88T0UEZMjYvLo0aMzhmZmZifrpK56iohDwAvkTv/sS6eTSM/702YtwAV5zaqBPaleXaDeoY2kgcBwoC2jLzMzK5FirnoaLen8tDwE+EvgVWAV0H4VUh3wVFpeBdSmK5nGk5u03pxOTx2RNDXNP9zaqU17XzcBz6d5jGeB6ZJGpEns6almZmYlMrCIbaqAZenKpXOAhoj4maSNQIOkOcCbwM0AEbFDUgOwEzgGzI2I46mv24BHgCHA6vQAeBh4TFIzuSOJ2tRXm6R7gZfSdvdERNupfGAzMzs53QZFRPwauKJA/QAwrYs2C4GFBeqNwAnzGxFxlBQ0BdYtBZZ2N04zM+sd/ma2mZllclCYmVkmB4WZmWVyUJiZWaZirnoys36mZt7THV7vWnR9mUZifYGPKMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyT7/VkZtYLzqb7ZfmIwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwydRsUki6QtF7SK5J2SLoj1UdKWiupKT2PyGszX1KzpNckzcirXyVpW1q3WJJSfZCkFam+SVJNXpu69B5NkupO66c3M7NuFXNEcQz4VkT8G2AqMFfSpcA8YF1ETADWpdekdbXAZcBMYImkAamvB4F6YEJ6zEz1OcDBiLgYuB+4L/U1ElgAXA1MARbkB5KZmfW+boMiIvZGxMtp+QjwCjAOmA0sS5stA25My7OBJyLinYh4A2gGpkiqAs6LiI0REcCjndq097USmJaONmYAayOiLSIOAmt5P1zMzKwETuqmgOmU0BXAJmBsROyFXJhIGpM2Gwe8mNesJdXeTcud6+1tdqe+jkk6DFTm1wu0yR9XPbkjFS688MKT+UhmZ6Wz6YZ0Vn5FT2ZLGgr8FPhGRPwha9MCtcio97TN+4WIhyJickRMHj16dMbQzMzsZBUVFJIqyIXE4xHxj6m8L51OIj3vT/UW4IK85tXAnlSvLlDv0EbSQGA40JbRl5mZlUgxVz0JeBh4JSJ+mLdqFdB+FVId8FRevTZdyTSe3KT15nSa6oikqanPWzu1ae/rJuD5NI/xLDBd0og0iT091czMrESKmaO4BvgbYJukran2n4FFQIOkOcCbwM0AEbFDUgOwk9wVU3Mj4nhqdxvwCDAEWJ0ekAuixyQ1kzuSqE19tUm6F3gpbXdPRLT17KOamVlPdBsUEfF/KTxXADCtizYLgYUF6o3AxAL1o6SgKbBuKbC0u3GamVnv8Dezzcwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCxTMb+4yMxKrGbe0x1e71p0fZlGYuagMDMrjbuHn1C6fPyFHV5vq9tWqtGcFJ96MjOzTA4KMzPL5KAwM7NMDgozM8vkoDAzs0wOCjMzy+SgMDOzTA4KMzPL5KAwM7NM3QaFpKWS9kvanlcbKWmtpKb0PCJv3XxJzZJekzQjr36VpG1p3WJJSvVBklak+iZJNXlt6tJ7NEmqO22f2szMilbMEcUjwMxOtXnAuoiYAKxLr5F0KVALXJbaLJE0ILV5EKgHJqRHe59zgIMRcTFwP3Bf6msksAC4GpgCLMgPJDMzK41ugyIiNgBtncqzgWVpeRlwY179iYh4JyLeAJqBKZKqgPMiYmNEBPBopzbtfa0EpqWjjRnA2ohoi4iDwFpODCwzM+tlPZ2jGBsRewHS85hUHwfsztuuJdXGpeXO9Q5tIuIYcBiozOjrBJLqJTVKamxtbe3hRzIzs0JO92S2CtQio97TNh2LEQ9FxOSImDx69OiiBmpmZsXpaVDsS6eTSM/7U70FuCBvu2pgT6pXF6h3aCNpIDCc3KmurvoyM7MS6mlQrALar0KqA57Kq9emK5nGk5u03pxOTx2RNDXNP9zaqU17XzcBz6d5jGeB6ZJGpEns6almZmYl1O0vLpL0E+BaYJSkFnJXIi0CGiTNAd4EbgaIiB2SGoCdwDFgbkQcT13dRu4KqiHA6vQAeBh4TFIzuSOJ2tRXm6R7gZfSdvdEROdJdTMz62XdBkVE/HUXq6Z1sf1CYGGBeiMwsUD9KCloCqxbCiztboxmfUnnX2MK/lWmdmbzN7PNzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwyOSjMzCyTg8LMzDI5KMzMLJODwszMMjkozMwsk4PCzMwydXtTQLP+xjf1M+vIRxRmZpbJQWFmZpkcFGZmlslBYWZmmRwUZmaWyUFhZmaZfHmsmdmZ4O7hBWqHS/LWDgozszPU5csu7/B6W922XnkfB4WddTp/Yc5fljM7NZ6jMDOzTA4KMzPL5KAwM7NMDgozM8vkyWzrczwZbda3nBFHFJJmSnpNUrOkeeUej5lZf9LnjygkDQD+O/BZoAV4SdKqiNhZ3pFZV3xEYHZ26fNBAUwBmiPiNwCSngBmAw6KXuBf2mNmnSkiyj2GTJJuAmZGxH9Mr/8GuDoivp63TT1Qn15eArzWg7caBfz+FIfb2/r6GD2+U9fXx9jXxwd9f4x9dXwfiYjRhVacCUcUKlDrkG4R8RDw0Cm9idQYEZNPpY/e1tfH6PGdur4+xr4+Puj7Y+zr4yvkTJjMbgEuyHtdDewp01jMzPqdMyEoXgImSBov6QNALbCqzGMyM+s3+vypp4g4JunrwLPAAGBpROzohbc6pVNXJdLXx+jxnbq+Psa+Pj7o+2Ps6+M7QZ+fzDYzs/I6E049mZlZGTkozMwsU78Liu5uB6KcxWn9ryVdWcKxXSBpvaRXJO2QdEeBba6VdFjS1vT4bqnGlzeGXZK2pfdvLLC+nPvwkrx9s1XSHyR9o9M2Jd+HkpZK2i9pe15tpKS1kprS84gu2vb6LWy6GN8PJL2a/gyflHR+F20zfx56eYx3S3or78/yui7almsfrsgb2y5JW7toW5J92GMR0W8e5CbDXwc+CnwA+BVwaadtrgNWk/v+xlRgUwnHVwVcmZaHAf9cYHzXAj8r837cBYzKWF+2fVjgz/t35L5IVNZ9CHwauBLYnlf7e2BeWp4H3NfFZ8j8me3F8U0HBqbl+wqNr5ifh14e493At4v4OSjLPuy0/r8A3y3nPuzpo78dUbx3O5CI+Beg/XYg+WYDj0bOi8D5kqpKMbiI2BsRL6flI8ArwLhSvPdpVrZ92Mk04PWI+G0Z3ruDiNgAtHUqzwaWpeVlwI0FmhbzM9sr44uINRFxLL18kdx3mMqmi31YjLLtw3aSBHwB+Mnpft9S6G9BMQ7Ynfe6hRP/IS5mm14nqQa4AthUYPUnJf1K0mpJl5V2ZEDum/FrJG1Jt0/prE/sQ3LfuenqL2a59yHA2IjYC7n/JABjCmzTV/blfyB3lFhIdz8Pve3r6fTY0i5O3/WFffjvgH0R0dTF+nLvw0z9LSi6vR1Ikdv0KklDgZ8C34iIP3Ra/TK5UymfAP4r8H9KObbkmoi4EpgFzJX06U7r+8I+/ABwA/C/C6zuC/uwWH1hX/4dcAx4vItNuvt56E0PAhcBk4C95E7vdFb2fQj8NdlHE+Xch93qb0FRzO1AynrLEEkV5ELi8Yj4x87rI+IPEfF2Wn4GqJA0qlTjS++7Jz3vB54kd2ifry/cdmUW8HJE7Ou8oi/sw2Rf+ym59Ly/wDbl/nmsA/4KuCXSyfTOivh56DURsS8ijkfEvwL/o4v3Lvc+HAj8e2BFV9uUcx8Wo78FRTG3A1kF3Jqu3JkKHG4/PdDb0nnMh4FXIuKHXWzzobQdkqaQ+zM8UIrxpff8oKRh7cvkJjy3d9qsbPswT5f/gyv3PsyzCqhLy3XAUwW2KdstbCTNBO4EboiIP3WxTTE/D705xvy5r8938d7lvg3QXwKvRkRLoZXl3odFKfdseqkf5K7I+WdyV0H8Xar9LfC3aVnkflHS68A2YHIJx/YpcofEvwa2psd1ncb3dWAHuSs3XgT+bYn330fTe/8qjaNP7cP0/ueS+4d/eF6trPuQXGjtBd4l9z/cOUAlsA5oSs8j07YfBp7J+pkt0fiayZ3bb/9Z/FHn8XX181DCMT6WfsZ+Te4f/6q+tA9T/ZH2n728bcuyD3v68C08zMwsU3879WRmZifJQWFmZpkcFGZmlslBYWZmmRwUZmaWyUFhZmaZHBRmZpbp/wPKeE8sEEV0SwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure()\n",
    "max_len = max([len(idx) for idx in num_atoms_split.values()])\n",
    "arr = []\n",
    "for idx in num_atoms_split.values():\n",
    "    this_arr = np.nan + np.zeros(max_len)\n",
    "    this_arr[:len(idx)] = np.array(num_atoms)[idx]\n",
    "    arr.append(this_arr)\n",
    "arr = np.stack(arr, axis=1)\n",
    "\n",
    "plt.hist(arr, stacked=False, bins=np.arange(0, np.nanmax(arr)), log=False)\n",
    "plt.legend(num_atoms_split.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.legend.Legend at 0x7f239fa71f10>"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbFklEQVR4nO3dfZAU9b3v8fdHsooIUeRBN4DZPRalZSSuZoPkGlOmvCpgInqSWGg0lMeb1YoYTUnK9ZxKityTPzi5ibG8pXCxQl1yIhqOhisVN0eMhaFS0YSF2gioHBZrDQsIKwbE+Ih+7x/TkGGY3e3Zh5nZ7c+ramq6fw/d326G+W7/uqdbEYGZmWXPcZUOwMzMKsMJwMwso5wAzMwyygnAzCyjnADMzDLqY5UOoBTjx4+Purq6SodhZjakbNiw4fWImFBYPqQSQF1dHa2trZUOw8xsSJH0arFyDwGZmWWUE4CZWUY5AZiZZdSQOgdQzAcffEBnZyfvvvtupUMZVCNHjmTy5MnU1NRUOhQzGyaGfALo7OxkzJgx1NXVIanS4QyKiGDfvn10dnZSX19f6XDMbJgY8kNA7777LuPGjRu2X/4Akhg3btywP8oxs/JKlQAkzZS0VVK7pOYi9WdLek7Se5IW5JWfJakt7/WmpDuTuoWSdubVze7rRgznL//DsrCNZlZevQ4BSRoBPABcBnQC6yWtjogX85q9AXwbuDq/b0RsBRrylrMTWJXX5KcR8eN+xG9mZn2U5hzAdKA9Il4BkPQoMAc4kgAiYi+wV9KVPSznUmB7RBT9QcJAqWt+ckCX17Gop02C/fv3s2LFCr71rW+VtNzZs2ezYsUKTjnllH5EZ2bWd2mGgCYBO/LmO5OyUs0FHikomy/pBUnLJI0t1klSk6RWSa1dXV19WO3g2r9/Pw8++OAx5R9++GGP/VpaWvzlb1alpi2fdtRruEpzBFBs8Lmkx4hJOh64Crgnr3gx8K/Jsv4V+AnwT8esKGIpsBSgsbGx6h5f1tzczPbt22loaKCmpobRo0dTW1tLW1sbL774IldffTU7duzg3Xff5Y477qCpqQn4+20t3nrrLWbNmsXnP/95/vCHPzBp0iSeeOIJTjzxxApvmVmGLDz56Pn6MyoTR5mlSQCdwJS8+cnArhLXMwvYGBF7DhfkT0t6CPh1icusCosWLWLz5s20tbXx7LPPcuWVV7J58+Yjl2suW7aMU089lXfeeYfPfvazfOUrX2HcuHFHLWPbtm088sgjPPTQQ1x77bU8/vjj3HDDDZXYHLNhr9gwccfICgRSBdIMAa0HpkqqT/6SnwusLnE911Ew/COpNm/2GmBzicusStOnTz/qWv3777+f8847jxkzZrBjxw62bdt2TJ/6+noaGhoA+MxnPkNHR0eZojWzLOv1CCAiDkmaDzwFjACWRcQWSbcm9UsknQ60Ah8HPkou9TwnIt6UNIrcFUS3FCz6R5IayA0BdRSpH5JOOumkI9PPPvssv/3tb3nuuecYNWoUl1xySdFr+U844YQj0yNGjOCdd94pS6xmlm2pfgkcES1AS0HZkrzp18gNDRXr+zYwrkj5jSVFWqXGjBnDwYMHi9YdOHCAsWPHMmrUKF5++WWef/75MkdnZta9IX8riEK9XbY50MaNG8dFF13Eueeey4knnshpp512pG7mzJksWbKET3/605x11lnMmDGjrLGZmfVk2CWASlixYkXR8hNOOIHf/OY3ResOj/OPHz+ezZv/fvpjwYIFRdubmQ00JwAzG1IKr+LpGHn9MW2mFVzGuWnepkGNaaga8jeDMzOzvnECMDPLKCcAM7OMcgIwM8soJwAzs4waflcBFd7Uqd/LOzCgixs9ejRvvfXWgC7TzKwvfARgZpZRw+8IoMzuvvtuPvnJTx55IMzChQuRxLp16/jrX//KBx98wA9/+EPmzJlT4UjNzI7mI4B+mjt3Lr/85S+PzK9cuZKbbrqJVatWsXHjRtauXctdd91FRNU9ysDMMs5HAP10/vnns3fvXnbt2kVXVxdjx46ltraW73znO6xbt47jjjuOnTt3smfPHk4//fRKh2tmdoQTwAD46le/ymOPPcZrr73G3Llzefjhh+nq6mLDhg3U1NRQV1dX9DbQZmaV5AQwAObOncs3v/lNXn/9dX73u9+xcuVKJk6cSE1NDWvXruXVV1+tdIhmZscYfglggC/bTONTn/oUBw8eZNKkSdTW1vL1r3+dL3/5yzQ2NtLQ0MDZZ59d9pjMzHoz/BJAhWza9Pe7DY4fP57nnnuuaDv/BsDMqoWvAjIzyygnADOzjHICMDPLqFQJQNJMSVsltUtqLlJ/tqTnJL0naUFBXYekTZLaJLXmlZ8q6WlJ25L3sf3fHDMzS6vXk8CSRgAPAJcBncB6Sasj4sW8Zm8A3wau7mYxX4yI1wvKmoFnImJRklSagbtLjN/MrPIKb0JZgasR+yLNVUDTgfaIeAVA0qPAHOBIAoiIvcBeSVeWsO45wCXJ9HLgWZwAzKzKFT6TGKBjZAUCGQBpEsAkYEfefCdwYQnrCGCNpAD+T0QsTcpPi4jdABGxW9LEYp0lNQFNAGeccUaxJkeZtnxaCaH1rreHSe/fv58VK1YcuRlcKe677z6ampoYNWpUX8MzM+uzNOcAVKSslDubXRQRFwCzgNskfaGEvkTE0ohojIjGCRMmlNK1LPbv38+DDz7Yp7733Xcfb7/99gBHZGaVNm35tKNe1SrNEUAnMCVvfjKwK+0KImJX8r5X0ipyQ0rrgD2SapO//muBvenDrh7Nzc1s376dhoYGLrvsMiZOnMjKlSt57733uOaaa/jBD37A3/72N6699lo6Ozv58MMP+d73vseePXvYtWsXX/ziFxk/fjxr166t9KaYWcakSQDrgamS6oGdwFzg+jQLl3QScFxEHEymLwf+Z1K9GpgHLErenygx9qqwaNEiNm/eTFtbG2vWrOGxxx7jT3/6ExHBVVddxbp16+jq6uITn/gETz6ZGzs8cOAAJ598Mvfeey9r165l/PjxFd4Ks/IpHEPvGFnwdTJETqAOB70mgIg4JGk+8BQwAlgWEVsk3ZrUL5F0OtAKfBz4SNKdwDnAeGCVpMPrWhER/5ksehGwUtLNwF+Arw3ollXAmjVrWLNmDeeffz6Qu+3Dtm3buPjii1mwYAF33303X/rSl7j44osrHKlZ9So2ZNLbuTjrm1T3AoqIFqCloGxJ3vRr5IaGCr0JnNfNMvcBl6aOdAiICO655x5uueWWY+o2bNhAS0sL99xzD5dffjnf//73KxChmdnf+ZfA/TRmzBgOHjwIwBVXXMGyZcuO3PBt586dRx4WM2rUKG644QYWLFjAxo0bj+lrZlZuw+5uoOU+VBw3bhwXXXQR5557LrNmzeL666/nc5/7HACjR4/mF7/4Be3t7Xz3u9/luOOOo6amhsWLFwPQ1NTErFmzqK2t9UlgMyu7YZcAKmHFihVHzd9xxx1HzZ955plcccUVx/S7/fbbuf322wc1NjOz7ngIyMwso5wAzMwyalgkgIhSfpg8NGVhG82svIZ8Ahg5ciT79u0b1l+QEcG+ffsYOXKI3nHKzKrSkD8JPHnyZDo7O+nq6qp0KINq5MiRTJ5c7KcWZmZ9M+QTQE1NDfX19ZUOw8xsyBnyQ0BmZtY3TgBmZhnlBGBmllFOAGZmGeUEYGaWUU4AZmYZ5QRgZpZRTgBmZhnlBGBmllFOAGZmGeUEYGaWUU4AZmYZlSoBSJopaaukdknNRerPlvScpPckLcgrnyJpraSXJG2RdEde3UJJOyW1Ja/ZA7NJZmaWRq93A5U0AngAuAzoBNZLWh0RL+Y1ewP4NnB1QfdDwF0RsVHSGGCDpKfz+v40In7c340wM7PSpTkCmA60R8QrEfE+8CgwJ79BROyNiPXABwXluyNiYzJ9EHgJmDQgkZuZWb+kSQCTgB1585304UtcUh1wPvDHvOL5kl6QtEzS2G76NUlqldQ63B/6YmZWTmkSgIqUlfT8RUmjgceBOyPizaR4MXAm0ADsBn5SrG9ELI2IxohonDBhQimrNTOzHqRJAJ3AlLz5ycCutCuQVEPuy//hiPjV4fKI2BMRH0bER8BD5IaazMysTNIkgPXAVEn1ko4H5gKr0yxckoCfAS9FxL0FdbV5s9cAm9OFbGZmA6HXq4Ai4pCk+cBTwAhgWURskXRrUr9E0ulAK/Bx4CNJdwLnAJ8GbgQ2SWpLFvnPEdEC/EhSA7nhpA7glgHcLjMz60Wqh8InX9gtBWVL8qZfIzc0VOj3FD+HQETcmD5MMzMbaP4lsJlZRjkBmJlllBOAmVlGOQGYmWWUE4CZWUY5AZiZZZQTgJlZRjkBmJlllBOAmVlGOQGYmWWUE4CZWUY5AZiZZZQTgJlZRqW6G6iZ2WF1zU8eNd+x6Mqj5qctn3ZMn03zNg1qTNY3TgBm1j8LTz56vv6MysRhJfMQkJlZRjkBmJlllBOAmVlGOQGYmWWUE4CZWUalSgCSZkraKqldUnOR+rMlPSfpPUkL0vSVdKqkpyVtS97H9n9zzMwsrV4TgKQRwAPALOAc4DpJ5xQ0ewP4NvDjEvo2A89ExFTgmWTezMzKJM0RwHSgPSJeiYj3gUeBOfkNImJvRKwHPiih7xxgeTK9HLi6b5tgZmZ9kSYBTAJ25M13JmVp9NT3tIjYDZC8Tyy2AElNkloltXZ1daVcrZmZ9SZNAlCRski5/P70zTWOWBoRjRHROGHChFK6mplZD9IkgE5gSt78ZGBXyuX31HePpFqA5H1vymWamdkASJMA1gNTJdVLOh6YC6xOufye+q4G5iXT84An0odtZmb91evN4CLikKT5wFPACGBZRGyRdGtSv0TS6UAr8HHgI0l3AudExJvF+iaLXgSslHQz8BfgawO8bWZm1oNUdwONiBagpaBsSd70a+SGd1L1Tcr3AZeWEqyZmQ0c/xLYzCyjnADMzDLKCcDMLKOcAMzMMsoJwMwso5wAzMwyygnAzCyjnADMzDLKCcDMLKOcAMzMMsoJwMwso5wAzMwyygnAzCyjnADMzDLKCcDMLKOcAMzMMsoJwMwso1I9EczMhrGFJxfMH6hMHFZ2TgBmGVLX/OQxZR0jj56ftnzaUfOb5m0azJCsgjwEZGaWUU4AZmYZlSoBSJopaaukdknNReol6f6k/gVJFyTlZ0lqy3u9KenOpG6hpJ15dbMHdMvMzKxHvZ4DkDQCeAC4DOgE1ktaHREv5jWbBUxNXhcCi4ELI2Ir0JC3nJ3Aqrx+P42IHw/AdpiZWYnSHAFMB9oj4pWIeB94FJhT0GYO8PPIeR44RVJtQZtLge0R8Wq/ozYzs35LkwAmATvy5juTslLbzAUeKSibnwwZLZM0ttjKJTVJapXU2tXVlSJcMzNLI00CUJGyKKWNpOOBq4D/yKtfDJxJbohoN/CTYiuPiKUR0RgRjRMmTEgRrpmZpZEmAXQCU/LmJwO7SmwzC9gYEXsOF0TEnoj4MCI+Ah4iN9RkZmZlkiYBrAemSqpP/pKfC6wuaLMa+EZyNdAM4EBE7M6rv46C4Z+CcwTXAJtLjt7MzPqs16uAIuKQpPnAU8AIYFlEbJF0a1K/BGgBZgPtwNvATYf7SxpF7gqiWwoW/SNJDeSGijqK1JuZ2SBKdSuIiGgh9yWfX7YkbzqA27rp+zYwrkj5jSVFamZmA8q/BDYzyygnADOzjHICMDPLKCcAM7OMcgIwM8soJwAzs4xyAjAzyygnADOzjHICMDPLKCcAM7OMcgIwM8soJwAzs4xyAjAzy6hUdwM1s+pQ1/zkMWUdI68/an5a/RlHzW+at2lQY7Khy0cAZmYZ5QRgZpZRTgBmZhnlBGBmllFOAGZmGeUEYGaWUakSgKSZkrZKapfUXKReku5P6l+QdEFeXYekTZLaJLXmlZ8q6WlJ25L3sQOzSWZmlkavCUDSCOABYBZwDnCdpHMKms0CpiavJmBxQf0XI6IhIhrzypqBZyJiKvBMMm9mZmWS5ghgOtAeEa9ExPvAo8CcgjZzgJ9HzvPAKZJqe1nuHGB5Mr0cuDp92GZm1l9pEsAkYEfefGdSlrZNAGskbZDUlNfmtIjYDZC8TywlcDMz6580t4JQkbIooc1FEbFL0kTgaUkvR8S6tAEmSaMJ4IwzzuiltZmZpZXmCKATmJI3PxnYlbZNRBx+3wusIjekBLDn8DBR8r632MojYmlENEZE44QJE1KEa2ZmaaRJAOuBqZLqJR0PzAVWF7RZDXwjuRpoBnAgInZLOknSGABJJwGXA5vz+sxLpucBT/RzW8zMrAS9DgFFxCFJ84GngBHAsojYIunWpH4J0ALMBtqBt4Gbku6nAaskHV7Xioj4z6RuEbBS0s3AX4CvDdhWmZlZr1LdDjoiWsh9yeeXLcmbDuC2Iv1eAc7rZpn7gEtLCdbMzAaOfwlsZpZRTgBmZhnlBGBmllFOAGZmGeVnAptVmWnLpx0172f62mBxAjAro8KHuncsurJCkZg5AZhV1sKTjy2r9y1PrDx8DsDMLKOcAMzMMsoJwMwso5wAzMwyygnAzCyjnADMzDLKCcDMLKOcAMzMMsoJwMwso5wAzMwyygnAzCyjnADMzDLKCcDMLKOcAMzMMipVApA0U9JWSe2SmovUS9L9Sf0Lki5IyqdIWivpJUlbJN2R12ehpJ2S2pLX7IHbLDMz602vzwOQNAJ4ALgM6ATWS1odES/mNZsFTE1eFwKLk/dDwF0RsVHSGGCDpKfz+v40In48cJtjZmZppXkgzHSgPSJeAZD0KDAHyE8Ac4CfR0QAz0s6RVJtROwGdgNExEFJLwGTCvqaDRnHPNFr5PXHtJlW8EAXP9LRqlWaIaBJwI68+c6krKQ2kuqA84E/5hXPT4aMlkkaW2zlkpoktUpq7erqShGumZmlkSYBqEhZlNJG0mjgceDOiHgzKV4MnAk0kDtK+EmxlUfE0ohojIjGCRMmpAjXzMzSSJMAOoEpefOTgV1p20iqIffl/3BE/Opwg4jYExEfRsRHwEPkhprMzKxM0iSA9cBUSfWSjgfmAqsL2qwGvpFcDTQDOBARuyUJ+BnwUkTcm99BUm3e7DXA5j5vhZmZlazXk8ARcUjSfOApYASwLCK2SLo1qV8CtACzgXbgbeCmpPtFwI3AJkltSdk/R0QL8CNJDeSGijqAWwZom8zMLIU0VwGRfGG3FJQtyZsO4LYi/X5P8fMDRMSNJUVqZmYDyr8ENjPLKCcAM7OMcgIwM8uoVOcAzMxs4PT6i/KFB8oShxOAmVmVmbZ82jFlg3FLEQ8BmZlllBOAmVlGeQjIMqW3sdfCO3mC7+Zpw5ePAMzMMsoJwMwso5wAzMwyygnAzCyjnADMzDLKCcDMLKOcAMzMMsq/A7Aho/Aafuj9On5fw2/WPR8BmJlllBOAmVlGOQGYmWWUE4CZWUb5JLCVjU/imlWXVEcAkmZK2iqpXVJzkXpJuj+pf0HSBb31lXSqpKclbUvexw7MJpmZWRq9HgFIGgE8AFwGdALrJa2OiBfzms0CpiavC4HFwIW99G0GnomIRUliaAbuHrhNs4HW62Ps8F/wZkNJmiGg6UB7RLwCIOlRYA6QnwDmAD+PiACel3SKpFqgroe+c4BLkv7LgWdxAhhU/gI3s3zKfWf30ED6KjAzIv5HMn8jcGFEzM9r82tgUUT8Ppl/htyXeV13fSXtj4hT8pbx14g4ZhhIUhPQlMyeBWztw3aOB17vQ79ycXz9V+0xVnt8UP0xVnt8UL0xfjIiJhQWpjkCUJGywqzRXZs0fXsUEUuBpaX0KSSpNSIa+7OMweT4+q/aY6z2+KD6Y6z2+GBoxJgvzUngTmBK3vxkYFfKNj313ZMME5G8700ftpmZ9VeaBLAemCqpXtLxwFxgdUGb1cA3kquBZgAHImJ3L31XA/OS6XnAE/3cFjMzK0GvQ0ARcUjSfOApYASwLCK2SLo1qV8CtACzgXbgbeCmnvomi14ErJR0M/AX4GsDumVH69cQUhk4vv6r9hirPT6o/hirPT4YGjEe0etJYDMzG558Kwgzs4xyAjAzy6hhkwD6c7uKMsU3RdJaSS9J2iLpjiJtLpF0QFJb8vp+mWPskLQpWXdrkfpK78Oz8vZNm6Q3Jd1Z0Kas+1DSMkl7JW3OK0t1m5PePrODHOP/kvRy8u+4StIp3fTt8TMxiPEtlLQz799xdjd9K7kPf5kXX4ektm76Dvo+7LOIGPIvcieYtwP/ABwP/Bk4p6DNbOA35H6bMAP4Y5ljrAUuSKbHAP9VJMZLgF9XcD92AON7qK/oPizyb/4auR+4VGwfAl8ALgA255X9CGhOppuBf+sm/h4/s4Mc4+XAx5LpfysWY5rPxCDGtxBYkOIzULF9WFD/E+D7ldqHfX0NlyOAI7eriIj3gcO3nMh35HYVEfE8cPh2FWUREbsjYmMyfRB4CZhUrvUPkIruwwKXAtsj4tUKrR+AiFgHvFFQPIfc7U1I3q8u0jXNZ3bQYoyINRFxKJl9ntxvdCqim32YRkX34WGSBFwLPDIY6x5MwyUBTAJ25M13cuyXa5o2ZSGpDjgf+GOR6s9J+rOk30j6VHkjI4A1kjYodwuOQlWzD8n9pqS7/3CV3IcAp0XudzAk7xOLtKmmfflP5I7siuntMzGY5idDVMu6GUarln14MbAnIrZ1U1/Jfdij4ZIA+nO7irKSNBp4HLgzIt4sqN5IbkjjPOB/A/+vzOFdFBEXkLu7622SvlBQXy378HjgKuA/ilRXeh+mVS378l+AQ8DD3TTp7TMxWBYDZwINwG5yQyyFqmIfAtfR81//ldqHvRouCaA/t6soG0k15L78H46IXxXWR8SbEfFWMt0C1EgaX674ImJX8r4XWEXuEDtfxfdhYhawMSL2FFZUeh8m0tzmpOL7UtI84EvA1yMZrC6U4jMxKCJiT0R8GBEfAQ91s95q2IcfA/4R+GV3bSq1D9MYLgmgP7erKItknPBnwEsRcW83bU5P2iFpOrl/n31liu8kSWMOT5M7Sbi5oFlF92Gebv/iquQ+zJPmNidpPrODRtJMcnfsvSoi3u6mTZrPxGDFl39u6Zpu1lvRfZj478DLEdFZrLKS+zCVSp+FHqgXuStU/ovcVQH/kpTdCtyaTIvcw2m2A5uAxjLH93lyh6cvAG3Ja3ZBjPOBLeSuZnge+G9ljO8fkvX+OYmh6vZhEsMocl/oJ+eVVWwfkktEu4EPyP1FejMwDngG2Ja8n5q0/QTQ0tNntowxtpMbPz/8WVxSGGN3n4kyxffvyWfsBXJf6rXVtg+T8v97+LOX17bs+7CvL98Kwswso4bLEJCZmZXICcDMLKOcAMzMMsoJwMwso5wAzMwyygnAzCyjnADMzDLq/wOmPtTTe37WFwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure()\n",
    "max_len = max([len(idx) for idx in shuffle_split.values()])\n",
    "arr = []\n",
    "for idx in shuffle_split.values():\n",
    "    this_arr = np.nan + np.zeros(max_len)\n",
    "    this_arr[:len(idx)] = np.array(num_atoms)[idx]\n",
    "    arr.append(this_arr)\n",
    "arr = np.stack(arr, axis=1)\n",
    "\n",
    "plt.hist(arr, stacked=False, bins=np.arange(0, np.nanmax(arr)), log=False, density=True)\n",
    "plt.legend(shuffle_split.keys())"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "96a7dde498302288fd342a0dd7e7c4e017c4083a8e5fac7570734c90ace78c7f"
  },
  "kernelspec": {
   "display_name": "Python 3.8.10 ('goli')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
