{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import os\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%config InlineBackend.figure_format='retina'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>protein</th>\n",
       "      <th>sequence</th>\n",
       "      <th>start_pos</th>\n",
       "      <th>epi_len</th>\n",
       "      <th>entropy</th>\n",
       "      <th>perc_mutated</th>\n",
       "      <th>glyco_probs</th>\n",
       "      <th>crosses_cleavage</th>\n",
       "      <th>sequence_length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>E</td>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>E</td>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>E</td>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>3</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>4</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164781</th>\n",
       "      <td>S2</td>\n",
       "      <td>KGCCSCGSCCKFDEDDSEPVLKGVK</td>\n",
       "      <td>1244</td>\n",
       "      <td>25</td>\n",
       "      <td>0.078112</td>\n",
       "      <td>0.007464</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164782</th>\n",
       "      <td>S2</td>\n",
       "      <td>GCCSCGSCCKFDEDDSEPVLKGVKL</td>\n",
       "      <td>1245</td>\n",
       "      <td>25</td>\n",
       "      <td>0.078112</td>\n",
       "      <td>0.007464</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164783</th>\n",
       "      <td>S2</td>\n",
       "      <td>CCSCGSCCKFDEDDSEPVLKGVKLH</td>\n",
       "      <td>1246</td>\n",
       "      <td>25</td>\n",
       "      <td>0.078112</td>\n",
       "      <td>0.007464</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164784</th>\n",
       "      <td>S2</td>\n",
       "      <td>CSCGSCCKFDEDDSEPVLKGVKLHY</td>\n",
       "      <td>1247</td>\n",
       "      <td>25</td>\n",
       "      <td>0.075207</td>\n",
       "      <td>0.007251</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164785</th>\n",
       "      <td>S2</td>\n",
       "      <td>SCGSCCKFDEDDSEPVLKGVKLHYT</td>\n",
       "      <td>1248</td>\n",
       "      <td>25</td>\n",
       "      <td>0.078112</td>\n",
       "      <td>0.007464</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>164786 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       protein                   sequence  start_pos  epi_len   entropy  \\\n",
       "0            E                   MYSFVSEE          0        8  0.002908   \n",
       "1            E                   YSFVSEET          1        8  0.008297   \n",
       "2            E                   SFVSEETG          2        8  0.008297   \n",
       "3            E                   FVSEETGT          3        8  0.008297   \n",
       "4            E                   VSEETGTL          4        8  0.008297   \n",
       "...        ...                        ...        ...      ...       ...   \n",
       "164781      S2  KGCCSCGSCCKFDEDDSEPVLKGVK       1244       25  0.078112   \n",
       "164782      S2  GCCSCGSCCKFDEDDSEPVLKGVKL       1245       25  0.078112   \n",
       "164783      S2  CCSCGSCCKFDEDDSEPVLKGVKLH       1246       25  0.078112   \n",
       "164784      S2  CSCGSCCKFDEDDSEPVLKGVKLHY       1247       25  0.075207   \n",
       "164785      S2  SCGSCCKFDEDDSEPVLKGVKLHYT       1248       25  0.078112   \n",
       "\n",
       "        perc_mutated  glyco_probs  crosses_cleavage  sequence_length  \n",
       "0           0.000213          0.0                 0                8  \n",
       "1           0.000640          0.0                 0                8  \n",
       "2           0.000640          0.0                 0                8  \n",
       "3           0.000640          0.0                 0                8  \n",
       "4           0.000640          0.0                 0                8  \n",
       "...              ...          ...               ...              ...  \n",
       "164781      0.007464          0.0                 0               25  \n",
       "164782      0.007464          0.0                 0               25  \n",
       "164783      0.007464          0.0                 0               25  \n",
       "164784      0.007251          0.0                 0               25  \n",
       "164785      0.007464          0.0                 0               25  \n",
       "\n",
       "[164786 rows x 9 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the per-epitope feature table, expose the peptide under the name\n",
    "# 'sequence', and record each peptide's length in 'sequence_length'.\n",
    "orig_data = (\n",
    "    pd.read_csv('../AllEpitopeFeatures.csv')\n",
    "    .rename(columns={'epitope': 'sequence'})\n",
    "    .assign(sequence_length=lambda features: features['sequence'].map(len))\n",
    ")\n",
    "orig_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "29406"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct peptide sequences among epitopes of length 8, 9 or 10.\n",
    "len(set(\n",
    "    orig_data.loc[orig_data['epi_len'].isin([8, 9, 10]), 'sequence']\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>allele</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HLA-B44:05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HLA-B44:07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HLA-A30:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HLA-B44:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>225</th>\n",
       "      <td>HLA-B55:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>226</th>\n",
       "      <td>HLA-B67:01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227</th>\n",
       "      <td>HLA-A24:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228</th>\n",
       "      <td>HLA-B15:32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229</th>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>230 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         allele\n",
       "0    HLA-B44:04\n",
       "1    HLA-B44:05\n",
       "2    HLA-B44:07\n",
       "3    HLA-A30:10\n",
       "4    HLA-B44:02\n",
       "..          ...\n",
       "225  HLA-B55:02\n",
       "226  HLA-B67:01\n",
       "227  HLA-A24:10\n",
       "228  HLA-B15:32\n",
       "229  HLA-B56:10\n",
       "\n",
       "[230 rows x 1 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load final set of HLA alleles.\n",
    "hla_alleles = pd.read_csv('MHC1_allele_mary_cleaned.txt', names=['allele'])\n",
    "hla_alleles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>protein</th>\n",
       "      <th>sequence</th>\n",
       "      <th>start_pos</th>\n",
       "      <th>epi_len</th>\n",
       "      <th>entropy</th>\n",
       "      <th>perc_mutated</th>\n",
       "      <th>glyco_probs</th>\n",
       "      <th>crosses_cleavage</th>\n",
       "      <th>sequence_length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>E</td>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>E</td>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>E</td>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>3</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>4</td>\n",
       "      <td>8</td>\n",
       "      <td>0.008297</td>\n",
       "      <td>0.000640</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147203</th>\n",
       "      <td>S2</td>\n",
       "      <td>DSEPVLKGVK</td>\n",
       "      <td>1259</td>\n",
       "      <td>10</td>\n",
       "      <td>0.056306</td>\n",
       "      <td>0.005758</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147204</th>\n",
       "      <td>S2</td>\n",
       "      <td>SEPVLKGVKL</td>\n",
       "      <td>1260</td>\n",
       "      <td>10</td>\n",
       "      <td>0.053400</td>\n",
       "      <td>0.005545</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147205</th>\n",
       "      <td>S2</td>\n",
       "      <td>EPVLKGVKLH</td>\n",
       "      <td>1261</td>\n",
       "      <td>10</td>\n",
       "      <td>0.053400</td>\n",
       "      <td>0.005545</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147206</th>\n",
       "      <td>S2</td>\n",
       "      <td>PVLKGVKLHY</td>\n",
       "      <td>1262</td>\n",
       "      <td>10</td>\n",
       "      <td>0.053400</td>\n",
       "      <td>0.005545</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147207</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29406 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       protein    sequence  start_pos  epi_len   entropy  perc_mutated  \\\n",
       "0            E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "1            E    YSFVSEET          1        8  0.008297      0.000640   \n",
       "2            E    SFVSEETG          2        8  0.008297      0.000640   \n",
       "3            E    FVSEETGT          3        8  0.008297      0.000640   \n",
       "4            E    VSEETGTL          4        8  0.008297      0.000640   \n",
       "...        ...         ...        ...      ...       ...           ...   \n",
       "147203      S2  DSEPVLKGVK       1259       10  0.056306      0.005758   \n",
       "147204      S2  SEPVLKGVKL       1260       10  0.053400      0.005545   \n",
       "147205      S2  EPVLKGVKLH       1261       10  0.053400      0.005545   \n",
       "147206      S2  PVLKGVKLHY       1262       10  0.053400      0.005545   \n",
       "147207      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "\n",
       "        glyco_probs  crosses_cleavage  sequence_length  \n",
       "0               0.0                 0                8  \n",
       "1               0.0                 0                8  \n",
       "2               0.0                 0                8  \n",
       "3               0.0                 0                8  \n",
       "4               0.0                 0                8  \n",
       "...             ...               ...              ...  \n",
       "147203          0.0                 0               10  \n",
       "147204          0.0                 0               10  \n",
       "147205          0.0                 0               10  \n",
       "147206          0.0                 0               10  \n",
       "147207          0.0                 0               10  \n",
       "\n",
       "[29406 rows x 9 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Filter MHC-1 sequences to epitopes with sequence length 8-10 (inclusive).\n",
    "mhc1_data = orig_data.loc[orig_data['sequence_length'].isin([8, 9, 10])]\n",
    "mhc1_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write peptides out (unpaired with MHC) for NetMHCpan.\n",
    "mhc1_data[['sequence']].to_csv('peptides_8-10.pep', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create commands for running NetMHCpan4.0 and NetMHCpan4.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Commands:  35\n"
     ]
    }
   ],
   "source": [
    "# Build one NetMHCpan4.0 argument line per allele that has no prediction file yet,\n",
    "# and write those lines to 'netmhc_args.txt'.\n",
    "cmd_template = '-BA -p peptides_8-10.pep -a {allele} -xls -xlsfile {allele_file}'\n",
    "\n",
    "# Every command writes its output into this directory, so make sure it exists\n",
    "# before the commands are run.\n",
    "os.makedirs('netmhc_preds', exist_ok=True)\n",
    "\n",
    "cmds = []\n",
    "for allele in hla_alleles['allele']:\n",
    "    # The ':' of the allele name is dropped to form the output file name.\n",
    "    allele_file = 'netmhc_preds/%s_preds.xls' % allele.replace(':', '')\n",
    "    # Skip alleles whose prediction file already exists.\n",
    "    if os.path.exists(allele_file):\n",
    "        continue\n",
    "    cmds.append(cmd_template.format(allele=allele, allele_file=allele_file))\n",
    "print('# Commands: ', len(cmds))\n",
    "\n",
    "# One argument line per pending allele.\n",
    "with open('netmhc_args.txt', 'w') as f:\n",
    "    f.writelines(cmd + '\\n' for cmd in cmds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Commands:  230\n"
     ]
    }
   ],
   "source": [
    "# For NetMHCpan4.1:\n",
    "\n",
    "# Build one NetMHCpan4.1 argument line per allele that has no prediction file yet,\n",
    "# and write those lines to 'netmhc-4.1_args.txt'.\n",
    "cmd_template = '-BA -p peptides_8-10.pep -a {allele} -xls -xlsfile {allele_file}'\n",
    "\n",
    "# Every command writes its output into this directory, so make sure it exists\n",
    "# before the commands are run.\n",
    "os.makedirs('netmhc-4.1_preds', exist_ok=True)\n",
    "\n",
    "cmds = []\n",
    "for allele in hla_alleles['allele']:\n",
    "    # The ':' of the allele name is dropped to form the output file name.\n",
    "    allele_file = 'netmhc-4.1_preds/%s_preds.xls' % allele.replace(':', '')\n",
    "    # Skip alleles whose prediction file already exists.\n",
    "    if os.path.exists(allele_file):\n",
    "        continue\n",
    "    cmds.append(cmd_template.format(allele=allele, allele_file=allele_file))\n",
    "print('# Commands: ', len(cmds))\n",
    "\n",
    "# One argument line per pending allele.\n",
    "with open('netmhc-4.1_args.txt', 'w') as f:\n",
    "    f.writelines(cmd + '\\n' for cmd in cmds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Commands to run (the first for NetMHCpan4.0, the second for NetMHCpan4.1):\n",
    "```\n",
    "cat netmhc_args.txt | xargs -P 40 -d '\\n' -n 1 ./netMHCpan-4.0/netMHCpan\n",
    "\n",
    "cat netmhc-4.1_args.txt | xargs -P 35 -d '\\n' -n 1 ./netMHCpan-4.1/netMHCpan\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load NetMHCpan4.0 predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Peptide</th>\n",
       "      <th>1-log50k</th>\n",
       "      <th>nM</th>\n",
       "      <th>genotype</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>loci</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0.0187</td>\n",
       "      <td>40836.4258</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>0.0136</td>\n",
       "      <td>43144.7188</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>0.0114</td>\n",
       "      <td>44191.7070</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>0.0075</td>\n",
       "      <td>46105.8516</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>0.0146</td>\n",
       "      <td>42674.8945</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29398</th>\n",
       "      <td>DSEPVLKGVK</td>\n",
       "      <td>0.0252</td>\n",
       "      <td>38063.9297</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29399</th>\n",
       "      <td>SEPVLKGVKL</td>\n",
       "      <td>0.3105</td>\n",
       "      <td>1738.0863</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29400</th>\n",
       "      <td>EPVLKGVKLH</td>\n",
       "      <td>0.1051</td>\n",
       "      <td>16034.5420</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29401</th>\n",
       "      <td>PVLKGVKLHY</td>\n",
       "      <td>0.0223</td>\n",
       "      <td>39286.5586</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29402</th>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>0.0272</td>\n",
       "      <td>37267.6289</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Peptide  1-log50k          nM    genotype  sequence_length   loci\n",
       "0        MYSFVSEE    0.0187  40836.4258  HLA-B44:04                8  HLA-B\n",
       "1        YSFVSEET    0.0136  43144.7188  HLA-B44:04                8  HLA-B\n",
       "2        SFVSEETG    0.0114  44191.7070  HLA-B44:04                8  HLA-B\n",
       "3        FVSEETGT    0.0075  46105.8516  HLA-B44:04                8  HLA-B\n",
       "4        VSEETGTL    0.0146  42674.8945  HLA-B44:04                8  HLA-B\n",
       "...           ...       ...         ...         ...              ...    ...\n",
       "29398  DSEPVLKGVK    0.0252  38063.9297  HLA-B56:10               10  HLA-B\n",
       "29399  SEPVLKGVKL    0.3105   1738.0863  HLA-B56:10               10  HLA-B\n",
       "29400  EPVLKGVKLH    0.1051  16034.5420  HLA-B56:10               10  HLA-B\n",
       "29401  PVLKGVKLHY    0.0223  39286.5586  HLA-B56:10               10  HLA-B\n",
       "29402  VLKGVKLHYT    0.0272  37267.6289  HLA-B56:10               10  HLA-B\n",
       "\n",
       "[6762690 rows x 6 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the NetMHCpan prediction file of every allele and stack them into one\n",
    "# long table with a 'genotype' column naming the allele of each row.\n",
    "unused_columns = ['Pos', 'ID', 'core', 'icore', 'Rank', 'Ave', 'NB']\n",
    "\n",
    "dfs = []\n",
    "for allele in hla_alleles['allele']:\n",
    "    df = pd.read_csv(\n",
    "        './netmhc_preds/%s_preds.xls' % allele.replace(':', ''),\n",
    "        sep='\\t',\n",
    "        # The first line of the file is skipped; the column names are read\n",
    "        # from the line after it.\n",
    "        skiprows=[0],\n",
    "    )\n",
    "    dfs.append(df.drop(columns=unused_columns).assign(genotype=allele))\n",
    "\n",
    "# ignore_index=True: each per-allele frame numbers its rows from 0, so a plain\n",
    "# concat would leave the same index labels repeated once per allele.\n",
    "netmhc1_data = pd.concat(dfs, ignore_index=True)\n",
    "netmhc1_data['sequence_length'] = netmhc1_data['Peptide'].str.len()\n",
    "# Locus = first five characters of the allele name, e.g. 'HLA-B' for 'HLA-B44:04'.\n",
    "netmhc1_data['loci'] = netmhc1_data['genotype'].str[:5]\n",
    "netmhc1_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def average_allele_group(predictions, genotype, locus):\n",
    "    \"\"\"Average the predictions of all alleles belonging to one allele group.\n",
    "\n",
    "    Rows of `predictions` whose 'genotype' starts with `genotype` (for example\n",
    "    'HLA-A74') are grouped by 'Peptide' and their numeric columns are averaged.\n",
    "\n",
    "    Returns a frame with one row per peptide, whose 'genotype' column is set to\n",
    "    `genotype` and whose 'loci' column is set to `locus`.\n",
    "    \"\"\"\n",
    "    in_group = predictions['genotype'].str.startswith(genotype)\n",
    "    averaged = (\n",
    "        predictions.loc[in_group]\n",
    "        # Select the numeric columns explicitly: taking the mean over the string\n",
    "        # columns ('genotype', 'loci') raises a TypeError on pandas >= 2.0.\n",
    "        .groupby('Peptide')[['1-log50k', 'nM', 'sequence_length']]\n",
    "        .mean()\n",
    "        .reset_index()\n",
    "    )\n",
    "    # NOTE(review): '1-log50k' and 'nM' are averaged independently, so the mean nM\n",
    "    # is not the nM implied by the mean score -- confirm this is intended.\n",
    "    averaged['loci'] = locus\n",
    "    averaged['genotype'] = genotype\n",
    "    return averaged\n",
    "\n",
    "\n",
    "a74 = average_allele_group(netmhc1_data, 'HLA-A74', 'HLA-A')\n",
    "c17 = average_allele_group(netmhc1_data, 'HLA-C17', 'HLA-C')\n",
    "c18 = average_allele_group(netmhc1_data, 'HLA-C18', 'HLA-C')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Peptide</th>\n",
       "      <th>1-log50k</th>\n",
       "      <th>nM</th>\n",
       "      <th>genotype</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>loci</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0.018700</td>\n",
       "      <td>40836.425800</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>0.013600</td>\n",
       "      <td>43144.718800</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>0.011400</td>\n",
       "      <td>44191.707000</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>0.007500</td>\n",
       "      <td>46105.851600</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>0.014600</td>\n",
       "      <td>42674.894500</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29398</th>\n",
       "      <td>YYVGYLQPR</td>\n",
       "      <td>0.049400</td>\n",
       "      <td>29468.162767</td>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>9</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29399</th>\n",
       "      <td>YYVGYLQPRT</td>\n",
       "      <td>0.039567</td>\n",
       "      <td>32691.988933</td>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29400</th>\n",
       "      <td>YYVWKSYV</td>\n",
       "      <td>0.089833</td>\n",
       "      <td>18992.130200</td>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29401</th>\n",
       "      <td>YYVWKSYVH</td>\n",
       "      <td>0.045900</td>\n",
       "      <td>30434.857433</td>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>9</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29402</th>\n",
       "      <td>YYVWKSYVHV</td>\n",
       "      <td>0.096067</td>\n",
       "      <td>17887.859067</td>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6850899 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Peptide  1-log50k            nM    genotype  sequence_length   loci\n",
       "0        MYSFVSEE  0.018700  40836.425800  HLA-B44:04                8  HLA-B\n",
       "1        YSFVSEET  0.013600  43144.718800  HLA-B44:04                8  HLA-B\n",
       "2        SFVSEETG  0.011400  44191.707000  HLA-B44:04                8  HLA-B\n",
       "3        FVSEETGT  0.007500  46105.851600  HLA-B44:04                8  HLA-B\n",
       "4        VSEETGTL  0.014600  42674.894500  HLA-B44:04                8  HLA-B\n",
       "...           ...       ...           ...         ...              ...    ...\n",
       "29398   YYVGYLQPR  0.049400  29468.162767     HLA-C18                9  HLA-C\n",
       "29399  YYVGYLQPRT  0.039567  32691.988933     HLA-C18               10  HLA-C\n",
       "29400    YYVWKSYV  0.089833  18992.130200     HLA-C18                8  HLA-C\n",
       "29401   YYVWKSYVH  0.045900  30434.857433     HLA-C18                9  HLA-C\n",
       "29402  YYVWKSYVHV  0.096067  17887.859067     HLA-C18               10  HLA-C\n",
       "\n",
       "[6850899 rows x 6 columns]"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the long-format table obtained by stacking netmhc1_data with the\n",
    "# a74, c17 and c18 frames; sort=False leaves the column order as it is.\n",
    "pd.concat(objs=[netmhc1_data, a74, c17, c18], sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.1054</td>\n",
       "      <td>0.1506</td>\n",
       "      <td>0.0860</td>\n",
       "      <td>0.1054</td>\n",
       "      <td>0.1103</td>\n",
       "      <td>0.0291</td>\n",
       "      <td>0.0483</td>\n",
       "      <td>0.0614</td>\n",
       "      <td>0.0209</td>\n",
       "      <td>0.0601</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0291</td>\n",
       "      <td>0.0291</td>\n",
       "      <td>0.0291</td>\n",
       "      <td>0.0291</td>\n",
       "      <td>0.0291</td>\n",
       "      <td>0.0240</td>\n",
       "      <td>0.015933</td>\n",
       "      <td>0.0166</td>\n",
       "      <td>0.0166</td>\n",
       "      <td>0.0146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.0739</td>\n",
       "      <td>0.0925</td>\n",
       "      <td>0.0669</td>\n",
       "      <td>0.0739</td>\n",
       "      <td>0.0789</td>\n",
       "      <td>0.2276</td>\n",
       "      <td>0.4786</td>\n",
       "      <td>0.4725</td>\n",
       "      <td>0.1623</td>\n",
       "      <td>0.5121</td>\n",
       "      <td>...</td>\n",
       "      <td>0.3401</td>\n",
       "      <td>0.3401</td>\n",
       "      <td>0.3401</td>\n",
       "      <td>0.3401</td>\n",
       "      <td>0.3401</td>\n",
       "      <td>0.2976</td>\n",
       "      <td>0.079467</td>\n",
       "      <td>0.0681</td>\n",
       "      <td>0.0681</td>\n",
       "      <td>0.1022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.0547</td>\n",
       "      <td>0.0909</td>\n",
       "      <td>0.0431</td>\n",
       "      <td>0.0547</td>\n",
       "      <td>0.0508</td>\n",
       "      <td>0.0655</td>\n",
       "      <td>0.1775</td>\n",
       "      <td>0.1745</td>\n",
       "      <td>0.0445</td>\n",
       "      <td>0.1913</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0507</td>\n",
       "      <td>0.0507</td>\n",
       "      <td>0.0507</td>\n",
       "      <td>0.0507</td>\n",
       "      <td>0.0507</td>\n",
       "      <td>0.0429</td>\n",
       "      <td>0.020367</td>\n",
       "      <td>0.0187</td>\n",
       "      <td>0.0187</td>\n",
       "      <td>0.0237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.0345</td>\n",
       "      <td>0.0760</td>\n",
       "      <td>0.0265</td>\n",
       "      <td>0.0345</td>\n",
       "      <td>0.0342</td>\n",
       "      <td>0.0286</td>\n",
       "      <td>0.0299</td>\n",
       "      <td>0.0342</td>\n",
       "      <td>0.0229</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0168</td>\n",
       "      <td>0.0168</td>\n",
       "      <td>0.0168</td>\n",
       "      <td>0.0168</td>\n",
       "      <td>0.0168</td>\n",
       "      <td>0.0127</td>\n",
       "      <td>0.016200</td>\n",
       "      <td>0.0166</td>\n",
       "      <td>0.0166</td>\n",
       "      <td>0.0154</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.0460</td>\n",
       "      <td>0.0753</td>\n",
       "      <td>0.0377</td>\n",
       "      <td>0.0460</td>\n",
       "      <td>0.0474</td>\n",
       "      <td>0.1655</td>\n",
       "      <td>0.2493</td>\n",
       "      <td>0.3349</td>\n",
       "      <td>0.0895</td>\n",
       "      <td>0.3377</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0675</td>\n",
       "      <td>0.0675</td>\n",
       "      <td>0.0675</td>\n",
       "      <td>0.0675</td>\n",
       "      <td>0.0675</td>\n",
       "      <td>0.0423</td>\n",
       "      <td>0.029500</td>\n",
       "      <td>0.0276</td>\n",
       "      <td>0.0276</td>\n",
       "      <td>0.0333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.0289</td>\n",
       "      <td>0.0812</td>\n",
       "      <td>0.0259</td>\n",
       "      <td>0.0289</td>\n",
       "      <td>0.0367</td>\n",
       "      <td>0.0993</td>\n",
       "      <td>0.1255</td>\n",
       "      <td>0.1105</td>\n",
       "      <td>0.0972</td>\n",
       "      <td>0.1698</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0293</td>\n",
       "      <td>0.0293</td>\n",
       "      <td>0.0293</td>\n",
       "      <td>0.0293</td>\n",
       "      <td>0.0293</td>\n",
       "      <td>0.0237</td>\n",
       "      <td>0.049400</td>\n",
       "      <td>0.0424</td>\n",
       "      <td>0.0424</td>\n",
       "      <td>0.0634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.0303</td>\n",
       "      <td>0.0544</td>\n",
       "      <td>0.0252</td>\n",
       "      <td>0.0303</td>\n",
       "      <td>0.0325</td>\n",
       "      <td>0.1111</td>\n",
       "      <td>0.1629</td>\n",
       "      <td>0.1631</td>\n",
       "      <td>0.0923</td>\n",
       "      <td>0.2180</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0270</td>\n",
       "      <td>0.0270</td>\n",
       "      <td>0.0270</td>\n",
       "      <td>0.0270</td>\n",
       "      <td>0.0270</td>\n",
       "      <td>0.0197</td>\n",
       "      <td>0.039567</td>\n",
       "      <td>0.0343</td>\n",
       "      <td>0.0343</td>\n",
       "      <td>0.0501</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.0345</td>\n",
       "      <td>0.0552</td>\n",
       "      <td>0.0313</td>\n",
       "      <td>0.0345</td>\n",
       "      <td>0.0367</td>\n",
       "      <td>0.0909</td>\n",
       "      <td>0.1175</td>\n",
       "      <td>0.1459</td>\n",
       "      <td>0.0811</td>\n",
       "      <td>0.1130</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0513</td>\n",
       "      <td>0.0513</td>\n",
       "      <td>0.0513</td>\n",
       "      <td>0.0513</td>\n",
       "      <td>0.0513</td>\n",
       "      <td>0.0456</td>\n",
       "      <td>0.089833</td>\n",
       "      <td>0.0838</td>\n",
       "      <td>0.0838</td>\n",
       "      <td>0.1019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.0480</td>\n",
       "      <td>0.1032</td>\n",
       "      <td>0.0413</td>\n",
       "      <td>0.0480</td>\n",
       "      <td>0.0535</td>\n",
       "      <td>0.0274</td>\n",
       "      <td>0.0373</td>\n",
       "      <td>0.0272</td>\n",
       "      <td>0.0254</td>\n",
       "      <td>0.0467</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0185</td>\n",
       "      <td>0.045900</td>\n",
       "      <td>0.0467</td>\n",
       "      <td>0.0467</td>\n",
       "      <td>0.0443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.0556</td>\n",
       "      <td>0.0904</td>\n",
       "      <td>0.0514</td>\n",
       "      <td>0.0556</td>\n",
       "      <td>0.0656</td>\n",
       "      <td>0.5261</td>\n",
       "      <td>0.4508</td>\n",
       "      <td>0.4825</td>\n",
       "      <td>0.3988</td>\n",
       "      <td>0.4086</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1086</td>\n",
       "      <td>0.1086</td>\n",
       "      <td>0.1086</td>\n",
       "      <td>0.1086</td>\n",
       "      <td>0.1086</td>\n",
       "      <td>0.0854</td>\n",
       "      <td>0.096067</td>\n",
       "      <td>0.0859</td>\n",
       "      <td>0.0859</td>\n",
       "      <td>0.1164</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY       0.1054     0.1506     0.0860     0.1054     0.1103     0.0291   \n",
       "AAAYYVGYL      0.0739     0.0925     0.0669     0.0739     0.0789     0.2276   \n",
       "AAAYYVGYLQ     0.0547     0.0909     0.0431     0.0547     0.0508     0.0655   \n",
       "AACCHLAK       0.0345     0.0760     0.0265     0.0345     0.0342     0.0286   \n",
       "AACCHLAKA      0.0460     0.0753     0.0377     0.0460     0.0474     0.1655   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR      0.0289     0.0812     0.0259     0.0289     0.0367     0.0993   \n",
       "YYVGYLQPRT     0.0303     0.0544     0.0252     0.0303     0.0325     0.1111   \n",
       "YYVWKSYV       0.0345     0.0552     0.0313     0.0345     0.0367     0.0909   \n",
       "YYVWKSYVH      0.0480     0.1032     0.0413     0.0480     0.0535     0.0274   \n",
       "YYVWKSYVHV     0.0556     0.0904     0.0514     0.0556     0.0656     0.5261   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY       0.0483     0.0614     0.0209     0.0601  ...     0.0291   \n",
       "AAAYYVGYL      0.4786     0.4725     0.1623     0.5121  ...     0.3401   \n",
       "AAAYYVGYLQ     0.1775     0.1745     0.0445     0.1913  ...     0.0507   \n",
       "AACCHLAK       0.0299     0.0342     0.0229     0.0394  ...     0.0168   \n",
       "AACCHLAKA      0.2493     0.3349     0.0895     0.3377  ...     0.0675   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR      0.1255     0.1105     0.0972     0.1698  ...     0.0293   \n",
       "YYVGYLQPRT     0.1629     0.1631     0.0923     0.2180  ...     0.0270   \n",
       "YYVWKSYV       0.1175     0.1459     0.0811     0.1130  ...     0.0513   \n",
       "YYVWKSYVH      0.0373     0.0272     0.0254     0.0467  ...     0.0180   \n",
       "YYVWKSYVHV     0.4508     0.4825     0.3988     0.4086  ...     0.1086   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY       0.0291     0.0291     0.0291     0.0291     0.0240  0.015933   \n",
       "AAAYYVGYL      0.3401     0.3401     0.3401     0.3401     0.2976  0.079467   \n",
       "AAAYYVGYLQ     0.0507     0.0507     0.0507     0.0507     0.0429  0.020367   \n",
       "AACCHLAK       0.0168     0.0168     0.0168     0.0168     0.0127  0.016200   \n",
       "AACCHLAKA      0.0675     0.0675     0.0675     0.0675     0.0423  0.029500   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR      0.0293     0.0293     0.0293     0.0293     0.0237  0.049400   \n",
       "YYVGYLQPRT     0.0270     0.0270     0.0270     0.0270     0.0197  0.039567   \n",
       "YYVWKSYV       0.0513     0.0513     0.0513     0.0513     0.0456  0.089833   \n",
       "YYVWKSYVH      0.0180     0.0180     0.0180     0.0180     0.0185  0.045900   \n",
       "YYVWKSYVHV     0.1086     0.1086     0.1086     0.1086     0.0854  0.096067   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY       0.0166     0.0166     0.0146  \n",
       "AAAYYVGYL      0.0681     0.0681     0.1022  \n",
       "AAAYYVGYLQ     0.0187     0.0187     0.0237  \n",
       "AACCHLAK       0.0166     0.0166     0.0154  \n",
       "AACCHLAKA      0.0276     0.0276     0.0333  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR      0.0424     0.0424     0.0634  \n",
       "YYVGYLQPRT     0.0343     0.0343     0.0501  \n",
       "YYVWKSYV       0.0838     0.0838     0.1019  \n",
       "YYVWKSYVH      0.0467     0.0467     0.0443  \n",
       "YYVWKSYVHV     0.0859     0.0859     0.1164  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reshape the stacked long-format frames to wide form: one row per peptide and\n",
    "# one column per (loci, genotype) pair holding the '1-log50k' score.\n",
    "# pivot_table applies its default aggregation (mean) to duplicate entries.\n",
    "data_pivot = pd.pivot_table(\n",
    "    pd.concat([netmhc1_data, a74, c17, c18], sort=False),\n",
    "    values='1-log50k',\n",
    "    index='Peptide',\n",
    "    columns=['loci', 'genotype'],\n",
    ")\n",
    "# Cache the wide table on disk (pickle protocol 2; compression follows the .gz suffix).\n",
    "data_pivot.to_pickle('mhc1_haplotype_netmhc_pred_affinity_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load NetMHCpan4.1 Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Peptide</th>\n",
       "      <th>EL-score</th>\n",
       "      <th>EL_Rank</th>\n",
       "      <th>BA-score</th>\n",
       "      <th>BA_Rank</th>\n",
       "      <th>genotype</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>loci</th>\n",
       "      <th>BA_nM</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>72.5000</td>\n",
       "      <td>0.0238</td>\n",
       "      <td>62.7906</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>38648.666877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>72.5000</td>\n",
       "      <td>0.0172</td>\n",
       "      <td>81.8807</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>41509.520847</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>72.5000</td>\n",
       "      <td>0.0158</td>\n",
       "      <td>85.7518</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>42143.080553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>70.0000</td>\n",
       "      <td>0.0100</td>\n",
       "      <td>95.7597</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>44872.503932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>31.8889</td>\n",
       "      <td>0.0174</td>\n",
       "      <td>81.5398</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>41419.793203</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29398</th>\n",
       "      <td>DSEPVLKGVK</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>54.4000</td>\n",
       "      <td>0.0164</td>\n",
       "      <td>71.7671</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>41870.379408</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29399</th>\n",
       "      <td>SEPVLKGVKL</td>\n",
       "      <td>0.4757</td>\n",
       "      <td>0.4963</td>\n",
       "      <td>0.3035</td>\n",
       "      <td>1.8215</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>1874.271960</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29400</th>\n",
       "      <td>EPVLKGVKLH</td>\n",
       "      <td>0.0212</td>\n",
       "      <td>5.8240</td>\n",
       "      <td>0.0892</td>\n",
       "      <td>14.2037</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>19046.793791</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29401</th>\n",
       "      <td>PVLKGVKLHY</td>\n",
       "      <td>0.0001</td>\n",
       "      <td>49.6667</td>\n",
       "      <td>0.0162</td>\n",
       "      <td>72.3585</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>41961.083157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29402</th>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>62.2727</td>\n",
       "      <td>0.0224</td>\n",
       "      <td>59.4739</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B</td>\n",
       "      <td>39238.561377</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Peptide  EL-score  EL_Rank  BA-score  BA_Rank    genotype  \\\n",
       "0        MYSFVSEE    0.0000  72.5000    0.0238  62.7906  HLA-B44:04   \n",
       "1        YSFVSEET    0.0000  72.5000    0.0172  81.8807  HLA-B44:04   \n",
       "2        SFVSEETG    0.0000  72.5000    0.0158  85.7518  HLA-B44:04   \n",
       "3        FVSEETGT    0.0000  70.0000    0.0100  95.7597  HLA-B44:04   \n",
       "4        VSEETGTL    0.0001  31.8889    0.0174  81.5398  HLA-B44:04   \n",
       "...           ...       ...      ...       ...      ...         ...   \n",
       "29398  DSEPVLKGVK    0.0001  54.4000    0.0164  71.7671  HLA-B56:10   \n",
       "29399  SEPVLKGVKL    0.4757   0.4963    0.3035   1.8215  HLA-B56:10   \n",
       "29400  EPVLKGVKLH    0.0212   5.8240    0.0892  14.2037  HLA-B56:10   \n",
       "29401  PVLKGVKLHY    0.0001  49.6667    0.0162  72.3585  HLA-B56:10   \n",
       "29402  VLKGVKLHYT    0.0000  62.2727    0.0224  59.4739  HLA-B56:10   \n",
       "\n",
       "       sequence_length   loci         BA_nM  \n",
       "0                    8  HLA-B  38648.666877  \n",
       "1                    8  HLA-B  41509.520847  \n",
       "2                    8  HLA-B  42143.080553  \n",
       "3                    8  HLA-B  44872.503932  \n",
       "4                    8  HLA-B  41419.793203  \n",
       "...                ...    ...           ...  \n",
       "29398               10  HLA-B  41870.379408  \n",
       "29399               10  HLA-B   1874.271960  \n",
       "29400               10  HLA-B  19046.793791  \n",
       "29401               10  HLA-B  41961.083157  \n",
       "29402               10  HLA-B  39238.561377  \n",
       "\n",
       "[6762690 rows x 9 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read one tab-separated prediction table per allele listed in hla_alleles,\n",
    "# tag every row with its allele name and drop the columns that are not used.\n",
    "# File names carry the allele name with the ':' removed.\n",
    "dfs = [\n",
    "    pd.read_csv(\n",
    "        './netmhc-4.1_preds/%s_preds.xls' % allele.replace(':', ''),\n",
    "        delimiter='\\t',\n",
    "        skiprows=[0],  # the first line of each file is skipped\n",
    "    )\n",
    "    .assign(genotype=allele)\n",
    "    .drop(columns=['Pos', 'ID', 'core', 'icore', 'Ave', 'NB'])\n",
    "    for allele in hla_alleles['allele'].values\n",
    "]\n",
    "\n",
    "# Stack all alleles into one long frame and add the derived columns:\n",
    "#   sequence_length - number of characters in the peptide string\n",
    "#   loci            - first five characters of the allele name (e.g. 'HLA-B')\n",
    "#   BA_nM           - 50000 ** (1 - BA-score)\n",
    "netmhc41_data = pd.concat(dfs).assign(\n",
    "    sequence_length=lambda preds: preds['Peptide'].str.len(),\n",
    "    loci=lambda preds: preds['genotype'].str[:5],\n",
    "    BA_nM=lambda preds: 50000 ** (1 - preds['BA-score']),\n",
    ")\n",
    "netmhc41_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _mean_over_allele_family(preds, family, locus):\n",
    "    \"\"\"Average the numeric prediction columns over all alleles of one family.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    preds : DataFrame with at least 'Peptide' and 'genotype' columns.\n",
    "    family : substring identifying the alleles to pool, e.g. 'HLA-C17'.\n",
    "    locus : value written to the 'loci' column of the result, e.g. 'HLA-C'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame with one row per peptide: the per-peptide mean of every numeric\n",
    "    column, plus 'loci' set to `locus` and 'genotype' set to `family`.\n",
    "    \"\"\"\n",
    "    in_family = preds['genotype'].str.contains(family, regex=False)\n",
    "    # numeric_only=True: 'genotype' and 'loci' hold strings, and pandas >= 2.0\n",
    "    # raises a TypeError on them instead of silently leaving them out of the mean.\n",
    "    pooled = (\n",
    "        preds.loc[in_family]\n",
    "        .groupby('Peptide')\n",
    "        .mean(numeric_only=True)\n",
    "        .reset_index()\n",
    "    )\n",
    "    pooled['loci'] = locus\n",
    "    pooled['genotype'] = family\n",
    "    return pooled\n",
    "\n",
    "\n",
    "# NOTE: the names a74 / c17 / c18 are also read by the netmhc1_data pivot cell\n",
    "# earlier in this notebook, so the two sections must not be run interleaved.\n",
    "a74 = _mean_over_allele_family(netmhc41_data, 'HLA-A74', 'HLA-A')\n",
    "c17 = _mean_over_allele_family(netmhc41_data, 'HLA-C17', 'HLA-C')\n",
    "c18 = _mean_over_allele_family(netmhc41_data, 'HLA-C18', 'HLA-C')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.0967</td>\n",
       "      <td>0.1644</td>\n",
       "      <td>0.0733</td>\n",
       "      <td>0.0967</td>\n",
       "      <td>0.0864</td>\n",
       "      <td>0.0278</td>\n",
       "      <td>0.0401</td>\n",
       "      <td>0.0417</td>\n",
       "      <td>0.0199</td>\n",
       "      <td>0.0495</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0303</td>\n",
       "      <td>0.020300</td>\n",
       "      <td>0.0217</td>\n",
       "      <td>0.0217</td>\n",
       "      <td>0.0175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.0879</td>\n",
       "      <td>0.1206</td>\n",
       "      <td>0.0863</td>\n",
       "      <td>0.0879</td>\n",
       "      <td>0.0815</td>\n",
       "      <td>0.2249</td>\n",
       "      <td>0.4798</td>\n",
       "      <td>0.4420</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.5578</td>\n",
       "      <td>...</td>\n",
       "      <td>0.5393</td>\n",
       "      <td>0.5393</td>\n",
       "      <td>0.5393</td>\n",
       "      <td>0.5393</td>\n",
       "      <td>0.5393</td>\n",
       "      <td>0.4357</td>\n",
       "      <td>0.110533</td>\n",
       "      <td>0.0931</td>\n",
       "      <td>0.0931</td>\n",
       "      <td>0.1454</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.0527</td>\n",
       "      <td>0.0837</td>\n",
       "      <td>0.0451</td>\n",
       "      <td>0.0527</td>\n",
       "      <td>0.0465</td>\n",
       "      <td>0.0738</td>\n",
       "      <td>0.1754</td>\n",
       "      <td>0.1566</td>\n",
       "      <td>0.0542</td>\n",
       "      <td>0.2006</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0808</td>\n",
       "      <td>0.0808</td>\n",
       "      <td>0.0808</td>\n",
       "      <td>0.0808</td>\n",
       "      <td>0.0808</td>\n",
       "      <td>0.0553</td>\n",
       "      <td>0.028433</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.0420</td>\n",
       "      <td>0.0789</td>\n",
       "      <td>0.0354</td>\n",
       "      <td>0.0420</td>\n",
       "      <td>0.0333</td>\n",
       "      <td>0.0287</td>\n",
       "      <td>0.0302</td>\n",
       "      <td>0.0286</td>\n",
       "      <td>0.0236</td>\n",
       "      <td>0.0363</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0126</td>\n",
       "      <td>0.015900</td>\n",
       "      <td>0.0184</td>\n",
       "      <td>0.0184</td>\n",
       "      <td>0.0109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.0535</td>\n",
       "      <td>0.0787</td>\n",
       "      <td>0.0447</td>\n",
       "      <td>0.0535</td>\n",
       "      <td>0.0498</td>\n",
       "      <td>0.1479</td>\n",
       "      <td>0.2237</td>\n",
       "      <td>0.3273</td>\n",
       "      <td>0.0916</td>\n",
       "      <td>0.3419</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0691</td>\n",
       "      <td>0.0691</td>\n",
       "      <td>0.0691</td>\n",
       "      <td>0.0691</td>\n",
       "      <td>0.0691</td>\n",
       "      <td>0.0460</td>\n",
       "      <td>0.038067</td>\n",
       "      <td>0.0366</td>\n",
       "      <td>0.0366</td>\n",
       "      <td>0.0410</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.0376</td>\n",
       "      <td>0.1285</td>\n",
       "      <td>0.0346</td>\n",
       "      <td>0.0376</td>\n",
       "      <td>0.0438</td>\n",
       "      <td>0.1105</td>\n",
       "      <td>0.1362</td>\n",
       "      <td>0.1120</td>\n",
       "      <td>0.1013</td>\n",
       "      <td>0.1849</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0390</td>\n",
       "      <td>0.0390</td>\n",
       "      <td>0.0390</td>\n",
       "      <td>0.0390</td>\n",
       "      <td>0.0390</td>\n",
       "      <td>0.0276</td>\n",
       "      <td>0.057833</td>\n",
       "      <td>0.0550</td>\n",
       "      <td>0.0550</td>\n",
       "      <td>0.0635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.0351</td>\n",
       "      <td>0.0759</td>\n",
       "      <td>0.0322</td>\n",
       "      <td>0.0351</td>\n",
       "      <td>0.0361</td>\n",
       "      <td>0.1011</td>\n",
       "      <td>0.1517</td>\n",
       "      <td>0.1558</td>\n",
       "      <td>0.0877</td>\n",
       "      <td>0.2004</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0266</td>\n",
       "      <td>0.0191</td>\n",
       "      <td>0.047067</td>\n",
       "      <td>0.0413</td>\n",
       "      <td>0.0413</td>\n",
       "      <td>0.0586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.0316</td>\n",
       "      <td>0.0701</td>\n",
       "      <td>0.0306</td>\n",
       "      <td>0.0316</td>\n",
       "      <td>0.0310</td>\n",
       "      <td>0.0748</td>\n",
       "      <td>0.1008</td>\n",
       "      <td>0.1146</td>\n",
       "      <td>0.0704</td>\n",
       "      <td>0.1010</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0753</td>\n",
       "      <td>0.0753</td>\n",
       "      <td>0.0753</td>\n",
       "      <td>0.0753</td>\n",
       "      <td>0.0753</td>\n",
       "      <td>0.0576</td>\n",
       "      <td>0.117933</td>\n",
       "      <td>0.1067</td>\n",
       "      <td>0.1067</td>\n",
       "      <td>0.1404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.0479</td>\n",
       "      <td>0.1382</td>\n",
       "      <td>0.0443</td>\n",
       "      <td>0.0479</td>\n",
       "      <td>0.0526</td>\n",
       "      <td>0.0411</td>\n",
       "      <td>0.0492</td>\n",
       "      <td>0.0324</td>\n",
       "      <td>0.0332</td>\n",
       "      <td>0.0569</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0415</td>\n",
       "      <td>0.0415</td>\n",
       "      <td>0.0415</td>\n",
       "      <td>0.0415</td>\n",
       "      <td>0.0415</td>\n",
       "      <td>0.0372</td>\n",
       "      <td>0.053200</td>\n",
       "      <td>0.0515</td>\n",
       "      <td>0.0515</td>\n",
       "      <td>0.0566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.0618</td>\n",
       "      <td>0.1085</td>\n",
       "      <td>0.0609</td>\n",
       "      <td>0.0618</td>\n",
       "      <td>0.0654</td>\n",
       "      <td>0.5143</td>\n",
       "      <td>0.4322</td>\n",
       "      <td>0.4541</td>\n",
       "      <td>0.3955</td>\n",
       "      <td>0.4248</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1365</td>\n",
       "      <td>0.1365</td>\n",
       "      <td>0.1365</td>\n",
       "      <td>0.1365</td>\n",
       "      <td>0.1365</td>\n",
       "      <td>0.1036</td>\n",
       "      <td>0.107933</td>\n",
       "      <td>0.0918</td>\n",
       "      <td>0.0918</td>\n",
       "      <td>0.1402</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY       0.0967     0.1644     0.0733     0.0967     0.0864     0.0278   \n",
       "AAAYYVGYL      0.0879     0.1206     0.0863     0.0879     0.0815     0.2249   \n",
       "AAAYYVGYLQ     0.0527     0.0837     0.0451     0.0527     0.0465     0.0738   \n",
       "AACCHLAK       0.0420     0.0789     0.0354     0.0420     0.0333     0.0287   \n",
       "AACCHLAKA      0.0535     0.0787     0.0447     0.0535     0.0498     0.1479   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR      0.0376     0.1285     0.0346     0.0376     0.0438     0.1105   \n",
       "YYVGYLQPRT     0.0351     0.0759     0.0322     0.0351     0.0361     0.1011   \n",
       "YYVWKSYV       0.0316     0.0701     0.0306     0.0316     0.0310     0.0748   \n",
       "YYVWKSYVH      0.0479     0.1382     0.0443     0.0479     0.0526     0.0411   \n",
       "YYVWKSYVHV     0.0618     0.1085     0.0609     0.0618     0.0654     0.5143   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY       0.0401     0.0417     0.0199     0.0495  ...     0.0394   \n",
       "AAAYYVGYL      0.4798     0.4420     0.2050     0.5578  ...     0.5393   \n",
       "AAAYYVGYLQ     0.1754     0.1566     0.0542     0.2006  ...     0.0808   \n",
       "AACCHLAK       0.0302     0.0286     0.0236     0.0363  ...     0.0167   \n",
       "AACCHLAKA      0.2237     0.3273     0.0916     0.3419  ...     0.0691   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR      0.1362     0.1120     0.1013     0.1849  ...     0.0390   \n",
       "YYVGYLQPRT     0.1517     0.1558     0.0877     0.2004  ...     0.0266   \n",
       "YYVWKSYV       0.1008     0.1146     0.0704     0.1010  ...     0.0753   \n",
       "YYVWKSYVH      0.0492     0.0324     0.0332     0.0569  ...     0.0415   \n",
       "YYVWKSYVHV     0.4322     0.4541     0.3955     0.4248  ...     0.1365   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY       0.0394     0.0394     0.0394     0.0394     0.0303  0.020300   \n",
       "AAAYYVGYL      0.5393     0.5393     0.5393     0.5393     0.4357  0.110533   \n",
       "AAAYYVGYLQ     0.0808     0.0808     0.0808     0.0808     0.0553  0.028433   \n",
       "AACCHLAK       0.0167     0.0167     0.0167     0.0167     0.0126  0.015900   \n",
       "AACCHLAKA      0.0691     0.0691     0.0691     0.0691     0.0460  0.038067   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR      0.0390     0.0390     0.0390     0.0390     0.0276  0.057833   \n",
       "YYVGYLQPRT     0.0266     0.0266     0.0266     0.0266     0.0191  0.047067   \n",
       "YYVWKSYV       0.0753     0.0753     0.0753     0.0753     0.0576  0.117933   \n",
       "YYVWKSYVH      0.0415     0.0415     0.0415     0.0415     0.0372  0.053200   \n",
       "YYVWKSYVHV     0.1365     0.1365     0.1365     0.1365     0.1036  0.107933   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY       0.0217     0.0217     0.0175  \n",
       "AAAYYVGYL      0.0931     0.0931     0.1454  \n",
       "AAAYYVGYLQ     0.0266     0.0266     0.0321  \n",
       "AACCHLAK       0.0184     0.0184     0.0109  \n",
       "AACCHLAKA      0.0366     0.0366     0.0410  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR      0.0550     0.0550     0.0635  \n",
       "YYVGYLQPRT     0.0413     0.0413     0.0586  \n",
       "YYVWKSYV       0.1067     0.1067     0.1404  \n",
       "YYVWKSYVH      0.0515     0.0515     0.0566  \n",
       "YYVWKSYVHV     0.0918     0.0918     0.1402  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the long-format prediction frames and reshape them into a\n",
    "# Peptide x (loci, genotype) matrix of BA-scores.\n",
    "data_pivot = (\n",
    "    pd.concat([netmhc41_data, a74, c17, c18], sort=False)\n",
    "    .pivot_table(\n",
    "        values='BA-score',\n",
    "        index='Peptide',\n",
    "        columns=['loci', 'genotype'],\n",
    "        aggfunc='mean',  # pandas default, spelled out: duplicate entries are averaged\n",
    "    )\n",
    ")\n",
    "# Persist the matrix; the .gz suffix lets pandas infer gzip compression.\n",
    "data_pivot.to_pickle('mhc1_haplotype_netmhc-4.1_pred_affinity_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create data for MHCflurry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>protein</th>\n",
       "      <th>sequence</th>\n",
       "      <th>start_pos</th>\n",
       "      <th>epi_len</th>\n",
       "      <th>entropy</th>\n",
       "      <th>perc_mutated</th>\n",
       "      <th>glyco_probs</th>\n",
       "      <th>crosses_cleavage</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>allele</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-A30:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763375</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B55:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763376</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B67:01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763377</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-A24:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763378</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B15:32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763379</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6763380 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        protein    sequence  start_pos  epi_len   entropy  perc_mutated  \\\n",
       "0             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "1             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "2             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "3             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "4             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "...         ...         ...        ...      ...       ...           ...   \n",
       "6763375      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763376      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763377      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763378      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763379      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "\n",
       "         glyco_probs  crosses_cleavage  sequence_length      allele  \n",
       "0                0.0                 0                8  HLA-B44:04  \n",
       "1                0.0                 0                8  HLA-B44:05  \n",
       "2                0.0                 0                8  HLA-B44:07  \n",
       "3                0.0                 0                8  HLA-A30:10  \n",
       "4                0.0                 0                8  HLA-B44:02  \n",
       "...              ...               ...              ...         ...  \n",
       "6763375          0.0                 0               10  HLA-B55:02  \n",
       "6763376          0.0                 0               10  HLA-B67:01  \n",
       "6763377          0.0                 0               10  HLA-A24:10  \n",
       "6763378          0.0                 0               10  HLA-B15:32  \n",
       "6763379          0.0                 0               10  HLA-B56:10  \n",
       "\n",
       "[6763380 rows x 10 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create dataframe with all MHC/peptide pairs (cross join on a constant helper key).\n",
    "# The join column is named explicitly so that any other column the two frames\n",
    "# might share cannot silently become part of the join condition.\n",
    "# `.assign` works on copies, so mhc1_data and hla_alleles are left untouched.\n",
    "pmhc_pairs = (\n",
    "    mhc1_data.assign(key=0)\n",
    "    .merge(hla_alleles.assign(key=0), on='key', how='outer')\n",
    "    .drop(columns=['key'])\n",
    ")\n",
    "# Sanity check: a cross join yields exactly one row per (peptide row, allele row).\n",
    "assert len(pmhc_pairs) == len(mhc1_data) * len(hla_alleles)\n",
    "pmhc_pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the allele/sequence pairs without a header row or index column.\n",
    "pmhc_pairs.loc[:, ['allele', 'sequence']].to_csv(\n",
    "    'mhc1_8-10_haplotype_pairs.csv',\n",
    "    header=False,\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same pairs, written with an `allele,peptide` header row.\n",
    "(\n",
    "    pmhc_pairs[['allele', 'sequence']]\n",
    "    .rename(columns={'sequence': 'peptide'})\n",
    "    .to_csv('mhc1_8-10_haplotype_pairs_withheader.csv', header=True, index=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
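    "Commands to run MHCflurry on the header-bearing pairs file (the two calls differ only in the output file name; the second was presumably run with MHCflurry 2.0 installed):\n",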
    "```\n",
    "mhcflurry-predict \\\n",
    "    --out mhc1_haplotype_preds_mhcflurry.csv \\\n",
    "    mhc1_8-10_haplotype_pairs_withheader.csv\n",
    "\n",
    "mhcflurry-predict \\\n",
    "    --out mhc1_haplotype_preds_mhcflurry2.0.csv \\\n",
    "    mhc1_8-10_haplotype_pairs_withheader.csv\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load MHCflurry Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_affinity(x):\n",
    "    x = np.clip(x, a_min=None, a_max=50000)\n",
    "    return 1 - np.log(x) / np.log(50000)\n",
    "\n",
    "# print(transform_affinity(500))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>allele</th>\n",
       "      <th>sequence</th>\n",
       "      <th>mhcflurry_affinity</th>\n",
       "      <th>mhcflurry_affinity_percentile</th>\n",
       "      <th>mhcflurry_processing_score</th>\n",
       "      <th>mhcflurry_presentation_score</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>transformed_aff</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>37829.751440</td>\n",
       "      <td>76.370250</td>\n",
       "      <td>0.213669</td>\n",
       "      <td>0.007210</td>\n",
       "      <td>8</td>\n",
       "      <td>0.025779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HLA-B44:05</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38198.075124</td>\n",
       "      <td>84.530375</td>\n",
       "      <td>0.213669</td>\n",
       "      <td>0.007148</td>\n",
       "      <td>8</td>\n",
       "      <td>0.024884</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HLA-B44:07</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38064.733103</td>\n",
       "      <td>73.786875</td>\n",
       "      <td>0.213669</td>\n",
       "      <td>0.007170</td>\n",
       "      <td>8</td>\n",
       "      <td>0.025207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HLA-A30:10</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>34529.712035</td>\n",
       "      <td>57.241500</td>\n",
       "      <td>0.213669</td>\n",
       "      <td>0.007814</td>\n",
       "      <td>8</td>\n",
       "      <td>0.034215</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HLA-B44:02</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38191.475075</td>\n",
       "      <td>83.348375</td>\n",
       "      <td>0.213669</td>\n",
       "      <td>0.007149</td>\n",
       "      <td>8</td>\n",
       "      <td>0.024900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762685</th>\n",
       "      <td>HLA-B55:02</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>29249.898280</td>\n",
       "      <td>38.170875</td>\n",
       "      <td>0.126139</td>\n",
       "      <td>0.006637</td>\n",
       "      <td>10</td>\n",
       "      <td>0.049552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762686</th>\n",
       "      <td>HLA-B67:01</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>34574.906172</td>\n",
       "      <td>41.093875</td>\n",
       "      <td>0.126139</td>\n",
       "      <td>0.005726</td>\n",
       "      <td>10</td>\n",
       "      <td>0.034094</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762687</th>\n",
       "      <td>HLA-A24:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>35345.127043</td>\n",
       "      <td>37.469125</td>\n",
       "      <td>0.126139</td>\n",
       "      <td>0.005615</td>\n",
       "      <td>10</td>\n",
       "      <td>0.032058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762688</th>\n",
       "      <td>HLA-B15:32</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>32874.896891</td>\n",
       "      <td>28.702875</td>\n",
       "      <td>0.126139</td>\n",
       "      <td>0.005986</td>\n",
       "      <td>10</td>\n",
       "      <td>0.038754</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762689</th>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>27169.830276</td>\n",
       "      <td>38.733625</td>\n",
       "      <td>0.126139</td>\n",
       "      <td>0.007084</td>\n",
       "      <td>10</td>\n",
       "      <td>0.056370</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             allele    sequence  mhcflurry_affinity  \\\n",
       "0        HLA-B44:04    MYSFVSEE        37829.751440   \n",
       "1        HLA-B44:05    MYSFVSEE        38198.075124   \n",
       "2        HLA-B44:07    MYSFVSEE        38064.733103   \n",
       "3        HLA-A30:10    MYSFVSEE        34529.712035   \n",
       "4        HLA-B44:02    MYSFVSEE        38191.475075   \n",
       "...             ...         ...                 ...   \n",
       "6762685  HLA-B55:02  VLKGVKLHYT        29249.898280   \n",
       "6762686  HLA-B67:01  VLKGVKLHYT        34574.906172   \n",
       "6762687  HLA-A24:10  VLKGVKLHYT        35345.127043   \n",
       "6762688  HLA-B15:32  VLKGVKLHYT        32874.896891   \n",
       "6762689  HLA-B56:10  VLKGVKLHYT        27169.830276   \n",
       "\n",
       "         mhcflurry_affinity_percentile  mhcflurry_processing_score  \\\n",
       "0                            76.370250                    0.213669   \n",
       "1                            84.530375                    0.213669   \n",
       "2                            73.786875                    0.213669   \n",
       "3                            57.241500                    0.213669   \n",
       "4                            83.348375                    0.213669   \n",
       "...                                ...                         ...   \n",
       "6762685                      38.170875                    0.126139   \n",
       "6762686                      41.093875                    0.126139   \n",
       "6762687                      37.469125                    0.126139   \n",
       "6762688                      28.702875                    0.126139   \n",
       "6762689                      38.733625                    0.126139   \n",
       "\n",
       "         mhcflurry_presentation_score  sequence_length  transformed_aff  \n",
       "0                            0.007210                8         0.025779  \n",
       "1                            0.007148                8         0.024884  \n",
       "2                            0.007170                8         0.025207  \n",
       "3                            0.007814                8         0.034215  \n",
       "4                            0.007149                8         0.024900  \n",
       "...                               ...              ...              ...  \n",
       "6762685                      0.006637               10         0.049552  \n",
       "6762686                      0.005726               10         0.034094  \n",
       "6762687                      0.005615               10         0.032058  \n",
       "6762688                      0.005986               10         0.038754  \n",
       "6762689                      0.007084               10         0.056370  \n",
       "\n",
       "[6762690 rows x 8 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the MHCflurry output and rename `peptide` to `sequence`.\n",
    "mhcflurry_preds = pd.read_csv('mhc1_haplotype_preds_mhcflurry.csv').rename(\n",
    "    columns={'peptide': 'sequence'})\n",
    "\n",
    "# Peptide length, computed with the vectorized string accessor.\n",
    "mhcflurry_preds['sequence_length'] = mhcflurry_preds['sequence'].str.len()\n",
    "\n",
    "# Optional (disabled): filter dataframe to alleles in hla_alleles.\n",
    "# mhcflurry_preds = mhcflurry_preds.merge(hla_alleles, on='allele')\n",
    "\n",
    "# Optional (disabled): add epitope protein data.\n",
    "# mhcflurry_preds = mhcflurry_preds.merge(mhc1_data[['sequence', 'Protein']], on='sequence')\n",
    "\n",
    "# Compute log-transformed binding affinity (the transform is logarithmic, not\n",
    "# logistic) in a single vectorized call instead of one Python call per row.\n",
    "mhcflurry_preds['transformed_aff'] = transform_affinity(mhcflurry_preds['mhcflurry_affinity'].values)\n",
    "\n",
    "mhcflurry_preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genotype</th>\n",
       "      <th>Peptide</th>\n",
       "      <th>mhcflurry_affinity</th>\n",
       "      <th>transformed_aff</th>\n",
       "      <th>loci</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>37829.751440</td>\n",
       "      <td>0.025779</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HLA-B44:05</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38198.075124</td>\n",
       "      <td>0.024884</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HLA-B44:07</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38064.733103</td>\n",
       "      <td>0.025207</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HLA-A30:10</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>34529.712035</td>\n",
       "      <td>0.034215</td>\n",
       "      <td>HLA-A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HLA-B44:02</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38191.475075</td>\n",
       "      <td>0.024900</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762685</th>\n",
       "      <td>HLA-B55:02</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>29249.898280</td>\n",
       "      <td>0.049552</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762686</th>\n",
       "      <td>HLA-B67:01</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>34574.906172</td>\n",
       "      <td>0.034094</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762687</th>\n",
       "      <td>HLA-A24:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>35345.127043</td>\n",
       "      <td>0.032058</td>\n",
       "      <td>HLA-A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762688</th>\n",
       "      <td>HLA-B15:32</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>32874.896891</td>\n",
       "      <td>0.038754</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762689</th>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>27169.830276</td>\n",
       "      <td>0.056370</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           genotype     Peptide  mhcflurry_affinity  transformed_aff   loci\n",
       "0        HLA-B44:04    MYSFVSEE        37829.751440         0.025779  HLA-B\n",
       "1        HLA-B44:05    MYSFVSEE        38198.075124         0.024884  HLA-B\n",
       "2        HLA-B44:07    MYSFVSEE        38064.733103         0.025207  HLA-B\n",
       "3        HLA-A30:10    MYSFVSEE        34529.712035         0.034215  HLA-A\n",
       "4        HLA-B44:02    MYSFVSEE        38191.475075         0.024900  HLA-B\n",
       "...             ...         ...                 ...              ...    ...\n",
       "6762685  HLA-B55:02  VLKGVKLHYT        29249.898280         0.049552  HLA-B\n",
       "6762686  HLA-B67:01  VLKGVKLHYT        34574.906172         0.034094  HLA-B\n",
       "6762687  HLA-A24:10  VLKGVKLHYT        35345.127043         0.032058  HLA-A\n",
       "6762688  HLA-B15:32  VLKGVKLHYT        32874.896891         0.038754  HLA-B\n",
       "6762689  HLA-B56:10  VLKGVKLHYT        27169.830276         0.056370  HLA-B\n",
       "\n",
       "[6762690 rows x 5 columns]"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = mhcflurry_preds.copy()\n",
    "df['loci'] = [x[:5] for x in df['allele'].values]\n",
    "df = df.drop(columns=['sequence_length', 'mhcflurry_affinity_percentile', 'mhcflurry_processing_score', 'mhcflurry_presentation_score'])\n",
    "df = df.rename(columns={'sequence': 'Peptide', 'allele': 'genotype'})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "a74 = df.loc[df['genotype'].str.contains('HLA-A74')].groupby('Peptide').agg('mean').reset_index() #['mean', 'count'])\n",
    "a74['loci'] = 'HLA-A'\n",
    "a74['genotype'] = 'HLA-A74'\n",
    "#a74\n",
    "\n",
    "c17 = df.loc[df['genotype'].str.contains('HLA-C17')].groupby('Peptide').agg('mean').reset_index()\n",
    "c17['loci'] = 'HLA-C'\n",
    "c17['genotype'] = 'HLA-C17'\n",
    "#c17\n",
    "\n",
    "c18 = df.loc[df['genotype'].str.contains('HLA-C18')].groupby('Peptide').agg('mean').reset_index()\n",
    "c18['loci'] = 'HLA-C'\n",
    "c18['genotype'] = 'HLA-C18'\n",
    "#c18"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genotype</th>\n",
       "      <th>Peptide</th>\n",
       "      <th>mhcflurry_affinity</th>\n",
       "      <th>transformed_aff</th>\n",
       "      <th>loci</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>37829.751440</td>\n",
       "      <td>0.025779</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HLA-B44:05</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38198.075124</td>\n",
       "      <td>0.024884</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HLA-B44:07</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38064.733103</td>\n",
       "      <td>0.025207</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HLA-A30:10</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>34529.712035</td>\n",
       "      <td>0.034215</td>\n",
       "      <td>HLA-A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HLA-B44:02</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38191.475075</td>\n",
       "      <td>0.024900</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29398</th>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>YYVGYLQPR</td>\n",
       "      <td>20470.598345</td>\n",
       "      <td>0.082537</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29399</th>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>YYVGYLQPRT</td>\n",
       "      <td>33421.974532</td>\n",
       "      <td>0.037235</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29400</th>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>YYVWKSYV</td>\n",
       "      <td>22235.680609</td>\n",
       "      <td>0.077440</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29401</th>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>YYVWKSYVH</td>\n",
       "      <td>30913.271511</td>\n",
       "      <td>0.044532</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29402</th>\n",
       "      <td>HLA-C18</td>\n",
       "      <td>YYVWKSYVHV</td>\n",
       "      <td>25843.478991</td>\n",
       "      <td>0.061220</td>\n",
       "      <td>HLA-C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6850899 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         genotype     Peptide  mhcflurry_affinity  transformed_aff   loci\n",
       "0      HLA-B44:04    MYSFVSEE        37829.751440         0.025779  HLA-B\n",
       "1      HLA-B44:05    MYSFVSEE        38198.075124         0.024884  HLA-B\n",
       "2      HLA-B44:07    MYSFVSEE        38064.733103         0.025207  HLA-B\n",
       "3      HLA-A30:10    MYSFVSEE        34529.712035         0.034215  HLA-A\n",
       "4      HLA-B44:02    MYSFVSEE        38191.475075         0.024900  HLA-B\n",
       "...           ...         ...                 ...              ...    ...\n",
       "29398     HLA-C18   YYVGYLQPR        20470.598345         0.082537  HLA-C\n",
       "29399     HLA-C18  YYVGYLQPRT        33421.974532         0.037235  HLA-C\n",
       "29400     HLA-C18    YYVWKSYV        22235.680609         0.077440  HLA-C\n",
       "29401     HLA-C18   YYVWKSYVH        30913.271511         0.044532  HLA-C\n",
       "29402     HLA-C18  YYVWKSYVHV        25843.478991         0.061220  HLA-C\n",
       "\n",
       "[6850899 rows x 5 columns]"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([df, a74, c17, c18], sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.094719</td>\n",
       "      <td>0.096633</td>\n",
       "      <td>0.065167</td>\n",
       "      <td>0.094719</td>\n",
       "      <td>0.096370</td>\n",
       "      <td>0.032908</td>\n",
       "      <td>0.033360</td>\n",
       "      <td>0.031451</td>\n",
       "      <td>0.038903</td>\n",
       "      <td>0.036177</td>\n",
       "      <td>...</td>\n",
       "      <td>0.058567</td>\n",
       "      <td>0.058567</td>\n",
       "      <td>0.058567</td>\n",
       "      <td>0.058567</td>\n",
       "      <td>0.050586</td>\n",
       "      <td>0.045115</td>\n",
       "      <td>0.032813</td>\n",
       "      <td>0.032292</td>\n",
       "      <td>0.032292</td>\n",
       "      <td>0.033854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.051921</td>\n",
       "      <td>0.079042</td>\n",
       "      <td>0.045578</td>\n",
       "      <td>0.051921</td>\n",
       "      <td>0.052842</td>\n",
       "      <td>0.249069</td>\n",
       "      <td>0.398928</td>\n",
       "      <td>0.321426</td>\n",
       "      <td>0.505718</td>\n",
       "      <td>0.547036</td>\n",
       "      <td>...</td>\n",
       "      <td>0.662885</td>\n",
       "      <td>0.662885</td>\n",
       "      <td>0.662885</td>\n",
       "      <td>0.662885</td>\n",
       "      <td>0.635438</td>\n",
       "      <td>0.532602</td>\n",
       "      <td>0.123593</td>\n",
       "      <td>0.099414</td>\n",
       "      <td>0.099414</td>\n",
       "      <td>0.171951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.034828</td>\n",
       "      <td>0.045169</td>\n",
       "      <td>0.032080</td>\n",
       "      <td>0.034828</td>\n",
       "      <td>0.034498</td>\n",
       "      <td>0.035324</td>\n",
       "      <td>0.036939</td>\n",
       "      <td>0.033438</td>\n",
       "      <td>0.038991</td>\n",
       "      <td>0.040745</td>\n",
       "      <td>...</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.034922</td>\n",
       "      <td>0.034588</td>\n",
       "      <td>0.032520</td>\n",
       "      <td>0.031588</td>\n",
       "      <td>0.031588</td>\n",
       "      <td>0.034383</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.037984</td>\n",
       "      <td>0.044418</td>\n",
       "      <td>0.033238</td>\n",
       "      <td>0.037984</td>\n",
       "      <td>0.038412</td>\n",
       "      <td>0.040850</td>\n",
       "      <td>0.039012</td>\n",
       "      <td>0.035140</td>\n",
       "      <td>0.048772</td>\n",
       "      <td>0.041690</td>\n",
       "      <td>...</td>\n",
       "      <td>0.086629</td>\n",
       "      <td>0.086629</td>\n",
       "      <td>0.086629</td>\n",
       "      <td>0.086629</td>\n",
       "      <td>0.072791</td>\n",
       "      <td>0.092736</td>\n",
       "      <td>0.045756</td>\n",
       "      <td>0.044588</td>\n",
       "      <td>0.044588</td>\n",
       "      <td>0.048091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.046057</td>\n",
       "      <td>0.063561</td>\n",
       "      <td>0.039850</td>\n",
       "      <td>0.046057</td>\n",
       "      <td>0.045247</td>\n",
       "      <td>0.405164</td>\n",
       "      <td>0.515754</td>\n",
       "      <td>0.503002</td>\n",
       "      <td>0.495733</td>\n",
       "      <td>0.639563</td>\n",
       "      <td>...</td>\n",
       "      <td>0.304727</td>\n",
       "      <td>0.304727</td>\n",
       "      <td>0.304727</td>\n",
       "      <td>0.304727</td>\n",
       "      <td>0.279128</td>\n",
       "      <td>0.230489</td>\n",
       "      <td>0.071056</td>\n",
       "      <td>0.059713</td>\n",
       "      <td>0.059713</td>\n",
       "      <td>0.093743</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.053433</td>\n",
       "      <td>0.101891</td>\n",
       "      <td>0.045845</td>\n",
       "      <td>0.053433</td>\n",
       "      <td>0.054487</td>\n",
       "      <td>0.087990</td>\n",
       "      <td>0.085834</td>\n",
       "      <td>0.075096</td>\n",
       "      <td>0.142467</td>\n",
       "      <td>0.085585</td>\n",
       "      <td>...</td>\n",
       "      <td>0.062016</td>\n",
       "      <td>0.062016</td>\n",
       "      <td>0.062016</td>\n",
       "      <td>0.062016</td>\n",
       "      <td>0.056933</td>\n",
       "      <td>0.067355</td>\n",
       "      <td>0.082537</td>\n",
       "      <td>0.082538</td>\n",
       "      <td>0.082538</td>\n",
       "      <td>0.082535</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.026303</td>\n",
       "      <td>0.034723</td>\n",
       "      <td>0.024996</td>\n",
       "      <td>0.026303</td>\n",
       "      <td>0.026396</td>\n",
       "      <td>0.036421</td>\n",
       "      <td>0.042080</td>\n",
       "      <td>0.035949</td>\n",
       "      <td>0.052389</td>\n",
       "      <td>0.043779</td>\n",
       "      <td>...</td>\n",
       "      <td>0.028586</td>\n",
       "      <td>0.028586</td>\n",
       "      <td>0.028586</td>\n",
       "      <td>0.028586</td>\n",
       "      <td>0.027950</td>\n",
       "      <td>0.028314</td>\n",
       "      <td>0.037235</td>\n",
       "      <td>0.037964</td>\n",
       "      <td>0.037964</td>\n",
       "      <td>0.035777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.028913</td>\n",
       "      <td>0.037161</td>\n",
       "      <td>0.026876</td>\n",
       "      <td>0.028913</td>\n",
       "      <td>0.028558</td>\n",
       "      <td>0.039297</td>\n",
       "      <td>0.038931</td>\n",
       "      <td>0.034029</td>\n",
       "      <td>0.050133</td>\n",
       "      <td>0.039407</td>\n",
       "      <td>...</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.037265</td>\n",
       "      <td>0.034266</td>\n",
       "      <td>0.077440</td>\n",
       "      <td>0.092437</td>\n",
       "      <td>0.092437</td>\n",
       "      <td>0.047445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.044654</td>\n",
       "      <td>0.075975</td>\n",
       "      <td>0.037013</td>\n",
       "      <td>0.044654</td>\n",
       "      <td>0.044827</td>\n",
       "      <td>0.036534</td>\n",
       "      <td>0.032578</td>\n",
       "      <td>0.030818</td>\n",
       "      <td>0.042809</td>\n",
       "      <td>0.032589</td>\n",
       "      <td>...</td>\n",
       "      <td>0.032421</td>\n",
       "      <td>0.032421</td>\n",
       "      <td>0.032421</td>\n",
       "      <td>0.032421</td>\n",
       "      <td>0.031183</td>\n",
       "      <td>0.031946</td>\n",
       "      <td>0.044532</td>\n",
       "      <td>0.047430</td>\n",
       "      <td>0.047430</td>\n",
       "      <td>0.038737</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.029647</td>\n",
       "      <td>0.043371</td>\n",
       "      <td>0.027804</td>\n",
       "      <td>0.029647</td>\n",
       "      <td>0.029193</td>\n",
       "      <td>0.073780</td>\n",
       "      <td>0.083830</td>\n",
       "      <td>0.058161</td>\n",
       "      <td>0.112460</td>\n",
       "      <td>0.093533</td>\n",
       "      <td>...</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.033998</td>\n",
       "      <td>0.033561</td>\n",
       "      <td>0.061220</td>\n",
       "      <td>0.065734</td>\n",
       "      <td>0.065734</td>\n",
       "      <td>0.052191</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY     0.094719   0.096633   0.065167   0.094719   0.096370   0.032908   \n",
       "AAAYYVGYL    0.051921   0.079042   0.045578   0.051921   0.052842   0.249069   \n",
       "AAAYYVGYLQ   0.034828   0.045169   0.032080   0.034828   0.034498   0.035324   \n",
       "AACCHLAK     0.037984   0.044418   0.033238   0.037984   0.038412   0.040850   \n",
       "AACCHLAKA    0.046057   0.063561   0.039850   0.046057   0.045247   0.405164   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR    0.053433   0.101891   0.045845   0.053433   0.054487   0.087990   \n",
       "YYVGYLQPRT   0.026303   0.034723   0.024996   0.026303   0.026396   0.036421   \n",
       "YYVWKSYV     0.028913   0.037161   0.026876   0.028913   0.028558   0.039297   \n",
       "YYVWKSYVH    0.044654   0.075975   0.037013   0.044654   0.044827   0.036534   \n",
       "YYVWKSYVHV   0.029647   0.043371   0.027804   0.029647   0.029193   0.073780   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY     0.033360   0.031451   0.038903   0.036177  ...   0.058567   \n",
       "AAAYYVGYL    0.398928   0.321426   0.505718   0.547036  ...   0.662885   \n",
       "AAAYYVGYLQ   0.036939   0.033438   0.038991   0.040745  ...   0.038188   \n",
       "AACCHLAK     0.039012   0.035140   0.048772   0.041690  ...   0.086629   \n",
       "AACCHLAKA    0.515754   0.503002   0.495733   0.639563  ...   0.304727   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR    0.085834   0.075096   0.142467   0.085585  ...   0.062016   \n",
       "YYVGYLQPRT   0.042080   0.035949   0.052389   0.043779  ...   0.028586   \n",
       "YYVWKSYV     0.038931   0.034029   0.050133   0.039407  ...   0.039761   \n",
       "YYVWKSYVH    0.032578   0.030818   0.042809   0.032589  ...   0.032421   \n",
       "YYVWKSYVHV   0.083830   0.058161   0.112460   0.093533  ...   0.036095   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY     0.058567   0.058567   0.058567   0.050586   0.045115  0.032813   \n",
       "AAAYYVGYL    0.662885   0.662885   0.662885   0.635438   0.532602  0.123593   \n",
       "AAAYYVGYLQ   0.038188   0.038188   0.038188   0.034922   0.034588  0.032520   \n",
       "AACCHLAK     0.086629   0.086629   0.086629   0.072791   0.092736  0.045756   \n",
       "AACCHLAKA    0.304727   0.304727   0.304727   0.279128   0.230489  0.071056   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR    0.062016   0.062016   0.062016   0.056933   0.067355  0.082537   \n",
       "YYVGYLQPRT   0.028586   0.028586   0.028586   0.027950   0.028314  0.037235   \n",
       "YYVWKSYV     0.039761   0.039761   0.039761   0.037265   0.034266  0.077440   \n",
       "YYVWKSYVH    0.032421   0.032421   0.032421   0.031183   0.031946  0.044532   \n",
       "YYVWKSYVHV   0.036095   0.036095   0.036095   0.033998   0.033561  0.061220   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY     0.032292   0.032292   0.033854  \n",
       "AAAYYVGYL    0.099414   0.099414   0.171951  \n",
       "AAAYYVGYLQ   0.031588   0.031588   0.034383  \n",
       "AACCHLAK     0.044588   0.044588   0.048091  \n",
       "AACCHLAKA    0.059713   0.059713   0.093743  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR    0.082538   0.082538   0.082535  \n",
       "YYVGYLQPRT   0.037964   0.037964   0.035777  \n",
       "YYVWKSYV     0.092437   0.092437   0.047445  \n",
       "YYVWKSYVH    0.047430   0.047430   0.038737  \n",
       "YYVWKSYVHV   0.065734   0.065734   0.052191  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(\n",
    "    index='Peptide',\n",
    "    columns=['loci', 'genotype'],\n",
    "    values='transformed_aff',\n",
    ")\n",
    "data_pivot.to_pickle('mhc1_haplotype_mhcflurry_pred_affinity_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = mhcflurry_preds.copy()\n",
    "df['loci'] = [x[:5] for x in df['allele'].values]\n",
    "df = df.drop(columns=['sequence_length', 'mhcflurry_affinity_percentile', 'mhcflurry_processing_score', 'mhcflurry_presentation_score'])\n",
    "df = df.rename(columns={'sequence': 'Peptide', 'allele': 'genotype'})\n",
    "\n",
    "# df2 = df.groupby(['Peptide', 'loci']).count().reset_index()[['Peptide', 'loci']]\n",
    "# df2['genotype'] = 'unknown'\n",
    "# df2['transformed_aff'] = 0.\n",
    "# df2['mhcflurry_affinity'] = 0.\n",
    "\n",
    "# df_with_unknown = pd.concat([df, df2], sort=False)\n",
    "\n",
    "# data_pivot = df_with_unknown.pivot_table(\n",
    "#     index='Peptide',\n",
    "#     columns=['loci', 'genotype'],\n",
    "#     values='transformed_aff',\n",
    "# )\n",
    "\n",
    "data_pivot = df.pivot_table(\n",
    "    index='Peptide',\n",
    "    columns=['loci', 'genotype'],\n",
    "    values='transformed_aff',\n",
    ")\n",
    "# data_pivot.to_pickle('mhc1_haplotype_mhcflurry_pred_affinity_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load MHCflurry 2.0 Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genotype</th>\n",
       "      <th>Peptide</th>\n",
       "      <th>mhcflurry_affinity</th>\n",
       "      <th>mhcflurry_presentation_percentile</th>\n",
       "      <th>transformed_aff</th>\n",
       "      <th>loci</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>33181.568128</td>\n",
       "      <td>37.359049</td>\n",
       "      <td>0.037896</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HLA-B44:05</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>33603.431308</td>\n",
       "      <td>37.359049</td>\n",
       "      <td>0.036729</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HLA-B44:07</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>33443.140537</td>\n",
       "      <td>37.359049</td>\n",
       "      <td>0.037170</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HLA-A30:10</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>30462.139109</td>\n",
       "      <td>37.359049</td>\n",
       "      <td>0.045799</td>\n",
       "      <td>HLA-A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HLA-B44:02</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>33494.479955</td>\n",
       "      <td>37.359049</td>\n",
       "      <td>0.037029</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762685</th>\n",
       "      <td>HLA-B55:02</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>27082.660337</td>\n",
       "      <td>46.224674</td>\n",
       "      <td>0.056667</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762686</th>\n",
       "      <td>HLA-B67:01</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>32158.622657</td>\n",
       "      <td>62.744674</td>\n",
       "      <td>0.040790</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762687</th>\n",
       "      <td>HLA-A24:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>31610.775717</td>\n",
       "      <td>62.744674</td>\n",
       "      <td>0.042378</td>\n",
       "      <td>HLA-A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762688</th>\n",
       "      <td>HLA-B15:32</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>30062.941186</td>\n",
       "      <td>46.224674</td>\n",
       "      <td>0.047019</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762689</th>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>28585.854398</td>\n",
       "      <td>46.224674</td>\n",
       "      <td>0.051675</td>\n",
       "      <td>HLA-B</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           genotype     Peptide  mhcflurry_affinity  \\\n",
       "0        HLA-B44:04    MYSFVSEE        33181.568128   \n",
       "1        HLA-B44:05    MYSFVSEE        33603.431308   \n",
       "2        HLA-B44:07    MYSFVSEE        33443.140537   \n",
       "3        HLA-A30:10    MYSFVSEE        30462.139109   \n",
       "4        HLA-B44:02    MYSFVSEE        33494.479955   \n",
       "...             ...         ...                 ...   \n",
       "6762685  HLA-B55:02  VLKGVKLHYT        27082.660337   \n",
       "6762686  HLA-B67:01  VLKGVKLHYT        32158.622657   \n",
       "6762687  HLA-A24:10  VLKGVKLHYT        31610.775717   \n",
       "6762688  HLA-B15:32  VLKGVKLHYT        30062.941186   \n",
       "6762689  HLA-B56:10  VLKGVKLHYT        28585.854398   \n",
       "\n",
       "         mhcflurry_presentation_percentile  transformed_aff   loci  \n",
       "0                                37.359049         0.037896  HLA-B  \n",
       "1                                37.359049         0.036729  HLA-B  \n",
       "2                                37.359049         0.037170  HLA-B  \n",
       "3                                37.359049         0.045799  HLA-A  \n",
       "4                                37.359049         0.037029  HLA-B  \n",
       "...                                    ...              ...    ...  \n",
       "6762685                          46.224674         0.056667  HLA-B  \n",
       "6762686                          62.744674         0.040790  HLA-B  \n",
       "6762687                          62.744674         0.042378  HLA-A  \n",
       "6762688                          46.224674         0.047019  HLA-B  \n",
       "6762689                          46.224674         0.051675  HLA-B  \n",
       "\n",
       "[6762690 rows x 6 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the MHCflurry 2.0 predictions; the peptide column is called 'sequence' until the final rename.\n",
    "mhcflurry_preds = pd.read_csv('mhc1_haplotype_preds_mhcflurry2.0.csv').rename(columns={'peptide': 'sequence'})\n",
    "\n",
    "# Optional steps, currently disabled:\n",
    "# Filter dataframe to alleles in hla_alleles.\n",
    "# mhcflurry_preds = mhcflurry_preds.merge(hla_alleles, on='allele')\n",
    "# Add epitope protein data.\n",
    "# mhcflurry_preds = mhcflurry_preds.merge(mhc1_data[['sequence', 'Protein']], on='sequence')\n",
    "\n",
    "mhcflurry_preds = (\n",
    "    mhcflurry_preds\n",
    "    .assign(\n",
    "        # Log-scaled binding affinity: 1 - log(nM) / log(50000), so 50000 nM maps to 0 and 1 nM maps to 1.\n",
    "        transformed_aff=lambda preds: 1 - np.log(preds['mhcflurry_affinity']) / np.log(50000),\n",
    "        # Locus is the first five characters of the allele name, e.g. 'HLA-B' for 'HLA-B44:04'.\n",
    "        loci=lambda preds: preds['allele'].str[:5],\n",
    "    )\n",
    "    # Drop the MHCflurry columns that are not kept in the final table.\n",
    "    .drop(columns=['mhcflurry_affinity_percentile', 'mhcflurry_processing_score', 'mhcflurry_presentation_score'])\n",
    "    .rename(columns={'allele': 'genotype', 'sequence': 'Peptide'})\n",
    ")\n",
    "\n",
    "mhcflurry_preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.105609</td>\n",
       "      <td>0.136948</td>\n",
       "      <td>0.078258</td>\n",
       "      <td>0.105609</td>\n",
       "      <td>0.099377</td>\n",
       "      <td>0.045679</td>\n",
       "      <td>0.044721</td>\n",
       "      <td>0.043267</td>\n",
       "      <td>0.052031</td>\n",
       "      <td>0.047437</td>\n",
       "      <td>...</td>\n",
       "      <td>0.082833</td>\n",
       "      <td>0.082833</td>\n",
       "      <td>0.082833</td>\n",
       "      <td>0.082833</td>\n",
       "      <td>0.076228</td>\n",
       "      <td>0.069838</td>\n",
       "      <td>0.043940</td>\n",
       "      <td>0.042713</td>\n",
       "      <td>0.042713</td>\n",
       "      <td>0.046394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.059259</td>\n",
       "      <td>0.106627</td>\n",
       "      <td>0.055272</td>\n",
       "      <td>0.059259</td>\n",
       "      <td>0.059694</td>\n",
       "      <td>0.261589</td>\n",
       "      <td>0.432255</td>\n",
       "      <td>0.372506</td>\n",
       "      <td>0.509595</td>\n",
       "      <td>0.575560</td>\n",
       "      <td>...</td>\n",
       "      <td>0.602506</td>\n",
       "      <td>0.602506</td>\n",
       "      <td>0.602506</td>\n",
       "      <td>0.602506</td>\n",
       "      <td>0.588590</td>\n",
       "      <td>0.515765</td>\n",
       "      <td>0.143171</td>\n",
       "      <td>0.099117</td>\n",
       "      <td>0.099117</td>\n",
       "      <td>0.231281</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.047088</td>\n",
       "      <td>0.058239</td>\n",
       "      <td>0.043649</td>\n",
       "      <td>0.047088</td>\n",
       "      <td>0.045955</td>\n",
       "      <td>0.046259</td>\n",
       "      <td>0.048152</td>\n",
       "      <td>0.043510</td>\n",
       "      <td>0.056553</td>\n",
       "      <td>0.056510</td>\n",
       "      <td>...</td>\n",
       "      <td>0.053779</td>\n",
       "      <td>0.053779</td>\n",
       "      <td>0.053779</td>\n",
       "      <td>0.053779</td>\n",
       "      <td>0.050793</td>\n",
       "      <td>0.051782</td>\n",
       "      <td>0.042645</td>\n",
       "      <td>0.040981</td>\n",
       "      <td>0.040981</td>\n",
       "      <td>0.045974</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.051374</td>\n",
       "      <td>0.067382</td>\n",
       "      <td>0.045865</td>\n",
       "      <td>0.051374</td>\n",
       "      <td>0.049864</td>\n",
       "      <td>0.050148</td>\n",
       "      <td>0.050080</td>\n",
       "      <td>0.047409</td>\n",
       "      <td>0.062404</td>\n",
       "      <td>0.053535</td>\n",
       "      <td>...</td>\n",
       "      <td>0.109689</td>\n",
       "      <td>0.109689</td>\n",
       "      <td>0.109689</td>\n",
       "      <td>0.109689</td>\n",
       "      <td>0.098246</td>\n",
       "      <td>0.118048</td>\n",
       "      <td>0.055409</td>\n",
       "      <td>0.053855</td>\n",
       "      <td>0.053855</td>\n",
       "      <td>0.058517</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.055832</td>\n",
       "      <td>0.079984</td>\n",
       "      <td>0.050194</td>\n",
       "      <td>0.055832</td>\n",
       "      <td>0.053780</td>\n",
       "      <td>0.340709</td>\n",
       "      <td>0.476701</td>\n",
       "      <td>0.502855</td>\n",
       "      <td>0.411340</td>\n",
       "      <td>0.621088</td>\n",
       "      <td>...</td>\n",
       "      <td>0.302690</td>\n",
       "      <td>0.302690</td>\n",
       "      <td>0.302690</td>\n",
       "      <td>0.302690</td>\n",
       "      <td>0.295129</td>\n",
       "      <td>0.251292</td>\n",
       "      <td>0.081719</td>\n",
       "      <td>0.068141</td>\n",
       "      <td>0.068141</td>\n",
       "      <td>0.108877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.070006</td>\n",
       "      <td>0.147469</td>\n",
       "      <td>0.060451</td>\n",
       "      <td>0.070006</td>\n",
       "      <td>0.073079</td>\n",
       "      <td>0.103479</td>\n",
       "      <td>0.099143</td>\n",
       "      <td>0.092059</td>\n",
       "      <td>0.159730</td>\n",
       "      <td>0.100631</td>\n",
       "      <td>...</td>\n",
       "      <td>0.083839</td>\n",
       "      <td>0.083839</td>\n",
       "      <td>0.083839</td>\n",
       "      <td>0.083839</td>\n",
       "      <td>0.078158</td>\n",
       "      <td>0.089626</td>\n",
       "      <td>0.099271</td>\n",
       "      <td>0.102135</td>\n",
       "      <td>0.102135</td>\n",
       "      <td>0.093543</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.038383</td>\n",
       "      <td>0.048676</td>\n",
       "      <td>0.037177</td>\n",
       "      <td>0.038383</td>\n",
       "      <td>0.038131</td>\n",
       "      <td>0.049290</td>\n",
       "      <td>0.050717</td>\n",
       "      <td>0.045845</td>\n",
       "      <td>0.062303</td>\n",
       "      <td>0.051598</td>\n",
       "      <td>...</td>\n",
       "      <td>0.040252</td>\n",
       "      <td>0.040252</td>\n",
       "      <td>0.040252</td>\n",
       "      <td>0.040252</td>\n",
       "      <td>0.040166</td>\n",
       "      <td>0.040705</td>\n",
       "      <td>0.047682</td>\n",
       "      <td>0.047887</td>\n",
       "      <td>0.047887</td>\n",
       "      <td>0.047271</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.042219</td>\n",
       "      <td>0.052309</td>\n",
       "      <td>0.039432</td>\n",
       "      <td>0.042219</td>\n",
       "      <td>0.041000</td>\n",
       "      <td>0.047750</td>\n",
       "      <td>0.047634</td>\n",
       "      <td>0.043581</td>\n",
       "      <td>0.062600</td>\n",
       "      <td>0.050109</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052355</td>\n",
       "      <td>0.052355</td>\n",
       "      <td>0.052355</td>\n",
       "      <td>0.052355</td>\n",
       "      <td>0.052012</td>\n",
       "      <td>0.051413</td>\n",
       "      <td>0.085485</td>\n",
       "      <td>0.098521</td>\n",
       "      <td>0.098521</td>\n",
       "      <td>0.059411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.062425</td>\n",
       "      <td>0.135287</td>\n",
       "      <td>0.054883</td>\n",
       "      <td>0.062425</td>\n",
       "      <td>0.062163</td>\n",
       "      <td>0.048823</td>\n",
       "      <td>0.046424</td>\n",
       "      <td>0.043007</td>\n",
       "      <td>0.058425</td>\n",
       "      <td>0.045571</td>\n",
       "      <td>...</td>\n",
       "      <td>0.044454</td>\n",
       "      <td>0.044454</td>\n",
       "      <td>0.044454</td>\n",
       "      <td>0.044454</td>\n",
       "      <td>0.043531</td>\n",
       "      <td>0.045259</td>\n",
       "      <td>0.054809</td>\n",
       "      <td>0.058303</td>\n",
       "      <td>0.058303</td>\n",
       "      <td>0.047823</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.041246</td>\n",
       "      <td>0.060453</td>\n",
       "      <td>0.039802</td>\n",
       "      <td>0.041246</td>\n",
       "      <td>0.040743</td>\n",
       "      <td>0.084248</td>\n",
       "      <td>0.106468</td>\n",
       "      <td>0.080737</td>\n",
       "      <td>0.127988</td>\n",
       "      <td>0.120318</td>\n",
       "      <td>...</td>\n",
       "      <td>0.049998</td>\n",
       "      <td>0.049998</td>\n",
       "      <td>0.049998</td>\n",
       "      <td>0.049998</td>\n",
       "      <td>0.049092</td>\n",
       "      <td>0.047280</td>\n",
       "      <td>0.072224</td>\n",
       "      <td>0.077754</td>\n",
       "      <td>0.077754</td>\n",
       "      <td>0.061165</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY     0.105609   0.136948   0.078258   0.105609   0.099377   0.045679   \n",
       "AAAYYVGYL    0.059259   0.106627   0.055272   0.059259   0.059694   0.261589   \n",
       "AAAYYVGYLQ   0.047088   0.058239   0.043649   0.047088   0.045955   0.046259   \n",
       "AACCHLAK     0.051374   0.067382   0.045865   0.051374   0.049864   0.050148   \n",
       "AACCHLAKA    0.055832   0.079984   0.050194   0.055832   0.053780   0.340709   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR    0.070006   0.147469   0.060451   0.070006   0.073079   0.103479   \n",
       "YYVGYLQPRT   0.038383   0.048676   0.037177   0.038383   0.038131   0.049290   \n",
       "YYVWKSYV     0.042219   0.052309   0.039432   0.042219   0.041000   0.047750   \n",
       "YYVWKSYVH    0.062425   0.135287   0.054883   0.062425   0.062163   0.048823   \n",
       "YYVWKSYVHV   0.041246   0.060453   0.039802   0.041246   0.040743   0.084248   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY     0.044721   0.043267   0.052031   0.047437  ...   0.082833   \n",
       "AAAYYVGYL    0.432255   0.372506   0.509595   0.575560  ...   0.602506   \n",
       "AAAYYVGYLQ   0.048152   0.043510   0.056553   0.056510  ...   0.053779   \n",
       "AACCHLAK     0.050080   0.047409   0.062404   0.053535  ...   0.109689   \n",
       "AACCHLAKA    0.476701   0.502855   0.411340   0.621088  ...   0.302690   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR    0.099143   0.092059   0.159730   0.100631  ...   0.083839   \n",
       "YYVGYLQPRT   0.050717   0.045845   0.062303   0.051598  ...   0.040252   \n",
       "YYVWKSYV     0.047634   0.043581   0.062600   0.050109  ...   0.052355   \n",
       "YYVWKSYVH    0.046424   0.043007   0.058425   0.045571  ...   0.044454   \n",
       "YYVWKSYVHV   0.106468   0.080737   0.127988   0.120318  ...   0.049998   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY     0.082833   0.082833   0.082833   0.076228   0.069838  0.043940   \n",
       "AAAYYVGYL    0.602506   0.602506   0.602506   0.588590   0.515765  0.143171   \n",
       "AAAYYVGYLQ   0.053779   0.053779   0.053779   0.050793   0.051782  0.042645   \n",
       "AACCHLAK     0.109689   0.109689   0.109689   0.098246   0.118048  0.055409   \n",
       "AACCHLAKA    0.302690   0.302690   0.302690   0.295129   0.251292  0.081719   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR    0.083839   0.083839   0.083839   0.078158   0.089626  0.099271   \n",
       "YYVGYLQPRT   0.040252   0.040252   0.040252   0.040166   0.040705  0.047682   \n",
       "YYVWKSYV     0.052355   0.052355   0.052355   0.052012   0.051413  0.085485   \n",
       "YYVWKSYVH    0.044454   0.044454   0.044454   0.043531   0.045259  0.054809   \n",
       "YYVWKSYVHV   0.049998   0.049998   0.049998   0.049092   0.047280  0.072224   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY     0.042713   0.042713   0.046394  \n",
       "AAAYYVGYL    0.099117   0.099117   0.231281  \n",
       "AAAYYVGYLQ   0.040981   0.040981   0.045974  \n",
       "AACCHLAK     0.053855   0.053855   0.058517  \n",
       "AACCHLAKA    0.068141   0.068141   0.108877  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR    0.102135   0.102135   0.093543  \n",
       "YYVGYLQPRT   0.047887   0.047887   0.047271  \n",
       "YYVWKSYV     0.098521   0.098521   0.059411  \n",
       "YYVWKSYVH    0.058303   0.058303   0.047823  \n",
       "YYVWKSYVHV   0.077754   0.077754   0.061165  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def mean_over_allele_group(preds, allele_prefix, locus):\n",
    "    \"\"\"Average predictions per peptide over all alleles in one allele group.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    preds : pd.DataFrame\n",
    "        Long-format predictions with 'genotype' and 'Peptide' columns.\n",
    "    allele_prefix : str\n",
    "        Leading part of the genotype name shared by the group, e.g. 'HLA-C17';\n",
    "        also used as the genotype label of the returned rows.\n",
    "    locus : str\n",
    "        Value written to the 'loci' column of the returned rows.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        One row per peptide holding the mean of every numeric column,\n",
    "        plus constant 'loci' and 'genotype' columns.\n",
    "    \"\"\"\n",
    "    in_group = preds['genotype'].str.startswith(allele_prefix)\n",
    "    return (\n",
    "        preds.loc[in_group]\n",
    "        .groupby('Peptide')\n",
    "        # numeric_only skips the string columns, which raise a TypeError in pandas >= 2.0.\n",
    "        .mean(numeric_only=True)\n",
    "        .reset_index()\n",
    "        .assign(loci=locus, genotype=allele_prefix)\n",
    "    )\n",
    "\n",
    "\n",
    "df = mhcflurry_preds\n",
    "\n",
    "a74 = mean_over_allele_group(df, 'HLA-A74', 'HLA-A')\n",
    "c17 = mean_over_allele_group(df, 'HLA-C17', 'HLA-C')\n",
    "c18 = mean_over_allele_group(df, 'HLA-C18', 'HLA-C')\n",
    "\n",
    "# Append the group-level rows so each group becomes one extra genotype column in the pivot.\n",
    "data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(\n",
    "    index='Peptide',\n",
    "    columns=['loci', 'genotype'],\n",
    "    values='transformed_aff',\n",
    ")\n",
    "data_pivot.to_pickle('mhc1_haplotype_mhcflurry2.0_pred_affinity_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mean Ensemble from NetMHC and MHCflurry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Peptide</th>\n",
       "      <th>netmhc_nM</th>\n",
       "      <th>genotype</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>40836.4258</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>43144.7188</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>44191.7070</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>46105.8516</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>42674.8945</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29398</th>\n",
       "      <td>DSEPVLKGVK</td>\n",
       "      <td>38063.9297</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29399</th>\n",
       "      <td>SEPVLKGVKL</td>\n",
       "      <td>1738.0863</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29400</th>\n",
       "      <td>EPVLKGVKLH</td>\n",
       "      <td>16034.5420</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29401</th>\n",
       "      <td>PVLKGVKLHY</td>\n",
       "      <td>39286.5586</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29402</th>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>37267.6289</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Peptide   netmhc_nM    genotype\n",
       "0        MYSFVSEE  40836.4258  HLA-B44:04\n",
       "1        YSFVSEET  43144.7188  HLA-B44:04\n",
       "2        SFVSEETG  44191.7070  HLA-B44:04\n",
       "3        FVSEETGT  46105.8516  HLA-B44:04\n",
       "4        VSEETGTL  42674.8945  HLA-B44:04\n",
       "...           ...         ...         ...\n",
       "29398  DSEPVLKGVK  38063.9297  HLA-B56:10\n",
       "29399  SEPVLKGVKL   1738.0863  HLA-B56:10\n",
       "29400  EPVLKGVKLH  16034.5420  HLA-B56:10\n",
       "29401  PVLKGVKLHY  39286.5586  HLA-B56:10\n",
       "29402  VLKGVKLHYT  37267.6289  HLA-B56:10\n",
       "\n",
       "[6762690 rows x 3 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prepare the NetMHC predictions for the ensemble.\n",
    "# drop() already returns a new frame, so netmhc1_data is left untouched without an explicit copy.\n",
    "ens_netmhc1 = (\n",
    "    netmhc1_data\n",
    "    .drop(columns=['sequence_length', 'loci', '1-log50k'])\n",
    "    .rename(columns={'nM': 'netmhc_nM'})\n",
    ")\n",
    "ens_netmhc1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genotype</th>\n",
       "      <th>Peptide</th>\n",
       "      <th>mhcflurry_nM</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>37829.751440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>HLA-B44:05</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38198.075124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>HLA-B44:07</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38064.733103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>HLA-A30:10</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>34529.712035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HLA-B44:02</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>38191.475075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762685</th>\n",
       "      <td>HLA-B55:02</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>29249.898280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762686</th>\n",
       "      <td>HLA-B67:01</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>34574.906172</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762687</th>\n",
       "      <td>HLA-A24:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>35345.127043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762688</th>\n",
       "      <td>HLA-B15:32</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>32874.896891</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762689</th>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>27169.830276</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           genotype     Peptide  mhcflurry_nM\n",
       "0        HLA-B44:04    MYSFVSEE  37829.751440\n",
       "1        HLA-B44:05    MYSFVSEE  38198.075124\n",
       "2        HLA-B44:07    MYSFVSEE  38064.733103\n",
       "3        HLA-A30:10    MYSFVSEE  34529.712035\n",
       "4        HLA-B44:02    MYSFVSEE  38191.475075\n",
       "...             ...         ...           ...\n",
       "6762685  HLA-B55:02  VLKGVKLHYT  29249.898280\n",
       "6762686  HLA-B67:01  VLKGVKLHYT  34574.906172\n",
       "6762687  HLA-A24:10  VLKGVKLHYT  35345.127043\n",
       "6762688  HLA-B15:32  VLKGVKLHYT  32874.896891\n",
       "6762689  HLA-B56:10  VLKGVKLHYT  27169.830276\n",
       "\n",
       "[6762690 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the identifying columns and the raw affinity for the ensemble.\n",
    "# rename() ignores keys that are absent, so this works whether or not mhcflurry_preds\n",
    "# has already had 'allele'/'sequence' renamed to 'genotype'/'Peptide'; selecting the\n",
    "# columns explicitly avoids a KeyError from dropping columns that were removed upstream.\n",
    "ens_mhcflurry = (\n",
    "    mhcflurry_preds\n",
    "    .rename(columns={'allele': 'genotype', 'sequence': 'Peptide', 'mhcflurry_affinity': 'mhcflurry_nM'})\n",
    "    [['genotype', 'Peptide', 'mhcflurry_nM']]\n",
    ")\n",
    "ens_mhcflurry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Peptide</th>\n",
       "      <th>netmhc_nM</th>\n",
       "      <th>genotype</th>\n",
       "      <th>mhcflurry_nM</th>\n",
       "      <th>mean_nM</th>\n",
       "      <th>max_nM</th>\n",
       "      <th>mean_transformed</th>\n",
       "      <th>max_transformed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>40836.4258</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>37829.751440</td>\n",
       "      <td>39333.088620</td>\n",
       "      <td>40836.425800</td>\n",
       "      <td>0.022178</td>\n",
       "      <td>0.018711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>YSFVSEET</td>\n",
       "      <td>43144.7188</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>35954.932789</td>\n",
       "      <td>39549.825795</td>\n",
       "      <td>43144.718800</td>\n",
       "      <td>0.021670</td>\n",
       "      <td>0.013629</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SFVSEETG</td>\n",
       "      <td>44191.7070</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>38013.397725</td>\n",
       "      <td>41102.552363</td>\n",
       "      <td>44191.707000</td>\n",
       "      <td>0.018111</td>\n",
       "      <td>0.011413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FVSEETGT</td>\n",
       "      <td>46105.8516</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>37585.110542</td>\n",
       "      <td>41845.481071</td>\n",
       "      <td>46105.851600</td>\n",
       "      <td>0.016455</td>\n",
       "      <td>0.007494</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>VSEETGTL</td>\n",
       "      <td>42674.8945</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>32961.628791</td>\n",
       "      <td>37818.261646</td>\n",
       "      <td>42674.894500</td>\n",
       "      <td>0.025807</td>\n",
       "      <td>0.014641</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762685</th>\n",
       "      <td>DSEPVLKGVK</td>\n",
       "      <td>38063.9297</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>11957.159303</td>\n",
       "      <td>25010.544502</td>\n",
       "      <td>38063.929700</td>\n",
       "      <td>0.064024</td>\n",
       "      <td>0.025209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762686</th>\n",
       "      <td>SEPVLKGVKL</td>\n",
       "      <td>1738.0863</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>2608.759444</td>\n",
       "      <td>2173.422872</td>\n",
       "      <td>2608.759444</td>\n",
       "      <td>0.289814</td>\n",
       "      <td>0.272940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762687</th>\n",
       "      <td>EPVLKGVKLH</td>\n",
       "      <td>16034.5420</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>12895.423658</td>\n",
       "      <td>14464.982829</td>\n",
       "      <td>16034.542000</td>\n",
       "      <td>0.114632</td>\n",
       "      <td>0.105111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762688</th>\n",
       "      <td>PVLKGVKLHY</td>\n",
       "      <td>39286.5586</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>25858.490129</td>\n",
       "      <td>32572.524364</td>\n",
       "      <td>39286.558600</td>\n",
       "      <td>0.039608</td>\n",
       "      <td>0.022287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6762689</th>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>37267.6289</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>27169.830276</td>\n",
       "      <td>32218.729588</td>\n",
       "      <td>37267.628900</td>\n",
       "      <td>0.040618</td>\n",
       "      <td>0.027163</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6762690 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            Peptide   netmhc_nM    genotype  mhcflurry_nM       mean_nM  \\\n",
       "0          MYSFVSEE  40836.4258  HLA-B44:04  37829.751440  39333.088620   \n",
       "1          YSFVSEET  43144.7188  HLA-B44:04  35954.932789  39549.825795   \n",
       "2          SFVSEETG  44191.7070  HLA-B44:04  38013.397725  41102.552363   \n",
       "3          FVSEETGT  46105.8516  HLA-B44:04  37585.110542  41845.481071   \n",
       "4          VSEETGTL  42674.8945  HLA-B44:04  32961.628791  37818.261646   \n",
       "...             ...         ...         ...           ...           ...   \n",
       "6762685  DSEPVLKGVK  38063.9297  HLA-B56:10  11957.159303  25010.544502   \n",
       "6762686  SEPVLKGVKL   1738.0863  HLA-B56:10   2608.759444   2173.422872   \n",
       "6762687  EPVLKGVKLH  16034.5420  HLA-B56:10  12895.423658  14464.982829   \n",
       "6762688  PVLKGVKLHY  39286.5586  HLA-B56:10  25858.490129  32572.524364   \n",
       "6762689  VLKGVKLHYT  37267.6289  HLA-B56:10  27169.830276  32218.729588   \n",
       "\n",
       "               max_nM  mean_transformed  max_transformed  \n",
       "0        40836.425800          0.022178         0.018711  \n",
       "1        43144.718800          0.021670         0.013629  \n",
       "2        44191.707000          0.018111         0.011413  \n",
       "3        46105.851600          0.016455         0.007494  \n",
       "4        42674.894500          0.025807         0.014641  \n",
       "...               ...               ...              ...  \n",
       "6762685  38063.929700          0.064024         0.025209  \n",
       "6762686   2608.759444          0.289814         0.272940  \n",
       "6762687  16034.542000          0.114632         0.105111  \n",
       "6762688  39286.558600          0.039608         0.022287  \n",
       "6762689  37267.628900          0.040618         0.027163  \n",
       "\n",
       "[6762690 rows x 8 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ens_combined = ens_netmhc1.merge(ens_mhcflurry, on=['genotype', 'Peptide'], how='inner')\n",
    "ens_combined['loci'] = [x[:5] for x in ens_combined['genotype'].values]\n",
    "ens_combined['mean_nM'] = (ens_combined['netmhc_nM'] + ens_combined['mhcflurry_nM']) / 2\n",
    "ens_combined['max_nM'] = ens_combined[['netmhc_nM', 'mhcflurry_nM']].max(axis=1)\n",
    "ens_combined['mean_transformed'] = [transform_affinity(x) for x in ens_combined['mean_nM'].values]\n",
    "ens_combined['max_transformed'] = [transform_affinity(x) for x in ens_combined['max_nM'].values]\n",
    "ens_combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total pMHC pairs:  6762690\n",
      "# NetMHC binders:  12497\n",
      "# MHCflurry binders:  38845\n",
      "# Mean Ens. binders:  10036\n",
      "# Max Ens. binders:  7528\n"
     ]
    }
   ],
   "source": [
    "print('Total pMHC pairs: ', len(ens_combined))\n",
    "print('# NetMHC binders: ', (ens_combined['netmhc_nM'].values <= 50).sum())\n",
    "print('# MHCflurry binders: ', (ens_combined['mhcflurry_nM'].values <= 50).sum())\n",
    "print('# Mean Ens. binders: ', (ens_combined['mean_nM'].values <= 50).sum())\n",
    "print('# Max Ens. binders: ', (ens_combined['max_nM'].values <= 50).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6384377847127609\n"
     ]
    }
   ],
   "source": [
    "print(transform_affinity(50))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04171642418454635"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transform_affinity(31838)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.099904</td>\n",
       "      <td>0.119730</td>\n",
       "      <td>0.075010</td>\n",
       "      <td>0.099904</td>\n",
       "      <td>0.103053</td>\n",
       "      <td>0.030960</td>\n",
       "      <td>0.040544</td>\n",
       "      <td>0.045232</td>\n",
       "      <td>0.029476</td>\n",
       "      <td>0.047374</td>\n",
       "      <td>...</td>\n",
       "      <td>0.042643</td>\n",
       "      <td>0.042643</td>\n",
       "      <td>0.042643</td>\n",
       "      <td>0.042643</td>\n",
       "      <td>0.039200</td>\n",
       "      <td>0.033960</td>\n",
       "      <td>0.023998</td>\n",
       "      <td>0.024134</td>\n",
       "      <td>0.024134</td>\n",
       "      <td>0.023725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.062267</td>\n",
       "      <td>0.085513</td>\n",
       "      <td>0.055643</td>\n",
       "      <td>0.062267</td>\n",
       "      <td>0.064940</td>\n",
       "      <td>0.237727</td>\n",
       "      <td>0.430435</td>\n",
       "      <td>0.369027</td>\n",
       "      <td>0.224186</td>\n",
       "      <td>0.527945</td>\n",
       "      <td>...</td>\n",
       "      <td>0.401438</td>\n",
       "      <td>0.401438</td>\n",
       "      <td>0.401438</td>\n",
       "      <td>0.401438</td>\n",
       "      <td>0.400499</td>\n",
       "      <td>0.354651</td>\n",
       "      <td>0.098521</td>\n",
       "      <td>0.082455</td>\n",
       "      <td>0.082455</td>\n",
       "      <td>0.130655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.044230</td>\n",
       "      <td>0.065236</td>\n",
       "      <td>0.037418</td>\n",
       "      <td>0.044230</td>\n",
       "      <td>0.042310</td>\n",
       "      <td>0.049180</td>\n",
       "      <td>0.082729</td>\n",
       "      <td>0.079332</td>\n",
       "      <td>0.041714</td>\n",
       "      <td>0.088255</td>\n",
       "      <td>...</td>\n",
       "      <td>0.044212</td>\n",
       "      <td>0.044212</td>\n",
       "      <td>0.044212</td>\n",
       "      <td>0.044212</td>\n",
       "      <td>0.042455</td>\n",
       "      <td>0.038649</td>\n",
       "      <td>0.026250</td>\n",
       "      <td>0.024935</td>\n",
       "      <td>0.024935</td>\n",
       "      <td>0.028880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.036249</td>\n",
       "      <td>0.058857</td>\n",
       "      <td>0.029827</td>\n",
       "      <td>0.036249</td>\n",
       "      <td>0.036305</td>\n",
       "      <td>0.034517</td>\n",
       "      <td>0.034367</td>\n",
       "      <td>0.034648</td>\n",
       "      <td>0.034952</td>\n",
       "      <td>0.040558</td>\n",
       "      <td>...</td>\n",
       "      <td>0.045251</td>\n",
       "      <td>0.045251</td>\n",
       "      <td>0.045251</td>\n",
       "      <td>0.045251</td>\n",
       "      <td>0.040600</td>\n",
       "      <td>0.044300</td>\n",
       "      <td>0.029786</td>\n",
       "      <td>0.029516</td>\n",
       "      <td>0.029516</td>\n",
       "      <td>0.030327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.046042</td>\n",
       "      <td>0.069239</td>\n",
       "      <td>0.038753</td>\n",
       "      <td>0.046042</td>\n",
       "      <td>0.046332</td>\n",
       "      <td>0.222861</td>\n",
       "      <td>0.308355</td>\n",
       "      <td>0.385027</td>\n",
       "      <td>0.152427</td>\n",
       "      <td>0.398331</td>\n",
       "      <td>...</td>\n",
       "      <td>0.124742</td>\n",
       "      <td>0.124742</td>\n",
       "      <td>0.124742</td>\n",
       "      <td>0.124742</td>\n",
       "      <td>0.122662</td>\n",
       "      <td>0.095026</td>\n",
       "      <td>0.047754</td>\n",
       "      <td>0.042288</td>\n",
       "      <td>0.042288</td>\n",
       "      <td>0.058685</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.040328</td>\n",
       "      <td>0.090958</td>\n",
       "      <td>0.035315</td>\n",
       "      <td>0.040328</td>\n",
       "      <td>0.045142</td>\n",
       "      <td>0.093471</td>\n",
       "      <td>0.103548</td>\n",
       "      <td>0.091099</td>\n",
       "      <td>0.117097</td>\n",
       "      <td>0.118416</td>\n",
       "      <td>...</td>\n",
       "      <td>0.044245</td>\n",
       "      <td>0.044245</td>\n",
       "      <td>0.044245</td>\n",
       "      <td>0.044245</td>\n",
       "      <td>0.042114</td>\n",
       "      <td>0.042946</td>\n",
       "      <td>0.064351</td>\n",
       "      <td>0.060302</td>\n",
       "      <td>0.060302</td>\n",
       "      <td>0.072448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.028263</td>\n",
       "      <td>0.044036</td>\n",
       "      <td>0.025075</td>\n",
       "      <td>0.028263</td>\n",
       "      <td>0.029381</td>\n",
       "      <td>0.066425</td>\n",
       "      <td>0.084018</td>\n",
       "      <td>0.079199</td>\n",
       "      <td>0.070191</td>\n",
       "      <td>0.094782</td>\n",
       "      <td>...</td>\n",
       "      <td>0.027799</td>\n",
       "      <td>0.027799</td>\n",
       "      <td>0.027799</td>\n",
       "      <td>0.027799</td>\n",
       "      <td>0.027483</td>\n",
       "      <td>0.023921</td>\n",
       "      <td>0.038295</td>\n",
       "      <td>0.036115</td>\n",
       "      <td>0.036115</td>\n",
       "      <td>0.042655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.031677</td>\n",
       "      <td>0.045735</td>\n",
       "      <td>0.029042</td>\n",
       "      <td>0.031677</td>\n",
       "      <td>0.032530</td>\n",
       "      <td>0.061548</td>\n",
       "      <td>0.070117</td>\n",
       "      <td>0.073983</td>\n",
       "      <td>0.064317</td>\n",
       "      <td>0.069071</td>\n",
       "      <td>...</td>\n",
       "      <td>0.045366</td>\n",
       "      <td>0.045366</td>\n",
       "      <td>0.045366</td>\n",
       "      <td>0.045366</td>\n",
       "      <td>0.044032</td>\n",
       "      <td>0.039757</td>\n",
       "      <td>0.082259</td>\n",
       "      <td>0.088021</td>\n",
       "      <td>0.088021</td>\n",
       "      <td>0.070733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.046323</td>\n",
       "      <td>0.088608</td>\n",
       "      <td>0.039156</td>\n",
       "      <td>0.046323</td>\n",
       "      <td>0.049080</td>\n",
       "      <td>0.031857</td>\n",
       "      <td>0.034921</td>\n",
       "      <td>0.028971</td>\n",
       "      <td>0.033717</td>\n",
       "      <td>0.039375</td>\n",
       "      <td>...</td>\n",
       "      <td>0.024933</td>\n",
       "      <td>0.024933</td>\n",
       "      <td>0.024933</td>\n",
       "      <td>0.024933</td>\n",
       "      <td>0.024360</td>\n",
       "      <td>0.024976</td>\n",
       "      <td>0.045196</td>\n",
       "      <td>0.047047</td>\n",
       "      <td>0.047047</td>\n",
       "      <td>0.041493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.041707</td>\n",
       "      <td>0.063912</td>\n",
       "      <td>0.038840</td>\n",
       "      <td>0.041707</td>\n",
       "      <td>0.045612</td>\n",
       "      <td>0.137153</td>\n",
       "      <td>0.146166</td>\n",
       "      <td>0.121292</td>\n",
       "      <td>0.172445</td>\n",
       "      <td>0.154587</td>\n",
       "      <td>...</td>\n",
       "      <td>0.065425</td>\n",
       "      <td>0.065425</td>\n",
       "      <td>0.065425</td>\n",
       "      <td>0.065425</td>\n",
       "      <td>0.063980</td>\n",
       "      <td>0.055888</td>\n",
       "      <td>0.076459</td>\n",
       "      <td>0.075275</td>\n",
       "      <td>0.075275</td>\n",
       "      <td>0.078827</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY     0.099904   0.119730   0.075010   0.099904   0.103053   0.030960   \n",
       "AAAYYVGYL    0.062267   0.085513   0.055643   0.062267   0.064940   0.237727   \n",
       "AAAYYVGYLQ   0.044230   0.065236   0.037418   0.044230   0.042310   0.049180   \n",
       "AACCHLAK     0.036249   0.058857   0.029827   0.036249   0.036305   0.034517   \n",
       "AACCHLAKA    0.046042   0.069239   0.038753   0.046042   0.046332   0.222861   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR    0.040328   0.090958   0.035315   0.040328   0.045142   0.093471   \n",
       "YYVGYLQPRT   0.028263   0.044036   0.025075   0.028263   0.029381   0.066425   \n",
       "YYVWKSYV     0.031677   0.045735   0.029042   0.031677   0.032530   0.061548   \n",
       "YYVWKSYVH    0.046323   0.088608   0.039156   0.046323   0.049080   0.031857   \n",
       "YYVWKSYVHV   0.041707   0.063912   0.038840   0.041707   0.045612   0.137153   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY     0.040544   0.045232   0.029476   0.047374  ...   0.042643   \n",
       "AAAYYVGYL    0.430435   0.369027   0.224186   0.527945  ...   0.401438   \n",
       "AAAYYVGYLQ   0.082729   0.079332   0.041714   0.088255  ...   0.044212   \n",
       "AACCHLAK     0.034367   0.034648   0.034952   0.040558  ...   0.045251   \n",
       "AACCHLAKA    0.308355   0.385027   0.152427   0.398331  ...   0.124742   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR    0.103548   0.091099   0.117097   0.118416  ...   0.044245   \n",
       "YYVGYLQPRT   0.084018   0.079199   0.070191   0.094782  ...   0.027799   \n",
       "YYVWKSYV     0.070117   0.073983   0.064317   0.069071  ...   0.045366   \n",
       "YYVWKSYVH    0.034921   0.028971   0.033717   0.039375  ...   0.024933   \n",
       "YYVWKSYVHV   0.146166   0.121292   0.172445   0.154587  ...   0.065425   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY     0.042643   0.042643   0.042643   0.039200   0.033960  0.023998   \n",
       "AAAYYVGYL    0.401438   0.401438   0.401438   0.400499   0.354651  0.098521   \n",
       "AAAYYVGYLQ   0.044212   0.044212   0.044212   0.042455   0.038649  0.026250   \n",
       "AACCHLAK     0.045251   0.045251   0.045251   0.040600   0.044300  0.029786   \n",
       "AACCHLAKA    0.124742   0.124742   0.124742   0.122662   0.095026  0.047754   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR    0.044245   0.044245   0.044245   0.042114   0.042946  0.064351   \n",
       "YYVGYLQPRT   0.027799   0.027799   0.027799   0.027483   0.023921  0.038295   \n",
       "YYVWKSYV     0.045366   0.045366   0.045366   0.044032   0.039757  0.082259   \n",
       "YYVWKSYVH    0.024933   0.024933   0.024933   0.024360   0.024976  0.045196   \n",
       "YYVWKSYVHV   0.065425   0.065425   0.065425   0.063980   0.055888  0.076459   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY     0.024134   0.024134   0.023725  \n",
       "AAAYYVGYL    0.082455   0.082455   0.130655  \n",
       "AAAYYVGYLQ   0.024935   0.024935   0.028880  \n",
       "AACCHLAK     0.029516   0.029516   0.030327  \n",
       "AACCHLAKA    0.042288   0.042288   0.058685  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR    0.060302   0.060302   0.072448  \n",
       "YYVGYLQPRT   0.036115   0.036115   0.042655  \n",
       "YYVWKSYV     0.088021   0.088021   0.070733  \n",
       "YYVWKSYVH    0.047047   0.047047   0.041493  \n",
       "YYVWKSYVHV   0.075275   0.075275   0.078827  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = ens_combined\n",
    "\n",
    "a74 = df.loc[df['genotype'].str.contains('HLA-A74')].groupby('Peptide').agg('mean').reset_index() #['mean', 'count'])\n",
    "a74['loci'] = 'HLA-A'\n",
    "a74['genotype'] = 'HLA-A74'\n",
    "#a74\n",
    "\n",
    "c17 = df.loc[df['genotype'].str.contains('HLA-C17')].groupby('Peptide').agg('mean').reset_index()\n",
    "c17['loci'] = 'HLA-C'\n",
    "c17['genotype'] = 'HLA-C17'\n",
    "#c17\n",
    "\n",
    "c18 = df.loc[df['genotype'].str.contains('HLA-C18')].groupby('Peptide').agg('mean').reset_index()\n",
    "c18['loci'] = 'HLA-C'\n",
    "c18['genotype'] = 'HLA-C18'\n",
    "#c18\n",
    "\n",
    "data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(\n",
    "    index='Peptide',\n",
    "    columns=['loci', 'genotype'],\n",
    "    values='mean_transformed',\n",
    ")\n",
    "data_pivot.to_pickle('mhc1_haplotype_mean-ensemble_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.094719</td>\n",
       "      <td>0.096633</td>\n",
       "      <td>0.065167</td>\n",
       "      <td>0.094719</td>\n",
       "      <td>0.096370</td>\n",
       "      <td>0.029052</td>\n",
       "      <td>0.033360</td>\n",
       "      <td>0.031451</td>\n",
       "      <td>0.020923</td>\n",
       "      <td>0.036177</td>\n",
       "      <td>...</td>\n",
       "      <td>0.029064</td>\n",
       "      <td>0.029064</td>\n",
       "      <td>0.029064</td>\n",
       "      <td>0.029064</td>\n",
       "      <td>0.029064</td>\n",
       "      <td>0.024008</td>\n",
       "      <td>0.015958</td>\n",
       "      <td>0.016639</td>\n",
       "      <td>0.016639</td>\n",
       "      <td>0.014597</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.051921</td>\n",
       "      <td>0.079042</td>\n",
       "      <td>0.045578</td>\n",
       "      <td>0.051921</td>\n",
       "      <td>0.052842</td>\n",
       "      <td>0.227627</td>\n",
       "      <td>0.398928</td>\n",
       "      <td>0.321426</td>\n",
       "      <td>0.162347</td>\n",
       "      <td>0.512131</td>\n",
       "      <td>...</td>\n",
       "      <td>0.340147</td>\n",
       "      <td>0.340147</td>\n",
       "      <td>0.340147</td>\n",
       "      <td>0.340147</td>\n",
       "      <td>0.340147</td>\n",
       "      <td>0.297585</td>\n",
       "      <td>0.079491</td>\n",
       "      <td>0.068130</td>\n",
       "      <td>0.068130</td>\n",
       "      <td>0.102213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.034828</td>\n",
       "      <td>0.045169</td>\n",
       "      <td>0.032080</td>\n",
       "      <td>0.034828</td>\n",
       "      <td>0.034498</td>\n",
       "      <td>0.035324</td>\n",
       "      <td>0.036939</td>\n",
       "      <td>0.033438</td>\n",
       "      <td>0.038991</td>\n",
       "      <td>0.040745</td>\n",
       "      <td>...</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.038188</td>\n",
       "      <td>0.034922</td>\n",
       "      <td>0.034588</td>\n",
       "      <td>0.020381</td>\n",
       "      <td>0.018729</td>\n",
       "      <td>0.018729</td>\n",
       "      <td>0.023686</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.034546</td>\n",
       "      <td>0.044418</td>\n",
       "      <td>0.026537</td>\n",
       "      <td>0.034546</td>\n",
       "      <td>0.034245</td>\n",
       "      <td>0.028591</td>\n",
       "      <td>0.029944</td>\n",
       "      <td>0.034158</td>\n",
       "      <td>0.022931</td>\n",
       "      <td>0.039439</td>\n",
       "      <td>...</td>\n",
       "      <td>0.016770</td>\n",
       "      <td>0.016770</td>\n",
       "      <td>0.016770</td>\n",
       "      <td>0.016770</td>\n",
       "      <td>0.016770</td>\n",
       "      <td>0.012682</td>\n",
       "      <td>0.016185</td>\n",
       "      <td>0.016560</td>\n",
       "      <td>0.016560</td>\n",
       "      <td>0.015434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.046027</td>\n",
       "      <td>0.063561</td>\n",
       "      <td>0.037668</td>\n",
       "      <td>0.046027</td>\n",
       "      <td>0.045247</td>\n",
       "      <td>0.165461</td>\n",
       "      <td>0.249327</td>\n",
       "      <td>0.334851</td>\n",
       "      <td>0.089497</td>\n",
       "      <td>0.337730</td>\n",
       "      <td>...</td>\n",
       "      <td>0.067518</td>\n",
       "      <td>0.067518</td>\n",
       "      <td>0.067518</td>\n",
       "      <td>0.067518</td>\n",
       "      <td>0.067518</td>\n",
       "      <td>0.042302</td>\n",
       "      <td>0.029532</td>\n",
       "      <td>0.027634</td>\n",
       "      <td>0.027634</td>\n",
       "      <td>0.033329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.028852</td>\n",
       "      <td>0.081183</td>\n",
       "      <td>0.025862</td>\n",
       "      <td>0.028852</td>\n",
       "      <td>0.036655</td>\n",
       "      <td>0.087990</td>\n",
       "      <td>0.085834</td>\n",
       "      <td>0.075096</td>\n",
       "      <td>0.097212</td>\n",
       "      <td>0.085585</td>\n",
       "      <td>...</td>\n",
       "      <td>0.029346</td>\n",
       "      <td>0.029346</td>\n",
       "      <td>0.029346</td>\n",
       "      <td>0.029346</td>\n",
       "      <td>0.029346</td>\n",
       "      <td>0.023655</td>\n",
       "      <td>0.049380</td>\n",
       "      <td>0.042392</td>\n",
       "      <td>0.042392</td>\n",
       "      <td>0.063355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.026303</td>\n",
       "      <td>0.034723</td>\n",
       "      <td>0.024996</td>\n",
       "      <td>0.026303</td>\n",
       "      <td>0.026396</td>\n",
       "      <td>0.036421</td>\n",
       "      <td>0.042080</td>\n",
       "      <td>0.035949</td>\n",
       "      <td>0.052389</td>\n",
       "      <td>0.043779</td>\n",
       "      <td>...</td>\n",
       "      <td>0.027019</td>\n",
       "      <td>0.027019</td>\n",
       "      <td>0.027019</td>\n",
       "      <td>0.027019</td>\n",
       "      <td>0.027019</td>\n",
       "      <td>0.019727</td>\n",
       "      <td>0.034794</td>\n",
       "      <td>0.034302</td>\n",
       "      <td>0.034302</td>\n",
       "      <td>0.035777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.028913</td>\n",
       "      <td>0.037161</td>\n",
       "      <td>0.026876</td>\n",
       "      <td>0.028913</td>\n",
       "      <td>0.028558</td>\n",
       "      <td>0.039297</td>\n",
       "      <td>0.038931</td>\n",
       "      <td>0.034029</td>\n",
       "      <td>0.050133</td>\n",
       "      <td>0.039407</td>\n",
       "      <td>...</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.039761</td>\n",
       "      <td>0.037265</td>\n",
       "      <td>0.034266</td>\n",
       "      <td>0.071686</td>\n",
       "      <td>0.083807</td>\n",
       "      <td>0.083807</td>\n",
       "      <td>0.047445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.044654</td>\n",
       "      <td>0.075975</td>\n",
       "      <td>0.037013</td>\n",
       "      <td>0.044654</td>\n",
       "      <td>0.044827</td>\n",
       "      <td>0.027405</td>\n",
       "      <td>0.032578</td>\n",
       "      <td>0.027160</td>\n",
       "      <td>0.025439</td>\n",
       "      <td>0.032589</td>\n",
       "      <td>...</td>\n",
       "      <td>0.018007</td>\n",
       "      <td>0.018007</td>\n",
       "      <td>0.018007</td>\n",
       "      <td>0.018007</td>\n",
       "      <td>0.018007</td>\n",
       "      <td>0.018495</td>\n",
       "      <td>0.044023</td>\n",
       "      <td>0.046666</td>\n",
       "      <td>0.046666</td>\n",
       "      <td>0.038737</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.029647</td>\n",
       "      <td>0.043371</td>\n",
       "      <td>0.027804</td>\n",
       "      <td>0.029647</td>\n",
       "      <td>0.029193</td>\n",
       "      <td>0.073780</td>\n",
       "      <td>0.083830</td>\n",
       "      <td>0.058161</td>\n",
       "      <td>0.112460</td>\n",
       "      <td>0.093533</td>\n",
       "      <td>...</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.036095</td>\n",
       "      <td>0.033998</td>\n",
       "      <td>0.033561</td>\n",
       "      <td>0.061220</td>\n",
       "      <td>0.065734</td>\n",
       "      <td>0.065734</td>\n",
       "      <td>0.052191</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY     0.094719   0.096633   0.065167   0.094719   0.096370   0.029052   \n",
       "AAAYYVGYL    0.051921   0.079042   0.045578   0.051921   0.052842   0.227627   \n",
       "AAAYYVGYLQ   0.034828   0.045169   0.032080   0.034828   0.034498   0.035324   \n",
       "AACCHLAK     0.034546   0.044418   0.026537   0.034546   0.034245   0.028591   \n",
       "AACCHLAKA    0.046027   0.063561   0.037668   0.046027   0.045247   0.165461   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR    0.028852   0.081183   0.025862   0.028852   0.036655   0.087990   \n",
       "YYVGYLQPRT   0.026303   0.034723   0.024996   0.026303   0.026396   0.036421   \n",
       "YYVWKSYV     0.028913   0.037161   0.026876   0.028913   0.028558   0.039297   \n",
       "YYVWKSYVH    0.044654   0.075975   0.037013   0.044654   0.044827   0.027405   \n",
       "YYVWKSYVHV   0.029647   0.043371   0.027804   0.029647   0.029193   0.073780   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY     0.033360   0.031451   0.020923   0.036177  ...   0.029064   \n",
       "AAAYYVGYL    0.398928   0.321426   0.162347   0.512131  ...   0.340147   \n",
       "AAAYYVGYLQ   0.036939   0.033438   0.038991   0.040745  ...   0.038188   \n",
       "AACCHLAK     0.029944   0.034158   0.022931   0.039439  ...   0.016770   \n",
       "AACCHLAKA    0.249327   0.334851   0.089497   0.337730  ...   0.067518   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR    0.085834   0.075096   0.097212   0.085585  ...   0.029346   \n",
       "YYVGYLQPRT   0.042080   0.035949   0.052389   0.043779  ...   0.027019   \n",
       "YYVWKSYV     0.038931   0.034029   0.050133   0.039407  ...   0.039761   \n",
       "YYVWKSYVH    0.032578   0.027160   0.025439   0.032589  ...   0.018007   \n",
       "YYVWKSYVHV   0.083830   0.058161   0.112460   0.093533  ...   0.036095   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY     0.029064   0.029064   0.029064   0.029064   0.024008  0.015958   \n",
       "AAAYYVGYL    0.340147   0.340147   0.340147   0.340147   0.297585  0.079491   \n",
       "AAAYYVGYLQ   0.038188   0.038188   0.038188   0.034922   0.034588  0.020381   \n",
       "AACCHLAK     0.016770   0.016770   0.016770   0.016770   0.012682  0.016185   \n",
       "AACCHLAKA    0.067518   0.067518   0.067518   0.067518   0.042302  0.029532   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR    0.029346   0.029346   0.029346   0.029346   0.023655  0.049380   \n",
       "YYVGYLQPRT   0.027019   0.027019   0.027019   0.027019   0.019727  0.034794   \n",
       "YYVWKSYV     0.039761   0.039761   0.039761   0.037265   0.034266  0.071686   \n",
       "YYVWKSYVH    0.018007   0.018007   0.018007   0.018007   0.018495  0.044023   \n",
       "YYVWKSYVHV   0.036095   0.036095   0.036095   0.033998   0.033561  0.061220   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY     0.016639   0.016639   0.014597  \n",
       "AAAYYVGYL    0.068130   0.068130   0.102213  \n",
       "AAAYYVGYLQ   0.018729   0.018729   0.023686  \n",
       "AACCHLAK     0.016560   0.016560   0.015434  \n",
       "AACCHLAKA    0.027634   0.027634   0.033329  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR    0.042392   0.042392   0.063355  \n",
       "YYVGYLQPRT   0.034302   0.034302   0.035777  \n",
       "YYVWKSYV     0.083807   0.083807   0.047445  \n",
       "YYVWKSYVH    0.046666   0.046666   0.038737  \n",
       "YYVWKSYVHV   0.065734   0.065734   0.052191  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the peptide x (loci, genotype) pivot of the max-ensemble score and add\n",
    "# three group-level columns (HLA-A74, HLA-C17, HLA-C18), each the per-peptide\n",
    "# mean over every genotype whose name contains that prefix.\n",
    "df = ens_combined\n",
    "\n",
    "\n",
    "def mean_over_allele_group(predictions, loci, genotype_prefix):\n",
    "    '''Average the numeric columns per peptide over one group of genotypes.\n",
    "\n",
    "    Rows whose `genotype` contains `genotype_prefix` are selected, grouped by\n",
    "    `Peptide` and averaged. The result has one row per peptide, with `loci`\n",
    "    set to the given value and `genotype` set to the bare prefix, so it can be\n",
    "    concatenated with `predictions` before pivoting.\n",
    "    '''\n",
    "    in_group = predictions['genotype'].str.contains(genotype_prefix, regex=False)\n",
    "    # numeric_only=True: string columns such as `genotype` cannot be averaged.\n",
    "    # Older pandas dropped them silently; pandas >= 2.0 raises a TypeError.\n",
    "    group_mean = (\n",
    "        predictions.loc[in_group]\n",
    "        .groupby('Peptide')\n",
    "        .mean(numeric_only=True)\n",
    "        .reset_index()\n",
    "    )\n",
    "    group_mean['loci'] = loci\n",
    "    group_mean['genotype'] = genotype_prefix\n",
    "    return group_mean\n",
    "\n",
    "\n",
    "a74 = mean_over_allele_group(df, 'HLA-A', 'HLA-A74')\n",
    "c17 = mean_over_allele_group(df, 'HLA-C', 'HLA-C17')\n",
    "c18 = mean_over_allele_group(df, 'HLA-C', 'HLA-C18')\n",
    "\n",
    "# pivot_table averages duplicate (Peptide, loci, genotype) entries by default.\n",
    "data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(\n",
    "    index='Peptide',\n",
    "    columns=['loci', 'genotype'],\n",
    "    values='max_transformed',\n",
    ")\n",
    "data_pivot.to_pickle('mhc1_haplotype_max-ensemble_pivot.pkl.gz', protocol=2)\n",
    "data_pivot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mean ensemble of NetMHCpan4.0 and MHCflurry2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>loci</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-A</th>\n",
       "      <th>...</th>\n",
       "      <th colspan=\"10\" halign=\"left\">HLA-C</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genotype</th>\n",
       "      <th>HLA-A01:01</th>\n",
       "      <th>HLA-A01:02</th>\n",
       "      <th>HLA-A01:03</th>\n",
       "      <th>HLA-A01:09</th>\n",
       "      <th>HLA-A01:23</th>\n",
       "      <th>HLA-A02:01</th>\n",
       "      <th>HLA-A02:02</th>\n",
       "      <th>HLA-A02:03</th>\n",
       "      <th>HLA-A02:04</th>\n",
       "      <th>HLA-A02:05</th>\n",
       "      <th>...</th>\n",
       "      <th>HLA-C17:02</th>\n",
       "      <th>HLA-C17:03</th>\n",
       "      <th>HLA-C17:04</th>\n",
       "      <th>HLA-C17:05</th>\n",
       "      <th>HLA-C17:06</th>\n",
       "      <th>HLA-C17:07</th>\n",
       "      <th>HLA-C18</th>\n",
       "      <th>HLA-C18:01</th>\n",
       "      <th>HLA-C18:02</th>\n",
       "      <th>HLA-C18:03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peptide</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAAYYVGY</th>\n",
       "      <td>0.105505</td>\n",
       "      <td>0.143522</td>\n",
       "      <td>0.082048</td>\n",
       "      <td>0.105505</td>\n",
       "      <td>0.104677</td>\n",
       "      <td>0.037018</td>\n",
       "      <td>0.046493</td>\n",
       "      <td>0.051890</td>\n",
       "      <td>0.035161</td>\n",
       "      <td>0.053552</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052115</td>\n",
       "      <td>0.052115</td>\n",
       "      <td>0.052115</td>\n",
       "      <td>0.052115</td>\n",
       "      <td>0.049692</td>\n",
       "      <td>0.044106</td>\n",
       "      <td>0.028880</td>\n",
       "      <td>0.028737</td>\n",
       "      <td>0.028737</td>\n",
       "      <td>0.029137</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYL</th>\n",
       "      <td>0.066290</td>\n",
       "      <td>0.099294</td>\n",
       "      <td>0.060903</td>\n",
       "      <td>0.066290</td>\n",
       "      <td>0.068799</td>\n",
       "      <td>0.243041</td>\n",
       "      <td>0.452552</td>\n",
       "      <td>0.409592</td>\n",
       "      <td>0.224231</td>\n",
       "      <td>0.538487</td>\n",
       "      <td>...</td>\n",
       "      <td>0.398911</td>\n",
       "      <td>0.398911</td>\n",
       "      <td>0.398911</td>\n",
       "      <td>0.398911</td>\n",
       "      <td>0.398085</td>\n",
       "      <td>0.353328</td>\n",
       "      <td>0.105936</td>\n",
       "      <td>0.082313</td>\n",
       "      <td>0.082313</td>\n",
       "      <td>0.145829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAYYVGYLQ</th>\n",
       "      <td>0.050816</td>\n",
       "      <td>0.073134</td>\n",
       "      <td>0.043374</td>\n",
       "      <td>0.050816</td>\n",
       "      <td>0.048346</td>\n",
       "      <td>0.055380</td>\n",
       "      <td>0.091835</td>\n",
       "      <td>0.087515</td>\n",
       "      <td>0.050330</td>\n",
       "      <td>0.101244</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052227</td>\n",
       "      <td>0.052227</td>\n",
       "      <td>0.052227</td>\n",
       "      <td>0.052227</td>\n",
       "      <td>0.050746</td>\n",
       "      <td>0.047234</td>\n",
       "      <td>0.030836</td>\n",
       "      <td>0.029171</td>\n",
       "      <td>0.029171</td>\n",
       "      <td>0.034168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAK</th>\n",
       "      <td>0.042553</td>\n",
       "      <td>0.071591</td>\n",
       "      <td>0.035676</td>\n",
       "      <td>0.042553</td>\n",
       "      <td>0.041701</td>\n",
       "      <td>0.038747</td>\n",
       "      <td>0.039441</td>\n",
       "      <td>0.040569</td>\n",
       "      <td>0.040557</td>\n",
       "      <td>0.046198</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052035</td>\n",
       "      <td>0.052035</td>\n",
       "      <td>0.052035</td>\n",
       "      <td>0.052035</td>\n",
       "      <td>0.048827</td>\n",
       "      <td>0.051112</td>\n",
       "      <td>0.033741</td>\n",
       "      <td>0.033363</td>\n",
       "      <td>0.033363</td>\n",
       "      <td>0.034467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACCHLAKA</th>\n",
       "      <td>0.050785</td>\n",
       "      <td>0.077612</td>\n",
       "      <td>0.043736</td>\n",
       "      <td>0.050785</td>\n",
       "      <td>0.050535</td>\n",
       "      <td>0.216629</td>\n",
       "      <td>0.305789</td>\n",
       "      <td>0.385049</td>\n",
       "      <td>0.150765</td>\n",
       "      <td>0.397554</td>\n",
       "      <td>...</td>\n",
       "      <td>0.124579</td>\n",
       "      <td>0.124579</td>\n",
       "      <td>0.124579</td>\n",
       "      <td>0.124579</td>\n",
       "      <td>0.124007</td>\n",
       "      <td>0.097200</td>\n",
       "      <td>0.051970</td>\n",
       "      <td>0.045665</td>\n",
       "      <td>0.045665</td>\n",
       "      <td>0.063569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPR</th>\n",
       "      <td>0.047186</td>\n",
       "      <td>0.108518</td>\n",
       "      <td>0.041570</td>\n",
       "      <td>0.047186</td>\n",
       "      <td>0.053111</td>\n",
       "      <td>0.101366</td>\n",
       "      <td>0.111385</td>\n",
       "      <td>0.100820</td>\n",
       "      <td>0.123275</td>\n",
       "      <td>0.128891</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052604</td>\n",
       "      <td>0.052604</td>\n",
       "      <td>0.052604</td>\n",
       "      <td>0.052604</td>\n",
       "      <td>0.050537</td>\n",
       "      <td>0.050905</td>\n",
       "      <td>0.071012</td>\n",
       "      <td>0.067523</td>\n",
       "      <td>0.067523</td>\n",
       "      <td>0.077248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVGYLQPRT</th>\n",
       "      <td>0.034253</td>\n",
       "      <td>0.051494</td>\n",
       "      <td>0.030995</td>\n",
       "      <td>0.034253</td>\n",
       "      <td>0.035273</td>\n",
       "      <td>0.075122</td>\n",
       "      <td>0.090740</td>\n",
       "      <td>0.087005</td>\n",
       "      <td>0.076090</td>\n",
       "      <td>0.101528</td>\n",
       "      <td>...</td>\n",
       "      <td>0.033389</td>\n",
       "      <td>0.033389</td>\n",
       "      <td>0.033389</td>\n",
       "      <td>0.033389</td>\n",
       "      <td>0.033349</td>\n",
       "      <td>0.029607</td>\n",
       "      <td>0.043535</td>\n",
       "      <td>0.040844</td>\n",
       "      <td>0.040844</td>\n",
       "      <td>0.048675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYV</th>\n",
       "      <td>0.038279</td>\n",
       "      <td>0.053743</td>\n",
       "      <td>0.035277</td>\n",
       "      <td>0.038279</td>\n",
       "      <td>0.038825</td>\n",
       "      <td>0.066829</td>\n",
       "      <td>0.076117</td>\n",
       "      <td>0.081250</td>\n",
       "      <td>0.071388</td>\n",
       "      <td>0.076305</td>\n",
       "      <td>...</td>\n",
       "      <td>0.051826</td>\n",
       "      <td>0.051826</td>\n",
       "      <td>0.051826</td>\n",
       "      <td>0.051826</td>\n",
       "      <td>0.051655</td>\n",
       "      <td>0.048461</td>\n",
       "      <td>0.087633</td>\n",
       "      <td>0.090868</td>\n",
       "      <td>0.090868</td>\n",
       "      <td>0.078235</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVH</th>\n",
       "      <td>0.054932</td>\n",
       "      <td>0.117858</td>\n",
       "      <td>0.047842</td>\n",
       "      <td>0.054932</td>\n",
       "      <td>0.057730</td>\n",
       "      <td>0.037492</td>\n",
       "      <td>0.041749</td>\n",
       "      <td>0.034766</td>\n",
       "      <td>0.040445</td>\n",
       "      <td>0.046134</td>\n",
       "      <td>...</td>\n",
       "      <td>0.030284</td>\n",
       "      <td>0.030284</td>\n",
       "      <td>0.030284</td>\n",
       "      <td>0.030284</td>\n",
       "      <td>0.029886</td>\n",
       "      <td>0.030914</td>\n",
       "      <td>0.050247</td>\n",
       "      <td>0.052319</td>\n",
       "      <td>0.052319</td>\n",
       "      <td>0.046045</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YYVWKSYVHV</th>\n",
       "      <td>0.048144</td>\n",
       "      <td>0.074219</td>\n",
       "      <td>0.045419</td>\n",
       "      <td>0.048144</td>\n",
       "      <td>0.052338</td>\n",
       "      <td>0.147539</td>\n",
       "      <td>0.168330</td>\n",
       "      <td>0.143611</td>\n",
       "      <td>0.187243</td>\n",
       "      <td>0.180384</td>\n",
       "      <td>...</td>\n",
       "      <td>0.074730</td>\n",
       "      <td>0.074730</td>\n",
       "      <td>0.074730</td>\n",
       "      <td>0.074730</td>\n",
       "      <td>0.074137</td>\n",
       "      <td>0.064389</td>\n",
       "      <td>0.083379</td>\n",
       "      <td>0.081737</td>\n",
       "      <td>0.081737</td>\n",
       "      <td>0.084716</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29403 rows × 233 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "loci            HLA-A                                                         \\\n",
       "genotype   HLA-A01:01 HLA-A01:02 HLA-A01:03 HLA-A01:09 HLA-A01:23 HLA-A02:01   \n",
       "Peptide                                                                        \n",
       "AAAYYVGY     0.105505   0.143522   0.082048   0.105505   0.104677   0.037018   \n",
       "AAAYYVGYL    0.066290   0.099294   0.060903   0.066290   0.068799   0.243041   \n",
       "AAAYYVGYLQ   0.050816   0.073134   0.043374   0.050816   0.048346   0.055380   \n",
       "AACCHLAK     0.042553   0.071591   0.035676   0.042553   0.041701   0.038747   \n",
       "AACCHLAKA    0.050785   0.077612   0.043736   0.050785   0.050535   0.216629   \n",
       "...               ...        ...        ...        ...        ...        ...   \n",
       "YYVGYLQPR    0.047186   0.108518   0.041570   0.047186   0.053111   0.101366   \n",
       "YYVGYLQPRT   0.034253   0.051494   0.030995   0.034253   0.035273   0.075122   \n",
       "YYVWKSYV     0.038279   0.053743   0.035277   0.038279   0.038825   0.066829   \n",
       "YYVWKSYVH    0.054932   0.117858   0.047842   0.054932   0.057730   0.037492   \n",
       "YYVWKSYVHV   0.048144   0.074219   0.045419   0.048144   0.052338   0.147539   \n",
       "\n",
       "loci                                                    ...      HLA-C  \\\n",
       "genotype   HLA-A02:02 HLA-A02:03 HLA-A02:04 HLA-A02:05  ... HLA-C17:02   \n",
       "Peptide                                                 ...              \n",
       "AAAYYVGY     0.046493   0.051890   0.035161   0.053552  ...   0.052115   \n",
       "AAAYYVGYL    0.452552   0.409592   0.224231   0.538487  ...   0.398911   \n",
       "AAAYYVGYLQ   0.091835   0.087515   0.050330   0.101244  ...   0.052227   \n",
       "AACCHLAK     0.039441   0.040569   0.040557   0.046198  ...   0.052035   \n",
       "AACCHLAKA    0.305789   0.385049   0.150765   0.397554  ...   0.124579   \n",
       "...               ...        ...        ...        ...  ...        ...   \n",
       "YYVGYLQPR    0.111385   0.100820   0.123275   0.128891  ...   0.052604   \n",
       "YYVGYLQPRT   0.090740   0.087005   0.076090   0.101528  ...   0.033389   \n",
       "YYVWKSYV     0.076117   0.081250   0.071388   0.076305  ...   0.051826   \n",
       "YYVWKSYVH    0.041749   0.034766   0.040445   0.046134  ...   0.030284   \n",
       "YYVWKSYVHV   0.168330   0.143611   0.187243   0.180384  ...   0.074730   \n",
       "\n",
       "loci                                                                         \\\n",
       "genotype   HLA-C17:03 HLA-C17:04 HLA-C17:05 HLA-C17:06 HLA-C17:07   HLA-C18   \n",
       "Peptide                                                                       \n",
       "AAAYYVGY     0.052115   0.052115   0.052115   0.049692   0.044106  0.028880   \n",
       "AAAYYVGYL    0.398911   0.398911   0.398911   0.398085   0.353328  0.105936   \n",
       "AAAYYVGYLQ   0.052227   0.052227   0.052227   0.050746   0.047234  0.030836   \n",
       "AACCHLAK     0.052035   0.052035   0.052035   0.048827   0.051112  0.033741   \n",
       "AACCHLAKA    0.124579   0.124579   0.124579   0.124007   0.097200  0.051970   \n",
       "...               ...        ...        ...        ...        ...       ...   \n",
       "YYVGYLQPR    0.052604   0.052604   0.052604   0.050537   0.050905  0.071012   \n",
       "YYVGYLQPRT   0.033389   0.033389   0.033389   0.033349   0.029607  0.043535   \n",
       "YYVWKSYV     0.051826   0.051826   0.051826   0.051655   0.048461  0.087633   \n",
       "YYVWKSYVH    0.030284   0.030284   0.030284   0.029886   0.030914  0.050247   \n",
       "YYVWKSYVHV   0.074730   0.074730   0.074730   0.074137   0.064389  0.083379   \n",
       "\n",
       "loci                                         \n",
       "genotype   HLA-C18:01 HLA-C18:02 HLA-C18:03  \n",
       "Peptide                                      \n",
       "AAAYYVGY     0.028737   0.028737   0.029137  \n",
       "AAAYYVGYL    0.082313   0.082313   0.145829  \n",
       "AAAYYVGYLQ   0.029171   0.029171   0.034168  \n",
       "AACCHLAK     0.033363   0.033363   0.034467  \n",
       "AACCHLAKA    0.045665   0.045665   0.063569  \n",
       "...               ...        ...        ...  \n",
       "YYVGYLQPR    0.067523   0.067523   0.077248  \n",
       "YYVGYLQPRT   0.040844   0.040844   0.048675  \n",
       "YYVWKSYV     0.090868   0.090868   0.078235  \n",
       "YYVWKSYVH    0.052319   0.052319   0.046045  \n",
       "YYVWKSYVHV   0.081737   0.081737   0.084716  \n",
       "\n",
       "[29403 rows x 233 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean ensemble of the two predictors: map each score s to 50000**(1 - s),\n",
    "# average the two in that space, then map the average back to the score scale.\n",
    "# NOTE(review): the `_nm` suffix suggests the mapped values are nM affinities - confirm.\n",
    "netmhc_pivot, mhcflurry_pivot = (\n",
    "    pd.read_pickle(pickle_path)\n",
    "    for pickle_path in (\n",
    "        'mhc1_haplotype_netmhc_pred_affinity_pivot.pkl.gz',\n",
    "        'mhc1_haplotype_mhcflurry2.0_pred_affinity_pivot.pkl.gz',\n",
    "    )\n",
    ")\n",
    "\n",
    "# The element-wise arithmetic below aligns on labels, so both pivots must have\n",
    "# the same shape and the same column and row labels (order is irrelevant).\n",
    "assert netmhc_pivot.shape == mhcflurry_pivot.shape\n",
    "assert set(netmhc_pivot.columns.tolist()) == set(mhcflurry_pivot.columns.tolist())\n",
    "assert set(netmhc_pivot.index.tolist()) == set(mhcflurry_pivot.index.tolist())\n",
    "\n",
    "log_base = 50000\n",
    "netmhc_pivot_nm, mhcflurry_pivot_nm = (\n",
    "    log_base ** (1 - score_pivot)\n",
    "    for score_pivot in (netmhc_pivot, mhcflurry_pivot)\n",
    ")\n",
    "\n",
    "ens_pivot_nm = (netmhc_pivot_nm + mhcflurry_pivot_nm) / 2\n",
    "# Inverse of the mapping above: s = 1 - log(x) / log(50000).\n",
    "ens_pivot = 1 - np.log(ens_pivot_nm) / np.log(log_base)\n",
    "\n",
    "ens_pivot.to_pickle('mhc1_netmhcpan4.0_mhcflurry2.0_ensemble_affinity.pkl.gz', protocol=2)\n",
    "ens_pivot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PUFFIN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>protein</th>\n",
       "      <th>sequence</th>\n",
       "      <th>start_pos</th>\n",
       "      <th>epi_len</th>\n",
       "      <th>entropy</th>\n",
       "      <th>perc_mutated</th>\n",
       "      <th>glyco_probs</th>\n",
       "      <th>crosses_cleavage</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>allele</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-A30:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763375</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B55:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763376</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B67:01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763377</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-A24:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763378</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B15:32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763379</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6763380 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        protein    sequence  start_pos  epi_len   entropy  perc_mutated  \\\n",
       "0             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "1             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "2             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "3             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "4             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "...         ...         ...        ...      ...       ...           ...   \n",
       "6763375      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763376      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763377      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763378      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763379      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "\n",
       "         glyco_probs  crosses_cleavage  sequence_length      allele  \n",
       "0                0.0                 0                8  HLA-B44:04  \n",
       "1                0.0                 0                8  HLA-B44:05  \n",
       "2                0.0                 0                8  HLA-B44:07  \n",
       "3                0.0                 0                8  HLA-A30:10  \n",
       "4                0.0                 0                8  HLA-B44:02  \n",
       "...              ...               ...              ...         ...  \n",
       "6763375          0.0                 0               10  HLA-B55:02  \n",
       "6763376          0.0                 0               10  HLA-B67:01  \n",
       "6763377          0.0                 0               10  HLA-A24:10  \n",
       "6763378          0.0                 0               10  HLA-B15:32  \n",
       "6763379          0.0                 0               10  HLA-B56:10  \n",
       "\n",
       "[6763380 rows x 10 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create dataframe with all MHC/peptide pairs.\n",
    "a = mhc1_data.copy()\n",
    "b = hla_alleles.copy()\n",
    "a['key'] = 0\n",
    "b['key'] = 0\n",
    "pmhc_pairs = a.merge(b, how='outer')\n",
    "pmhc_pairs = pmhc_pairs.drop(columns=['key'])\n",
    "pmhc_pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>protein</th>\n",
       "      <th>sequence</th>\n",
       "      <th>start_pos</th>\n",
       "      <th>epi_len</th>\n",
       "      <th>entropy</th>\n",
       "      <th>perc_mutated</th>\n",
       "      <th>glyco_probs</th>\n",
       "      <th>crosses_cleavage</th>\n",
       "      <th>sequence_length</th>\n",
       "      <th>allele</th>\n",
       "      <th>placeholder</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:04</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:05</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:07</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-A30:10</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>MYSFVSEE</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0.002908</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>HLA-B44:02</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763375</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B55:02</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763376</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B67:01</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763377</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-A24:10</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763378</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B15:32</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6763379</th>\n",
       "      <td>S2</td>\n",
       "      <td>VLKGVKLHYT</td>\n",
       "      <td>1263</td>\n",
       "      <td>10</td>\n",
       "      <td>0.011631</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>HLA-B56:10</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6763380 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        protein    sequence  start_pos  epi_len   entropy  perc_mutated  \\\n",
       "0             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "1             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "2             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "3             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "4             E    MYSFVSEE          0        8  0.002908      0.000213   \n",
       "...         ...         ...        ...      ...       ...           ...   \n",
       "6763375      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763376      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763377      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763378      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "6763379      S2  VLKGVKLHYT       1263       10  0.011631      0.000853   \n",
       "\n",
       "         glyco_probs  crosses_cleavage  sequence_length      allele  \\\n",
       "0                0.0                 0                8  HLA-B44:04   \n",
       "1                0.0                 0                8  HLA-B44:05   \n",
       "2                0.0                 0                8  HLA-B44:07   \n",
       "3                0.0                 0                8  HLA-A30:10   \n",
       "4                0.0                 0                8  HLA-B44:02   \n",
       "...              ...               ...              ...         ...   \n",
       "6763375          0.0                 0               10  HLA-B55:02   \n",
       "6763376          0.0                 0               10  HLA-B67:01   \n",
       "6763377          0.0                 0               10  HLA-A24:10   \n",
       "6763378          0.0                 0               10  HLA-B15:32   \n",
       "6763379          0.0                 0               10  HLA-B56:10   \n",
       "\n",
       "         placeholder  \n",
       "0                 -1  \n",
       "1                 -1  \n",
       "2                 -1  \n",
       "3                 -1  \n",
       "4                 -1  \n",
       "...              ...  \n",
       "6763375           -1  \n",
       "6763376           -1  \n",
       "6763377           -1  \n",
       "6763378           -1  \n",
       "6763379           -1  \n",
       "\n",
       "[6763380 rows x 11 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pmhc_pairs['placeholder'] = -1\n",
    "pmhc_pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the headerless, tab-separated PUFFIN input: sequence, placeholder, allele.\n",
    "# Create the output directory first so the export also works on a fresh checkout.\n",
    "os.makedirs('puffin_mhc1_preds', exist_ok=True)\n",
    "pmhc_pairs[['sequence', 'placeholder', 'allele']].to_csv(\n",
    "    'puffin_mhc1_preds/datafile.tsv', sep='\\t', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Shell commands to run PUFFIN on the `datafile.tsv` written by the previous cell:\n",
    "```\n",
    "cd /path/to/PUFFIN\n",
    "\n",
    "python preprocess.py \\\n",
    "    -i puffin/puffin_mhc1_preds/datafile.tsv \\\n",
    "    -o puffin/puffin_mhc1_preds/puffin_outdir \\\n",
    "    -c 1\n",
    "    \n",
    "python score.py \\\n",
    "    -o puffin/puffin_mhc1_preds/puffin_outdir \\\n",
    "    -c 1 \\\n",
    "    -g 0\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
