{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cededbf4",
   "metadata": {},
   "source": [
    "### Input data\n",
    "\n",
    "1. Download 'CoV-AbDab_080224.csv' from this [link](https://opig.stats.ox.ac.uk/webapps/covabdab/static/downloads/CoV-AbDab_080224.csv) and save it in this directory (the notebook reads it by file name).\n",
    "2. Use the 'covid_variants.fasta' file from this directory (originally from [here](https://viralzone.expasy.org/9556))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a084344c-cfe3-4ea2-84a2-5f209caaafc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from Bio import SeqIO\n",
    "\n",
    "# Load the CoV-AbDab export; the file is read from the working directory.\n",
    "full_df = pd.read_csv('CoV-AbDab_080224.csv')\n",
    "\n",
    "# Drop rows where either chain column holds the 'ND' placeholder. Both conditions\n",
    "# are evaluated on the same frame and combined, so neither filter overwrites the other\n",
    "# (previously the 'VL' filter restarted from full_df and discarded the 'VHorVHH' filter).\n",
    "has_both_chains = (full_df['VHorVHH'] != 'ND') & (full_df['VL'] != 'ND')\n",
    "ab_df = full_df[has_both_chains]\n",
    "\n",
    "# Keep only rows labelled 'Ab' in the 'Ab or Nb' column.\n",
    "ab_df = ab_df[ab_df['Ab or Nb'] == 'Ab']\n",
    "\n",
    "# Keep only the columns used downstream.\n",
    "ab_df = ab_df[['Neutralising Vs', 'Not Neutralising Vs', 'VHorVHH', 'VL', 'Origin', 'Protein + Epitope', 'Name']]\n",
    "\n",
    "# Replace missing values with '' so the ';'-separated columns can be split on every row.\n",
    "ab_df = ab_df.fillna('')\n",
    "\n",
    "# Renumber rows from 0; the previous index is kept as an 'index' column (same as before).\n",
    "ab_df = ab_df.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "e1e91e55-e00b-43ec-bcc1-cca847d3f804",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Allow-list of exact 'Origin' strings; rows with any other origin are filtered out later.\n",
    "sources_to_keep = [\n",
    "    'B-cells; SARS-CoV2_WT Convalescent Patient (Unvaccinated)',\n",
    "    'B-cells; SARS-CoV1 Human Patient; SARS-CoV2 Vaccinee',\n",
    "    'B-cells; SARS-CoV2_WT Convalescent Patients',\n",
    "    'B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV)',\n",
    "    'B-cells; SARS-CoV2_WT Vaccinee',\n",
    "    'B-cells; SARS-CoV2_WT Human Patient',\n",
    "    'B-cells; Unvaccinated SARS-CoV2_WT Human Patient',\n",
    "    'B-cells; SARS-CoV2_Gamma Human Patient',\n",
    "    'B-cells; SARS-CoV1 Human Patient',\n",
    "    'B-cells (SARS-CoV2_Beta Human Patient)',\n",
    "]\n",
    "\n",
    "# Allow-list of exact 'Protein + Epitope' strings.\n",
    "# NOTE(review): \"S: RBD\" (colon) looks like a spelling variant of \"S; RBD\" - confirm it occurs in the CSV.\n",
    "binding_to_keep = [\n",
    "    \"S; RBD\",\n",
    "    \"S: RBD\",\n",
    "    \"S; RBD/NTD\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "45ef5e95-f066-46e8-83f2-1e6cf1943c0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep antibodies whose origin AND epitope both appear in the allow-lists.\n",
    "origin_ok = ab_df['Origin'].isin(sources_to_keep)\n",
    "epitope_ok = ab_df['Protein + Epitope'].isin(binding_to_keep)\n",
    "source_df = ab_df[origin_ok & epitope_ok]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "a5337c99-969d-4aef-b529-49c403cf2781",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expand each antibody row into one row per (antibody, antigen) pair.\n",
    "# Antigens listed under 'Neutralising Vs' get target 1, those under\n",
    "# 'Not Neutralising Vs' get target 0. Within one antibody, the neutralising\n",
    "# entries are emitted before the non-neutralising ones.\n",
    "heavy_chains, light_chains, antigens = [], [], []\n",
    "target, names, origins = [], [], []\n",
    "\n",
    "# (source column, label) pairs, processed in this order for every antibody.\n",
    "label_columns = (('Neutralising Vs', 1), ('Not Neutralising Vs', 0))\n",
    "\n",
    "for _, row in source_df.iterrows():\n",
    "    for column, label in label_columns:\n",
    "        for antigen in row[column].split(';'):\n",
    "            # Splitting an empty cell yields [''], which is not an antigen.\n",
    "            if not antigen:\n",
    "                continue\n",
    "            names.append(row['Name'])\n",
    "            origins.append(row['Origin'])\n",
    "            heavy_chains.append(row['VHorVHH'])\n",
    "            light_chains.append(row['VL'])\n",
    "            antigens.append(antigen)\n",
    "            target.append(label)\n",
    "\n",
    "interaction_df = pd.DataFrame({\n",
    "    'names': names,\n",
    "    'origins': origins,\n",
    "    'heavy': heavy_chains,\n",
    "    'light': light_chains,\n",
    "    'antigens': antigens,\n",
    "    'target': target,\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "bc0b0bf4-8422-4058-be8a-afb1c32f0241",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Canonical variant name -> raw antigen labels that are folded into it.\n",
    "# Odd spellings (e.g. 'SRAS-CoV2_Beta', 'SARS_COV2_Omicron-BA2') are listed on purpose,\n",
    "# presumably because they appear that way in the raw data - verify against the CSV.\n",
    "grouping_dict = {\n",
    "    'SARS-CoV2_WT': [\n",
    "        'SARS-CoV2_WT', 'SARS-CoV2_WT (weak)', 'SARS-CoV2_WT and SARS-CoV1',\n",
    "        'SARS-CoV2_WT, SARS-CoV2_Delta', 'SARS-CoV2_WT_Delta (weak)',\n",
    "        'SARS-CoV2_WT (weak) , SARS-CoV2_Delta (weak)',\n",
    "    ],\n",
    "    'SARS-CoV2_Alpha': ['SARS-CoV2_Alpha', 'SARS-CoV2_Alpha (weak)'],\n",
    "    'SARS-CoV2_Beta': ['SARS-CoV2_Beta', 'SARS-CoV2_Beta (weak)', 'SRAS-CoV2_Beta'],\n",
    "    'SARS-CoV2_Delta': ['SARS-CoV2_Delta', 'SARS-CoV2_Delta (weak)'],\n",
    "    'SARS-CoV2_Epsilon': ['SARS-CoV2_Epsilon', 'SARS-CoV2_Epsilon (weak)'],\n",
    "    'SARS-CoV2_Gamma': ['SARS_CoV2_Gamma', 'SARS-CoV2_Gamma', 'SARS-CoV2_Gamma (weak)'],\n",
    "    'SARS-CoV2_Eta': ['SARS-CoV2_Eta', 'SARS-CoV2_Eta (weak)'],\n",
    "    'SARS-CoV2_Iota': ['SARS-CoV2_Iota', 'SARS-CoV2_Iota (weak)'],\n",
    "    'SARS-CoV2_Lambda': ['SARS-CoV2_Lambda', 'SARS-CoV2_Lambda (weak)'],\n",
    "    'SARS-CoV2_Kappa': ['SARS-CoV2_Kappa', 'SARS-CoV2_Kappa (weak)'],\n",
    "    'SARS-CoV2_Omicron-BA1': [\n",
    "        'SARS-CoV2_Omicron-BA1', 'SARS-CoV2_Omicron-BA1 (weak)', 'SARS-CoV2_Omicron_BA1',\n",
    "        'SARS-CoV2_Omicron_BA1.1', 'SARS-CoV2_Omicron-BA1.1 (weak)',\n",
    "    ],\n",
    "    'SARS-CoV2_Omicron-BA2': [\n",
    "        'SARS-CoV2_Omicron-BA2', 'SARS-CoV2_Omicron-BA2 (weak)', 'SARS_COV2_Omicron-BA2',\n",
    "        'SARS-CoV2_Omicron-BA2.12.1', 'SARS-CoV2_Omicron-BA2.12.1 (weak)',\n",
    "        'SARS-CoV2_Omicron-BA2.75', 'SARS-CoV2_Omicron-BA2.38', 'SARS-CoV2_Omicron-BA2.38 (weak)',\n",
    "        'SARS-CoV2_Omicron-BA2.75.1', 'SARS-CoV2_Omicron-BA2.75.5',\n",
    "        'SARS-CoV2_Omicron-BA2.75.5 (weak)',\n",
    "    ],\n",
    "    'SARS-CoV2_Omicron-BA4': [\n",
    "        'SARS-CoV2_Omicron-BA4', 'SARS-CoV2_Omicron-BA4 (weak)', 'SARS-CoV2_Omicron-BA4/BA',\n",
    "        'SARS-CoV2_Omicron-BA4.6', 'SARS-CoV2_Omicron-BA4.6 (weak)',\n",
    "        'SARS-CoV2_Omicron-BA4.7', 'SARS-CoV2_Omicron-BA4.7 (weak)',\n",
    "    ],\n",
    "    'SARS-CoV2_Omicron-BA5': [\n",
    "        'SARS-CoV2_Omicron-BA5', 'SARS-CoV2_Omicron-BA5 (weak)',\n",
    "        'SARS-CoV2_Omicron-BA5.9', 'SARS-CoV2_Omicron-BA5.9 (weak)',\n",
    "    ],\n",
    "    'SARS-CoV2_Omicron-XBB': ['SARS-CoV2_Omicron-XBB'],\n",
    "}\n",
    "\n",
    "# Canonical groups held out as the test set; every other group is used for training.\n",
    "# NOTE(review): 'SARS-CoV2_Omicron-XBB' is not listed, so XBB rows land in the\n",
    "# training split - confirm this is intended.\n",
    "test_groups = [\n",
    "    'SARS-CoV2_Omicron-BA1',\n",
    "    'SARS-CoV2_Omicron-BA2',\n",
    "    'SARS-CoV2_Omicron-BA4',\n",
    "    'SARS-CoV2_Omicron-BA5',\n",
    "]\n",
    "\n",
    "# Inverse lookup: raw antigen label -> canonical group name.\n",
    "reversed_dict = {\n",
    "    raw_label: group\n",
    "    for group, raw_labels in grouping_dict.items()\n",
    "    for raw_label in raw_labels\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "a1bb6165-ba8f-433a-83fa-f1775f5e204e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each raw antigen label to its canonical group; unknown labels become pd.NA.\n",
    "interaction_df['groups'] = interaction_df['antigens'].map(\n",
    "    lambda antigen: reversed_dict.get(antigen, pd.NA)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "6c8423e2-0ee4-4cb1-a5d4-3afe136349d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discard rows that contain a missing value in any column.\n",
    "interaction_df = interaction_df.dropna(axis=0, how='any')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "e49ce353-4b52-41f6-9bfe-c5d9f89adef8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every record of the FASTA file into a list.\n",
    "records = [record for record in SeqIO.parse(\"covid_variants.fasta\", \"fasta\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "9ee43269-b9f7-4578-b05c-e1c1cd703a25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record id -> sequence as a plain string.\n",
    "variant_dict = dict((record.id, str(record.seq)) for record in records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "66ffa16f-87b9-45a2-94a2-f0b523b90b4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the sequence of each row's group; a group missing from variant_dict raises KeyError.\n",
    "interaction_df['covid_seq'] = interaction_df['groups'].map(lambda group: variant_dict[group])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "3bdb3f4c-36da-480d-b217-b6a63ea3f39d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep one row per (antibody name, group); the first occurrence wins.\n",
    "dedup_keys = ['names', 'groups']\n",
    "interaction_df = interaction_df.drop_duplicates(subset=dedup_keys, keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "6276e8d8-6363-43d7-b8e9-87672cbbc7e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows whose group is in test_groups form the test set; all remaining rows form the training set.\n",
    "is_test = interaction_df['groups'].isin(test_groups)\n",
    "train_interaction_df = interaction_df[~is_test]\n",
    "test_interaction_df = interaction_df[is_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "06a34df4-e336-4a30-8f77-c41d077af82c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the training split; the DataFrame index is written as the first column.\n",
    "train_interaction_df.to_csv('processed_data_train.csv', index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "193430bf-c653-4e40-81c9-a69d51308db5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the test split; the DataFrame index is written as the first column.\n",
    "test_interaction_df.to_csv('processed_data_test.csv', index=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
