{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## This notebook translates all the MSAs ahead of time, so that we do not have to do it at loading time every time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os, sys\n",
    "\n",
    "\n",
    "sys.path.insert(1, \"./../util\")\n",
    "sys.path.insert(1, \"./../model\")\n",
    "from pseudolikelihood import get_npll2, get_npll_indep\n",
    "import torch, torchvision\n",
    "from biotite.structure.io import pdbx, pdb\n",
    "\n",
    "\n",
    "from ioutils import read_fasta, read_encodings\n",
    "from torch.nn.utils.rnn import pad_sequence\n",
    "from collections import defaultdict\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We are at iteration:2 out of 45124\r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_829818/1296144023.py:10: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
      "  msa = torch.tensor(msa, dtype=torch.uint8)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We are at iteration:22562 out of 45124\r"
     ]
    }
   ],
   "source": [
    "# Convert every MSA (.a3m) of the training split to a uint8 tensor and save\n",
    "# it next to the original file as <name>.pt.\n",
    "train_dir = \"./../split2/train/\"\n",
    "# List the directory once and keep only the MSA files, so that the progress\n",
    "# total counts the files that are actually converted rather than every entry\n",
    "# in the directory (which also holds the generated .pt files on a re-run).\n",
    "a3m_files = [fname for fname in os.listdir(train_dir) if fname.endswith(\"a3m\")]\n",
    "nfiles = len(a3m_files)\n",
    "iterator = 0\n",
    "for iterator, fname in enumerate(a3m_files, start=1):\n",
    "    print(f\"We are at iteration:{iterator} out of {nfiles}\", end=\"\\r\")\n",
    "    fpath = os.path.join(train_dir, fname)\n",
    "    # The second value returned by read_fasta is not needed here.\n",
    "    msa, q = read_fasta(fpath)\n",
    "    # as_tensor converts without the copy-construct UserWarning that\n",
    "    # torch.tensor() emits when `msa` is already a tensor; uint8 keeps the\n",
    "    # saved files small.\n",
    "    msa = torch.as_tensor(msa, dtype=torch.uint8)\n",
    "    fsave = os.path.join(train_dir, fname + \".pt\")\n",
    "    torch.save(msa, fsave)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We are at iteration:3 out of 2750\r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_829818/2810660318.py:10: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
      "  msa = torch.tensor(msa, dtype=torch.uint8)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We are at iteration:1375 out of 2750\r"
     ]
    }
   ],
   "source": [
    "#### Do for all test sets\n",
    "# Same conversion as for the training split: every MSA (.a3m) is stored as\n",
    "# a uint8 tensor in <name>.pt next to the original file.\n",
    "# NOTE: the variable is still called `train_dir`, but it points at the test split.\n",
    "train_dir = \"./../split2/test/structure\"\n",
    "# List the directory once and keep only the MSA files, so that the progress\n",
    "# total counts the files that are actually converted rather than every entry\n",
    "# in the directory (which also holds the generated .pt files on a re-run).\n",
    "a3m_files = [fname for fname in os.listdir(train_dir) if fname.endswith(\"a3m\")]\n",
    "nfiles = len(a3m_files)\n",
    "iterator = 0\n",
    "for iterator, fname in enumerate(a3m_files, start=1):\n",
    "    print(f\"We are at iteration:{iterator} out of {nfiles}\", end=\"\\r\")\n",
    "    fpath = os.path.join(train_dir, fname)\n",
    "    # The second value returned by read_fasta is not needed here.\n",
    "    msa, q = read_fasta(fpath)\n",
    "    ## Fundamental: storing as uint8 saves roughly 150GBs of space on disk.\n",
    "    # as_tensor converts without the copy-construct UserWarning that\n",
    "    # torch.tensor() emits when `msa` is already a tensor.\n",
    "    msa = torch.as_tensor(msa, dtype=torch.uint8)\n",
    "    fsave = os.path.join(train_dir, fname + \".pt\")\n",
    "    torch.save(msa, fsave)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "IF",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
