{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "02c7951d-3fcb-4633-84a2-ba6d5a41d71e",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🧬 Starting Immunogenomics Pathogen Detection Pipeline\n",
      "============================================================\n",
      "=== Environment Setup ===\n",
      "✓ fasta folder already exists\n",
      "✓ CDR3_MHC.csv file found\n",
      "✓ infer.py file found\n",
      "=== Downloading Pathogen Proteomes from UniProt ===\n",
      "✓ Mycobacterium tuberculosis proteome already exists\n",
      "✓ Salmonella enterica proteome already exists\n",
      "✓ Escherichia coli proteome already exists\n",
      "✓ Staphylococcus aureus proteome already exists\n",
      "✓ Candida albicans proteome already exists\n",
      "✓ Aspergillus fumigatus proteome already exists\n",
      "✓ Human immunodeficiency virus 1 proteome already exists\n",
      "✓ Hepatitis B virus proteome already exists\n",
      "✓ Epstein-Barr virus proteome already exists\n",
      "✓ Cytomegalovirus proteome already exists\n",
      "=== Building Pathogen Peptide Database ===\n",
      "✓ Mycobacterium tuberculosis: 729 peptides\n",
      "✓ Salmonella enterica: 711 peptides\n",
      "✓ Escherichia coli: 717 peptides\n",
      "✓ Staphylococcus aureus: 717 peptides\n",
      "✓ Candida albicans: 648 peptides\n",
      "✓ Aspergillus fumigatus: 482 peptides\n",
      "✓ Human immunodeficiency virus 1: 630 peptides\n",
      "✓ Hepatitis B virus: 561 peptides\n",
      "✓ Epstein-Barr virus: 624 peptides\n",
      "✓ Cytomegalovirus: 441 peptides\n",
      "✓ Total loaded 6260 peptides from 10 organisms\n",
      "=== Loading CDR3-MHC Data ===\n",
      "✓ Loaded 60 CDR3-MHC samples\n",
      "=== Generating Peptide Candidates ===\n",
      "🔬 Processing sample 1/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 2/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 3/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 4/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 5/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 6/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 7/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 8/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 9/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 10/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 11/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 12/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 13/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 14/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 15/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 16/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 17/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 18/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 19/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 20/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 21/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 22/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 23/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 24/60\n",
      "  ✓ Generated 19 peptide candidates\n",
      "🔬 Processing sample 25/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 26/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 27/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 28/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 29/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 30/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 31/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 32/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 33/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 34/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 35/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 36/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 37/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 38/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 39/60\n",
      "  ✓ Generated 19 peptide candidates\n",
      "🔬 Processing sample 40/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 41/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 42/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 43/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 44/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 45/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 46/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 47/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 48/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 49/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 50/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 51/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 52/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 53/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 54/60\n",
      "  ✓ Generated 19 peptide candidates\n",
      "🔬 Processing sample 55/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 56/60\n",
      "  ✓ Generated 5 peptide candidates\n",
      "🔬 Processing sample 57/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 58/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 59/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "🔬 Processing sample 60/60\n",
      "  ✓ Generated 20 peptide candidates\n",
      "=== Pathogen Matching Analysis ===\n",
      "🔍 Matching sample 1\n",
      "  ➖ Sample 1: No positive matches\n",
      "🔍 Matching sample 2\n",
      "  ➖ Sample 2: No positive matches\n",
      "🔍 Matching sample 3\n",
      "  ➖ Sample 3: No positive matches\n",
      "🔍 Matching sample 4\n",
      "  ➖ Sample 4: No positive matches\n",
      "🔍 Matching sample 5\n",
      "  ➖ Sample 5: No positive matches\n",
      "🔍 Matching sample 6\n",
      "  ➖ Sample 6: No positive matches\n",
      "🔍 Matching sample 7\n",
      "  ➖ Sample 7: No positive matches\n",
      "🔍 Matching sample 8\n",
      "  ➖ Sample 8: No positive matches\n",
      "🔍 Matching sample 9\n",
      "  ➖ Sample 9: No positive matches\n",
      "🔍 Matching sample 10\n",
      "  ➖ Sample 10: No positive matches\n",
      "🔍 Matching sample 11\n",
      "  🎯 Matched to Mycobacterium tuberculosis: 0.231\n",
      "  ✓ Sample 11: 1 positive matches\n",
      "🔍 Matching sample 12\n",
      "  ➖ Sample 12: No positive matches\n",
      "🔍 Matching sample 13\n",
      "  ➖ Sample 13: No positive matches\n",
      "🔍 Matching sample 14\n",
      "  ➖ Sample 14: No positive matches\n",
      "🔍 Matching sample 15\n",
      "  ➖ Sample 15: No positive matches\n",
      "🔍 Matching sample 16\n",
      "  ➖ Sample 16: No positive matches\n",
      "🔍 Matching sample 17\n",
      "  ➖ Sample 17: No positive matches\n",
      "🔍 Matching sample 18\n",
      "  ➖ Sample 18: No positive matches\n",
      "🔍 Matching sample 19\n",
      "  ➖ Sample 19: No positive matches\n",
      "🔍 Matching sample 20\n",
      "  ➖ Sample 20: No positive matches\n",
      "🔍 Matching sample 21\n",
      "  ➖ Sample 21: No positive matches\n",
      "🔍 Matching sample 22\n",
      "  ➖ Sample 22: No positive matches\n",
      "🔍 Matching sample 23\n",
      "  ➖ Sample 23: No positive matches\n",
      "🔍 Matching sample 24\n",
      "  ➖ Sample 24: No positive matches\n",
      "🔍 Matching sample 25\n",
      "  ➖ Sample 25: No positive matches\n",
      "🔍 Matching sample 26\n",
      "  ➖ Sample 26: No positive matches\n",
      "🔍 Matching sample 27\n",
      "  ➖ Sample 27: No positive matches\n",
      "🔍 Matching sample 28\n",
      "  ➖ Sample 28: No positive matches\n",
      "🔍 Matching sample 29\n",
      "  🎯 Matched to Human immunodeficiency virus 1: 0.316\n",
      "  ✓ Sample 29: 1 positive matches\n",
      "🔍 Matching sample 30\n",
      "  ➖ Sample 30: No positive matches\n",
      "🔍 Matching sample 31\n",
      "  ➖ Sample 31: No positive matches\n",
      "🔍 Matching sample 32\n",
      "  ➖ Sample 32: No positive matches\n",
      "🔍 Matching sample 33\n",
      "  ➖ Sample 33: No positive matches\n",
      "🔍 Matching sample 34\n",
      "  ➖ Sample 34: No positive matches\n",
      "🔍 Matching sample 35\n",
      "  ➖ Sample 35: No positive matches\n",
      "🔍 Matching sample 36\n",
      "  🎯 Matched to Escherichia coli: 0.100\n",
      "  🎯 Matched to Escherichia coli: 0.100\n",
      "  ✓ Sample 36: 2 positive matches\n",
      "🔍 Matching sample 37\n",
      "  ➖ Sample 37: No positive matches\n",
      "🔍 Matching sample 38\n",
      "  ➖ Sample 38: No positive matches\n",
      "🔍 Matching sample 39\n",
      "  ➖ Sample 39: No positive matches\n",
      "🔍 Matching sample 40\n",
      "  ➖ Sample 40: No positive matches\n",
      "🔍 Matching sample 41\n",
      "  ➖ Sample 41: No positive matches\n",
      "🔍 Matching sample 42\n",
      "  ➖ Sample 42: No positive matches\n",
      "🔍 Matching sample 43\n",
      "  ➖ Sample 43: No positive matches\n",
      "🔍 Matching sample 44\n",
      "  ➖ Sample 44: No positive matches\n",
      "🔍 Matching sample 45\n",
      "  ➖ Sample 45: No positive matches\n",
      "🔍 Matching sample 46\n",
      "  ➖ Sample 46: No positive matches\n",
      "🔍 Matching sample 47\n",
      "  ➖ Sample 47: No positive matches\n",
      "🔍 Matching sample 48\n",
      "  ➖ Sample 48: No positive matches\n",
      "🔍 Matching sample 49\n",
      "  🎯 Matched to Escherichia coli: 0.273\n",
      "  ✓ Sample 49: 1 positive matches\n",
      "🔍 Matching sample 50\n",
      "  ➖ Sample 50: No positive matches\n",
      "🔍 Matching sample 51\n",
      "  🎯 Matched to Escherichia coli: 0.273\n",
      "  ✓ Sample 51: 1 positive matches\n",
      "🔍 Matching sample 52\n",
      "  ➖ Sample 52: No positive matches\n",
      "🔍 Matching sample 53\n",
      "  🎯 Matched to Mycobacterium tuberculosis: 0.174\n",
      "  ✓ Sample 53: 1 positive matches\n",
      "🔍 Matching sample 54\n",
      "  ➖ Sample 54: No positive matches\n",
      "🔍 Matching sample 55\n",
      "  ➖ Sample 55: No positive matches\n",
      "🔍 Matching sample 56\n",
      "  ➖ Sample 56: No positive matches\n",
      "🔍 Matching sample 57\n",
      "  ➖ Sample 57: No positive matches\n",
      "🔍 Matching sample 58\n",
      "  ➖ Sample 58: No positive matches\n",
      "🔍 Matching sample 59\n",
      "  ➖ Sample 59: No positive matches\n",
      "🔍 Matching sample 60\n",
      "  ➖ Sample 60: No positive matches\n",
      "=== Generating Detection Report ===\n",
      "✓ HTML report generated: pathogen_detection_report_20250921_153551.html\n",
      "✓ PDF report generated: pathogen_detection_report_20250921_153551.pdf\n",
      "\n",
      "============================================================\n",
      "🎉 Detection Complete!\n",
      "📊 Total samples: 60\n",
      "🎯 Positive samples: 6\n",
      "🦠 Match count: 7\n",
      "📄 Report file: pathogen_detection_report_20250921_153551.html\n",
      "============================================================\n",
      "\n",
      "✅ Pathogen detection pipeline executed successfully!\n",
      "📁 Please check the generated HTML and PDF report files\n"
     ]
    }
   ],
   "source": [
    "#!/usr/bin/env python3\n",
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "Immunogenomics Pathogen Detection Pipeline\n",
    "Generate peptides from CDR3-MHC combinations and match against pathogen database\n",
    "\"\"\"\n",
    "\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import subprocess\n",
    "import requests\n",
    "import gzip\n",
    "from io import StringIO\n",
    "from Bio import SeqIO\n",
    "from Bio.SeqRecord import SeqRecord\n",
    "from Bio.Seq import Seq\n",
    "import difflib\n",
    "from datetime import datetime\n",
    "import json\n",
    "from collections import defaultdict, Counter\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from matplotlib.font_manager import FontProperties\n",
    "import warnings\n",
    "import time\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Report generation modules\n",
    "from jinja2 import Template\n",
    "import base64\n",
    "from io import BytesIO\n",
    "try:\n",
    "    import weasyprint\n",
    "    WEASYPRINT_AVAILABLE = True\n",
    "except ImportError:\n",
    "    WEASYPRINT_AVAILABLE = False\n",
    "    print(\"Warning: weasyprint not available, PDF generation will be skipped\")\n",
    "\n",
    "class PathogenDetectionPipeline:\n",
    "    \"\"\"Pathogen Detection Pipeline\"\"\"\n",
    "    \n",
    "    def __init__(self,\n",
    "                 similarity_threshold=0.95,\n",
    "                 num_candidates=5,\n",
    "                 temperature=0.8,\n",
    "                 top_k=10,\n",
    "                 verbose=False):\n",
    "        \"\"\"\n",
    "        Store the run settings and register the reference pathogen panel.\n",
    "\n",
    "        Args:\n",
    "            similarity_threshold: peptide matching similarity threshold\n",
    "            num_candidates: number of peptide candidates requested per\n",
    "                CDR3-MHC combination\n",
    "            temperature: generation temperature\n",
    "            top_k: top-k sampling value\n",
    "            verbose: whether to request detailed output\n",
    "        \"\"\"\n",
    "        # Run settings, kept verbatim on the instance\n",
    "        self.similarity_threshold = similarity_threshold\n",
    "        self.num_candidates = num_candidates\n",
    "        self.temperature = temperature\n",
    "        self.top_k = top_k\n",
    "        self.verbose = verbose\n",
    "\n",
    "        # Reference panel, one row per organism:\n",
    "        # (key, display name, kind, UniProt proteome ID, description).\n",
    "        # The key is also the stem of the FASTA file stored under fasta/.\n",
    "        panel = (\n",
    "            ('Mycobacterium_tuberculosis', 'Mycobacterium tuberculosis', 'Bacteria',\n",
    "             'UP000001584', 'Causes tuberculosis and other diseases'),\n",
    "            ('Salmonella_enterica', 'Salmonella enterica', 'Bacteria',\n",
    "             'UP000008962', 'Causes food poisoning and gastroenteritis'),\n",
    "            ('Escherichia_coli', 'Escherichia coli', 'Bacteria',\n",
    "             'UP000000625', 'Causes intestinal infections and sepsis'),\n",
    "            ('Staphylococcus_aureus', 'Staphylococcus aureus', 'Bacteria',\n",
    "             'UP000008816', 'Causes skin infections, pneumonia, etc.'),\n",
    "            ('Candida_albicans', 'Candida albicans', 'Fungus',\n",
    "             'UP000000559', 'Causes oral and genital infections'),\n",
    "            ('Aspergillus_fumigatus', 'Aspergillus fumigatus', 'Fungus',\n",
    "             'UP000002530', 'Causes pulmonary infections'),\n",
    "            ('Human_immunodeficiency_virus_1', 'Human immunodeficiency virus 1', 'Virus',\n",
    "             'UP000090981', 'Causes acquired immunodeficiency syndrome'),\n",
    "            ('Hepatitis_B_virus', 'Hepatitis B virus', 'Virus',\n",
    "             'UP000007884', 'Causes acute and chronic hepatitis'),\n",
    "            ('Epstein_Barr_virus', 'Epstein-Barr virus', 'Virus',\n",
    "             'UP000000552', 'Causes infectious mononucleosis'),\n",
    "            ('Cytomegalovirus', 'Cytomegalovirus', 'Virus',\n",
    "             'UP000000968', 'Causes congenital infections and immunodeficiency-related infections'),\n",
    "        )\n",
    "        field_names = ('name', 'type', 'uniprot_proteome_id', 'description')\n",
    "        self.pathogens_info = {\n",
    "            key: dict(zip(field_names, fields))\n",
    "            for key, *fields in panel\n",
    "        }\n",
    "\n",
    "        # Start empty; filled by later pipeline stages\n",
    "        self.pathogen_peptides = {}\n",
    "        self.results = []\n",
    "        \n",
    "    def setup_environment(self):\n",
    "        \"\"\"\n",
    "        Prepare the working directory and verify the required input files.\n",
    "\n",
    "        Creates the fasta folder when it is missing, then looks for\n",
    "        CDR3_MHC.csv and infer.py in the current directory.\n",
    "\n",
    "        Returns:\n",
    "            bool: True when both files are present; False as soon as one of\n",
    "            them is found to be missing.\n",
    "        \"\"\"\n",
    "        print(\"=== Environment Setup ===\")\n",
    "\n",
    "        # Proteome FASTA files are kept under ./fasta\n",
    "        if os.path.exists('fasta'):\n",
    "            print(\"✓ fasta folder already exists\")\n",
    "        else:\n",
    "            os.makedirs('fasta')\n",
    "            print(\"✓ Created fasta folder\")\n",
    "\n",
    "        # The CDR3-MHC input table is mandatory\n",
    "        if not os.path.exists('CDR3_MHC.csv'):\n",
    "            for message in (\n",
    "                \"❌ CDR3_MHC.csv file not found!\",\n",
    "                \"Please ensure CDR3_MHC.csv file exists in the current directory\",\n",
    "                \"The file should contain columns: CDR3, MHC\",\n",
    "            ):\n",
    "                print(message)\n",
    "            return False\n",
    "        print(\"✓ CDR3_MHC.csv file found\")\n",
    "\n",
    "        # So is the inference script\n",
    "        if not os.path.exists('infer.py'):\n",
    "            print(\"❌ infer.py file not found, please ensure it's in the current directory\")\n",
    "            return False\n",
    "        print(\"✓ infer.py file found\")\n",
    "\n",
    "        return True\n",
    "    \n",
    "    def download_pathogen_proteomes(self):\n",
    "        \"\"\"\n",
    "        Make sure every registered pathogen has a proteome FASTA under fasta/.\n",
    "\n",
    "        A pathogen whose fasta/<pathogen_id>.fasta already exists is skipped.\n",
    "        The others are fetched through download_uniprot_proteome. An exception\n",
    "        raised for one pathogen is reported and the loop moves on to the next.\n",
    "        \"\"\"\n",
    "        print(\"=== Downloading Pathogen Proteomes from UniProt ===\")\n",
    "\n",
    "        for pathogen_id, info in self.pathogens_info.items():\n",
    "            fasta_file = f\"fasta/{pathogen_id}.fasta\"\n",
    "\n",
    "            # Only fetch what is not on disk yet\n",
    "            if not os.path.exists(fasta_file):\n",
    "                print(f\"📥 Downloading {info['name']} proteome from UniProt...\")\n",
    "\n",
    "                try:\n",
    "                    downloaded = self.download_uniprot_proteome(\n",
    "                        info['uniprot_proteome_id'],\n",
    "                        fasta_file,\n",
    "                        info['name']\n",
    "                    )\n",
    "                    outcome = (\n",
    "                        f\"✓ {info['name']} proteome download complete\"\n",
    "                        if downloaded\n",
    "                        else f\"❌ Failed to download {info['name']} proteome\"\n",
    "                    )\n",
    "                    print(outcome)\n",
    "                except Exception as e:\n",
    "                    print(f\"❌ Error downloading {info['name']}: {str(e)}\")\n",
    "            else:\n",
    "                print(f\"✓ {info['name']} proteome already exists\")\n",
    "    \n",
    "    def download_uniprot_proteome(self, proteome_id, output_file, organism_name, max_retries=3):\n",
    "        \"\"\"\n",
    "        Download a proteome from the UniProt REST API and save it as FASTA.\n",
    "\n",
    "        Args:\n",
    "            proteome_id: UniProt proteome identifier placed in the query\n",
    "            output_file: path the FASTA text is written to\n",
    "            organism_name: display name used in progress messages and handed\n",
    "                to create_fallback_proteome when every attempt fails\n",
    "            max_retries: maximum number of HTTP attempts\n",
    "\n",
    "        Returns:\n",
    "            bool: True once a FASTA file is in place. That includes the case\n",
    "            where every attempt failed and a fallback proteome was written\n",
    "            instead. False when the server answers HTTP 400 or 404; nothing\n",
    "            is written in that case.\n",
    "        \"\"\"\n",
    "\n",
    "        # UniProt REST API URL for proteome download\n",
    "        base_url = \"https://rest.uniprot.org/uniprotkb/stream\"\n",
    "        params = {\n",
    "            'format': 'fasta',\n",
    "            'query': f'proteome:{proteome_id}',\n",
    "            # Meant to limit the download to 500 proteins.\n",
    "            # NOTE(review): confirm that the stream endpoint honors 'size'.\n",
    "            'size': '500'\n",
    "        }\n",
    "\n",
    "        for attempt in range(max_retries):\n",
    "            try:\n",
    "                print(f\"  Attempt {attempt + 1}/{max_retries} for {organism_name}\")\n",
    "\n",
    "                response = requests.get(base_url, params=params, timeout=30)\n",
    "\n",
    "                if response.status_code == 200:\n",
    "                    # Leading whitespace must not make a valid payload look invalid\n",
    "                    content = response.text.lstrip()\n",
    "                    if content.startswith('>'):\n",
    "                        # Write to a side file and rename it into place, so an\n",
    "                        # interrupted write never leaves a truncated FASTA that\n",
    "                        # a later existence check would take for a full proteome.\n",
    "                        partial_file = f\"{output_file}.part\"\n",
    "                        try:\n",
    "                            with open(partial_file, 'w') as f:\n",
    "                                f.write(content)\n",
    "                            os.replace(partial_file, output_file)\n",
    "                        finally:\n",
    "                            if os.path.exists(partial_file):\n",
    "                                os.remove(partial_file)\n",
    "\n",
    "                        # Verify the file was written correctly\n",
    "                        if os.path.exists(output_file) and os.path.getsize(output_file) > 0:\n",
    "                            # One record per header line; a bare count of '>'\n",
    "                            # would also count '>' inside descriptions.\n",
    "                            seq_count = sum(\n",
    "                                1 for line in content.splitlines()\n",
    "                                if line.startswith('>')\n",
    "                            )\n",
    "                            print(f\"  ✓ Downloaded {seq_count} protein sequences\")\n",
    "                            return True\n",
    "                        else:\n",
    "                            print(f\"  ❌ File write failed\")\n",
    "\n",
    "                    else:\n",
    "                        print(f\"  ❌ Invalid FASTA format received\")\n",
    "\n",
    "                elif response.status_code == 400:\n",
    "                    # Retrying cannot help; give up without a fallback\n",
    "                    print(f\"  ❌ Bad request - invalid proteome ID: {proteome_id}\")\n",
    "                    return False\n",
    "\n",
    "                elif response.status_code == 404:\n",
    "                    print(f\"  ❌ Proteome not found: {proteome_id}\")\n",
    "                    return False\n",
    "\n",
    "                else:\n",
    "                    print(f\"  ⚠️ HTTP {response.status_code}: {response.reason}\")\n",
    "\n",
    "            except requests.exceptions.Timeout:\n",
    "                print(f\"  ⏰ Request timeout (attempt {attempt + 1})\")\n",
    "\n",
    "            except requests.exceptions.RequestException as e:\n",
    "                print(f\"  ❌ Request error: {str(e)}\")\n",
    "\n",
    "            except Exception as e:\n",
    "                print(f\"  ❌ Unexpected error: {str(e)}\")\n",
    "\n",
    "            # Wait before retry\n",
    "            if attempt < max_retries - 1:\n",
    "                wait_time = 2 ** attempt  # Exponential backoff: 1, 2, 4, ... seconds\n",
    "                print(f\"  ⏳ Waiting {wait_time} seconds before retry...\")\n",
    "                time.sleep(wait_time)\n",
    "\n",
    "        # If all attempts failed, create a minimal fallback file\n",
    "        print(f\"  ⚠️ All download attempts failed, creating minimal fallback for {organism_name}\")\n",
    "        self.create_fallback_proteome(output_file, organism_name)\n",
    "        return True\n",
    "    \n",
    "    def create_fallback_proteome(self, output_file, organism_name):\n",
    "        \"\"\"\n",
    "        Write a minimal built-in FASTA file for use when the download fails.\n",
    "\n",
    "        The entry set is chosen by case-insensitive substring match of the\n",
    "        table keys against organism_name; the first matching key wins. An\n",
    "        organism that matches no key receives the 'virus' entries.\n",
    "\n",
    "        Args:\n",
    "            output_file: path of the FASTA file to create (overwritten)\n",
    "            organism_name: display name used to pick the entry set\n",
    "        \"\"\"\n",
    "\n",
    "        # Built-in entries per organism group; each string is one FASTA record\n",
    "        # (header line, newline, sequence line).\n",
    "        fallback_sequences = {\n",
    "            'Mycobacterium': [\n",
    "                '>sp|P0A564|RPOB_MYCTU RNA polymerase subunit beta OS=Mycobacterium tuberculosis\\nMTDAIGRTLVVTDGSPGAITNVLSRLAAADPQLPVAVATDHLRLLRRLLSDPARLTAELARGYLAERIDAALVRTDLLPLLRDRAAACAVP',\n",
    "                '>sp|P9WQB7|KATG_MYCTU Catalase-peroxidase OS=Mycobacterium tuberculosis\\nMSEAIHVLHPRLNPGDEGGGAGGGAPADRQLLRKLLATHLSVPEDIADQYRSLLDALDRLASARAKQDRQLLKRLAAELHRVPLPVGFPA'\n",
    "            ],\n",
    "            'Salmonella': [\n",
    "                '>sp|P0A7U7|RS11_SALEN 30S ribosomal protein S11 OS=Salmonella enterica\\nMKIRWLSLGLLIAGLAAPLHAQPQQGQTGLLSLQQQQAFQGPQYLQGPQRVLQLQQTQQAQLPSQPQRLQNQPQRVLQPQRVLQPQRVL',\n",
    "                '>sp|P0A7V8|RS7_SALEN 30S ribosomal protein S7 OS=Salmonella enterica\\nMSQIYDIRQLWNTNDACLPSGSKKPRKSPKQQVLSLFSLPAQDLVQDCVSTMQLQWQDKYLNSLLRGLKGSGNLQRPQQRMQAQAQQQR'\n",
    "            ],\n",
    "            'Escherichia': [\n",
    "                '>sp|P0AG44|RPOA_ECOLI DNA-directed RNA polymerase subunit alpha OS=Escherichia coli\\nMTQTKAVIVAARPGKMRVVVKSGNGVDIIGQLVPDVAAMGWDLHGQEVKVDPDGKTVVVVDSDPFNIGGPGVLHPEDAYRQLQKAGVQP',\n",
    "                '>sp|P0A7K6|RL7_ECOLI 50S ribosomal protein L7/L12 OS=Escherichia coli\\nMSAPQLARQQQQWVTHEVHQSLFQAENQHQQAAAQAAAQQHQQLLLQQQQQAQQGPQQAAHLQQAHHQQHQLLHHQHQQLLHSPPSTQ'\n",
    "            ],\n",
    "            'Staphylococcus': [\n",
    "                '>sp|P0A0J8|GYR1_STAAU DNA gyrase subunit A OS=Staphylococcus aureus\\nMNSLVHQVFGDYLTTDKIINKELTEAYGLTKHFSFGVNVKDKHPLKNGKKTFNLKGGTKKPVSLKGGKEIKIIGNGDLTFHHGGADAQL',\n",
    "                '>sp|P0A0J9|GYR2_STAAU DNA gyrase subunit B OS=Staphylococcus aureus\\nMKKRILLHCKIYRLPGRNPVIIVAENPQHDQEVQNHPGLVKIITNGNPQQQPQIIKIPQNKQHFKLLKGKDFILFSDEGFVKLGGGKKQ'\n",
    "            ],\n",
    "            'Candida': [\n",
    "                '>sp|P32375|HSP90_CANAL Heat shock protein 90 OS=Candida albicans\\nMSLVQGSQKFGAKGASGKQSLLTPRALTQKKNSFSFLKSQKQSLPPTTSFKTTKQKKVNPTAKKQQLYSQLKKRRWKTSFDKEAEKAIE',\n",
    "                '>sp|P40340|CDC42_CANAL Cell division control protein 42 homolog OS=Candida albicans\\nMSRWQGSQVFGDKGASGKDSLLTPRALTQKKTSFNFLKAQKQSLPPTSAFKTTKQKKVNPTAKKQQLYSQLQKRRWKTSFDKEAEKAIE'\n",
    "            ],\n",
    "            'Aspergillus': [\n",
    "                '>sp|Q4WTG4|HSP90_ASPFU Heat shock protein 90 OS=Aspergillus fumigatus\\nMSSQGAQLTPRALTQKRNSFGFLKAQKQSLPPTTSFKTTKQKKVNPTAKKQQLYSQLKKRRWKTSFDKEAEQAIEQRVSAIVEQKLEHP',\n",
    "                '>sp|Q4WA62|CDC42_ASPFU Cell division control protein 42 homolog OS=Aspergillus fumigatus\\nMSSQGAQKFGAKGASGKQSLLTPRALTQKKNSFNFLKAQKQSLPPTSSFKTTKQKKVNPTAKKQQLYSQLQKRRWKTSFDKEAEKAIER'\n",
    "            ],\n",
    "            'virus': [\n",
    "                # Sequence line must hold residue letters only (no embedded spaces)\n",
    "                '>sp|P04585|GAG_HV1H1 Gag polyprotein OS=Human immunodeficiency virus 1\\nMGARASILRGGKLDKWEKIRLRPGGKKHYMLKHLVWASRELERFALNPGLLETSEGCKQILVQLQPSLQTGSEELRSLFNTVATLYCVH',\n",
    "                '>sp|P03070|POLG_HBV Genome polyprotein OS=Hepatitis B virus\\nMEHQVPYKSGRFMEQQRQFVKQLKPQILKLGCVYLFSDFHTKLVPSAVCFVGFVKRFTQKPPPVQKGDQKGADNAQISQYDLTFGLTFS'\n",
    "            ]\n",
    "        }\n",
    "\n",
    "        # Determine organism type and select appropriate sequences\n",
    "        name_lower = organism_name.lower()\n",
    "        sequences = []\n",
    "        for org_type, seqs in fallback_sequences.items():\n",
    "            if org_type.lower() in name_lower:\n",
    "                sequences = seqs\n",
    "                break\n",
    "\n",
    "        # Use virus sequences as default fallback\n",
    "        if not sequences:\n",
    "            sequences = fallback_sequences['virus']\n",
    "\n",
    "        # Write fallback sequences, one record after another, newline-terminated\n",
    "        with open(output_file, 'w') as f:\n",
    "            f.write('\\n'.join(sequences))\n",
    "            f.write('\\n')\n",
    "\n",
    "        print(f\"  ✓ Created fallback proteome with {len(sequences)} sequences\")\n",
    "    \n",
    "    def load_pathogen_database(self, peptide_lengths=(9, 10, 11)):\n",
    "        \"\"\"\n",
    "        Build the per-pathogen peptide sets from the FASTA files under fasta/.\n",
    "\n",
    "        Each protein sequence is upper-cased and cut into every overlapping\n",
    "        window of each requested length. A window is kept only when it is\n",
    "        purely alphabetic and contains no 'X'. The resulting set is stored in\n",
    "        self.pathogen_peptides under the pathogen key.\n",
    "\n",
    "        A pathogen whose FASTA file is missing is reported and skipped. If\n",
    "        reading a file raises, the error is reported and that pathogen gets\n",
    "        no entry.\n",
    "\n",
    "        Args:\n",
    "            peptide_lengths: window lengths to extract; the default keeps\n",
    "                the 9-, 10- and 11-mers\n",
    "        \"\"\"\n",
    "        print(\"=== Building Pathogen Peptide Database ===\")\n",
    "\n",
    "        for pathogen_id, info in self.pathogens_info.items():\n",
    "            fasta_file = f\"fasta/{pathogen_id}.fasta\"\n",
    "\n",
    "            if not os.path.exists(fasta_file):\n",
    "                print(f\"❌ {info['name']} proteome file does not exist\")\n",
    "                continue\n",
    "\n",
    "            peptides = set()\n",
    "            try:\n",
    "                with open(fasta_file, 'r') as f:\n",
    "                    for record in SeqIO.parse(f, 'fasta'):\n",
    "                        protein_seq = str(record.seq).upper()\n",
    "\n",
    "                        # Slide one window per requested length; a sequence\n",
    "                        # shorter than the window yields an empty range.\n",
    "                        for length in peptide_lengths:\n",
    "                            for start in range(len(protein_seq) - length + 1):\n",
    "                                peptide = protein_seq[start:start + length]\n",
    "                                if peptide.isalpha() and 'X' not in peptide:\n",
    "                                    peptides.add(peptide)\n",
    "\n",
    "                # Stored only after the whole file was read without error\n",
    "                self.pathogen_peptides[pathogen_id] = peptides\n",
    "                print(f\"✓ {info['name']}: {len(peptides)} peptides\")\n",
    "\n",
    "            except Exception as e:\n",
    "                print(f\"❌ Error processing {info['name']}: {str(e)}\")\n",
    "\n",
    "        total_peptides = sum(len(peptides) for peptides in self.pathogen_peptides.values())\n",
    "        print(f\"✓ Total loaded {total_peptides} peptides from {len(self.pathogen_peptides)} organisms\")\n",
    "    \n",
    "    def generate_peptides_from_cdr3(self, cdr3_data):\n",
    "        \"\"\"Generate candidate peptides for each CDR3/MHC pair by running ``infer.py``.\n",
    "\n",
    "        For every row of ``cdr3_data`` the script is started in a subprocess with\n",
    "        ``--task PEP_GEN`` and the generation settings stored on the instance.\n",
    "        Stdout lines that start with ``Candidate`` are parsed; the field that\n",
    "        follows the first ``': '`` is kept when it is at least 8 characters long.\n",
    "\n",
    "        Args:\n",
    "            cdr3_data: DataFrame with ``CDR3`` and ``MHC`` columns.\n",
    "\n",
    "        Returns:\n",
    "            list[dict]: one entry per row whose subprocess exited with code 0,\n",
    "            holding ``sample_id`` (the row's index label), ``cdr3``, ``mhc``,\n",
    "            ``generated_peptides`` and ``num_generated``. Rows that fail, time\n",
    "            out (300 s) or raise are reported on stdout and left out.\n",
    "        \"\"\"\n",
    "        import sys  # local import: only needed to locate the running interpreter\n",
    "\n",
    "        print(\"=== Generating Peptide Candidates ===\")\n",
    "\n",
    "        # Run infer.py with the interpreter that executes this code instead of\n",
    "        # whatever `python` resolves to on PATH, so the script sees the same\n",
    "        # installed packages as the notebook kernel. Fall back to 'python' when\n",
    "        # the interpreter path is unavailable.\n",
    "        interpreter = sys.executable or 'python'\n",
    "\n",
    "        all_generated_peptides = []\n",
    "\n",
    "        for idx, row in cdr3_data.iterrows():\n",
    "            cdr3 = row['CDR3']\n",
    "            mhc = row['MHC']\n",
    "\n",
    "            print(f\"🔬 Processing sample {idx+1}/{len(cdr3_data)}\")\n",
    "\n",
    "            # Call inference script\n",
    "            cmd = [\n",
    "                interpreter, 'infer.py',\n",
    "                '--task', 'PEP_GEN',\n",
    "                '--mhc', mhc,\n",
    "                '--tcr', cdr3,  # Use CDR3 as TCR input\n",
    "                '--num_candidates', str(self.num_candidates),\n",
    "                '--temperature', str(self.temperature),\n",
    "                '--top_k', str(self.top_k)\n",
    "            ]\n",
    "\n",
    "            if self.verbose:\n",
    "                cmd.append('--verbose')\n",
    "\n",
    "            try:\n",
    "                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)\n",
    "\n",
    "                if result.returncode == 0:\n",
    "                    # Parse output results\n",
    "                    output_lines = result.stdout.strip().split('\\n')\n",
    "                    candidates = []\n",
    "\n",
    "                    for line in output_lines:\n",
    "                        if line.startswith('Candidate'):\n",
    "                            # Extract peptide sequence\n",
    "                            parts = line.split(': ')\n",
    "                            if len(parts) >= 2:\n",
    "                                peptide = parts[1].strip()\n",
    "                                if peptide and len(peptide) >= 8:  # Valid peptide\n",
    "                                    candidates.append(peptide)\n",
    "\n",
    "                    # Save results\n",
    "                    sample_result = {\n",
    "                        'sample_id': idx,\n",
    "                        'cdr3': cdr3,\n",
    "                        'mhc': mhc,\n",
    "                        'generated_peptides': candidates,\n",
    "                        'num_generated': len(candidates)\n",
    "                    }\n",
    "\n",
    "                    all_generated_peptides.append(sample_result)\n",
    "                    print(f\"  ✓ Generated {len(candidates)} peptide candidates\")\n",
    "\n",
    "                else:\n",
    "                    print(f\"  ❌ Inference failed: {result.stderr}\")\n",
    "\n",
    "            except subprocess.TimeoutExpired:\n",
    "                print(f\"  ⏰ Inference timeout\")\n",
    "            except Exception as e:\n",
    "                print(f\"  ❌ Inference error: {str(e)}\")\n",
    "\n",
    "        return all_generated_peptides\n",
    "    \n",
    "    def calculate_sequence_similarity(self, seq1, seq2):\n",
    "        \"\"\"Return a similarity score between 0 and 1 for two sequences.\n",
    "\n",
    "        Sequences of equal length are scored as the fraction of positions that\n",
    "        hold the same character. Sequences of different lengths are scored with\n",
    "        ``difflib.SequenceMatcher.ratio()``.\n",
    "\n",
    "        Args:\n",
    "            seq1: first sequence (string).\n",
    "            seq2: second sequence (string).\n",
    "\n",
    "        Returns:\n",
    "            float: the similarity score; 0.0 when both sequences are empty.\n",
    "        \"\"\"\n",
    "        if len(seq1) != len(seq2):\n",
    "            # Positions cannot be paired one-to-one, so use difflib's\n",
    "            # matching-block ratio for sequences of different lengths.\n",
    "            matcher = difflib.SequenceMatcher(None, seq1, seq2)\n",
    "            return matcher.ratio()\n",
    "\n",
    "        if not seq1:\n",
    "            # Both sequences are empty: there is nothing to compare, so report\n",
    "            # no similarity instead of dividing by zero below.\n",
    "            return 0.0\n",
    "\n",
    "        # Direct position-by-position comparison for sequences of same length\n",
    "        matches = sum(a == b for a, b in zip(seq1, seq2))\n",
    "        return matches / len(seq1)\n",
    "    \n",
    "    def match_against_pathogens(self, generated_data):\n",
    "        \"\"\"Match generated peptides against every pathogen peptide set.\n",
    "\n",
    "        For each generated peptide and each pathogen, the pathogen peptide with\n",
    "        the highest similarity is kept, provided that similarity is at least\n",
    "        ``self.similarity_threshold``. At most one match is recorded per\n",
    "        (generated peptide, pathogen) pair.\n",
    "\n",
    "        Args:\n",
    "            generated_data: list of dicts with ``sample_id`` and\n",
    "                ``generated_peptides`` keys, as returned by\n",
    "                ``generate_peptides_from_cdr3``.\n",
    "\n",
    "        Returns:\n",
    "            list[dict]: one dict per match with ``sample_id``,\n",
    "            ``generated_peptide``, ``pathogen_id``, ``pathogen_name``,\n",
    "            ``pathogen_type``, ``matched_peptide``, ``similarity`` and\n",
    "            ``description``.\n",
    "        \"\"\"\n",
    "        print(\"=== Pathogen Matching Analysis ===\")\n",
    "\n",
    "        match_results = []\n",
    "\n",
    "        for sample in generated_data:\n",
    "            sample_id = sample['sample_id']\n",
    "            generated_peptides = sample['generated_peptides']\n",
    "\n",
    "            print(f\"🔍 Matching sample {sample_id + 1}\")\n",
    "\n",
    "            sample_matches = []\n",
    "\n",
    "            for peptide in generated_peptides:\n",
    "                for pathogen_id, pathogen_peptides in self.pathogen_peptides.items():\n",
    "\n",
    "                    best_similarity = 0\n",
    "                    best_match = None\n",
    "\n",
    "                    for pathogen_peptide in pathogen_peptides:\n",
    "                        similarity = self.calculate_sequence_similarity(peptide, pathogen_peptide)\n",
    "\n",
    "                        if similarity >= self.similarity_threshold and similarity > best_similarity:\n",
    "                            best_similarity = similarity\n",
    "                            best_match = pathogen_peptide\n",
    "\n",
    "                    if best_match:\n",
    "                        pathogen_info = self.pathogens_info[pathogen_id]\n",
    "                        match_info = {\n",
    "                            'sample_id': sample_id,\n",
    "                            'generated_peptide': peptide,\n",
    "                            'pathogen_id': pathogen_id,\n",
    "                            'pathogen_name': pathogen_info['name'],\n",
    "                            'pathogen_type': pathogen_info['type'],\n",
    "                            'matched_peptide': best_match,\n",
    "                            'similarity': best_similarity,\n",
    "                            'description': pathogen_info['description']\n",
    "                        }\n",
    "                        sample_matches.append(match_info)\n",
    "                        # Log the score of the recorded match; the loop variable\n",
    "                        # `similarity` only holds the last peptide compared.\n",
    "                        print(f\"  🎯 Matched to {pathogen_info['name']}: {best_similarity:.3f}\")\n",
    "\n",
    "            if sample_matches:\n",
    "                match_results.extend(sample_matches)\n",
    "                print(f\"  ✓ Sample {sample_id + 1}: {len(sample_matches)} positive matches\")\n",
    "            else:\n",
    "                print(f\"  ➖ Sample {sample_id + 1}: No positive matches\")\n",
    "\n",
    "        return match_results\n",
    "    \n",
    "    def generate_visualizations(self, match_results, cdr3_data):\n",
    "        \"\"\"Render the 2x2 summary figure and return it as base64-encoded PNG text.\n",
    "\n",
    "        Panels: positive/negative sample pie, match count per pathogen type,\n",
    "        histogram of match similarities with the threshold marked, and CDR3\n",
    "        length histograms split by detection status. The two match-based panels\n",
    "        are left as empty axes when ``match_results`` is empty.\n",
    "\n",
    "        Args:\n",
    "            match_results: list of match dicts; reads ``sample_id``,\n",
    "                ``pathogen_type`` and ``similarity``.\n",
    "            cdr3_data: DataFrame with a ``CDR3`` column. A row's position is\n",
    "                compared with the ``sample_id`` values of the matches.\n",
    "\n",
    "        Returns:\n",
    "            str: base64 text of the PNG image (300 dpi).\n",
    "        \"\"\"\n",
    "        plt.style.use('default')\n",
    "\n",
    "        # Set font for plots\n",
    "        plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial', 'sans-serif']\n",
    "        plt.rcParams['axes.unicode_minus'] = False\n",
    "\n",
    "        fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
    "        fig.suptitle('Pathogen Detection Results Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "        # Collect the positive sample ids once; panels 1 and 4 both need them,\n",
    "        # and a set lookup avoids rescanning every match for every sample.\n",
    "        positive_ids = {m['sample_id'] for m in match_results}\n",
    "\n",
    "        # 1. Positive detection rate\n",
    "        total_samples = len(cdr3_data)\n",
    "        positive_samples = len(positive_ids)\n",
    "        negative_samples = total_samples - positive_samples\n",
    "\n",
    "        ax1 = axes[0, 0]\n",
    "        if total_samples > 0:\n",
    "            labels = ['Positive', 'Negative']\n",
    "            sizes = [positive_samples, negative_samples]\n",
    "            colors = ['#ff6b6b', '#4ecdc4']\n",
    "            ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)\n",
    "        else:\n",
    "            # With no samples both wedge sizes are zero and there are no\n",
    "            # fractions to draw, so show a placeholder instead of a pie.\n",
    "            ax1.text(0.5, 0.5, 'No samples', ha='center', va='center', transform=ax1.transAxes)\n",
    "            ax1.axis('off')\n",
    "        ax1.set_title('Sample Detection Results', fontweight='bold')\n",
    "\n",
    "        if match_results:\n",
    "            # 2. Pathogen type distribution\n",
    "            type_counts = Counter(m['pathogen_type'] for m in match_results)\n",
    "\n",
    "            ax2 = axes[0, 1]\n",
    "            bars = ax2.bar(list(type_counts.keys()), list(type_counts.values()),\n",
    "                           color=['#ff9f43', '#00d2d3', '#ff6348'])\n",
    "            ax2.set_title('Detected Pathogen Type Distribution', fontweight='bold')\n",
    "            ax2.set_ylabel('Detection Count')\n",
    "\n",
    "            # Add value labels\n",
    "            for bar in bars:\n",
    "                height = bar.get_height()\n",
    "                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.1,\n",
    "                         f'{int(height)}', ha='center', va='bottom')\n",
    "\n",
    "            # 3. Similarity distribution\n",
    "            similarities = [m['similarity'] for m in match_results]\n",
    "\n",
    "            ax3 = axes[1, 0]\n",
    "            ax3.hist(similarities, bins=20, color='#74b9ff', alpha=0.7, edgecolor='black')\n",
    "            ax3.set_title('Peptide Similarity Distribution', fontweight='bold')\n",
    "            ax3.set_xlabel('Similarity')\n",
    "            ax3.set_ylabel('Frequency')\n",
    "            ax3.axvline(self.similarity_threshold, color='red', linestyle='--',\n",
    "                        label=f'Threshold ({self.similarity_threshold})')\n",
    "            ax3.legend()\n",
    "\n",
    "        # 4. CDR3 sequence length distribution\n",
    "        positive_lengths = []\n",
    "        negative_lengths = []\n",
    "\n",
    "        for idx, cdr3 in enumerate(cdr3_data['CDR3']):\n",
    "            if idx in positive_ids:\n",
    "                positive_lengths.append(len(cdr3))\n",
    "            else:\n",
    "                negative_lengths.append(len(cdr3))\n",
    "\n",
    "        ax4 = axes[1, 1]\n",
    "        if positive_lengths:\n",
    "            ax4.hist(positive_lengths, bins=10, alpha=0.7, label='Positive', color='red')\n",
    "        if negative_lengths:\n",
    "            ax4.hist(negative_lengths, bins=10, alpha=0.7, label='Negative', color='blue')\n",
    "\n",
    "        ax4.set_title('CDR3 Length Distribution', fontweight='bold')\n",
    "        ax4.set_xlabel('CDR3 Length')\n",
    "        ax4.set_ylabel('Frequency')\n",
    "        if positive_lengths or negative_lengths:\n",
    "            # Only build a legend when at least one labelled histogram exists.\n",
    "            ax4.legend()\n",
    "\n",
    "        fig.tight_layout()\n",
    "\n",
    "        # Save chart to an in-memory PNG and release this specific figure\n",
    "        chart_buffer = BytesIO()\n",
    "        fig.savefig(chart_buffer, format='png', dpi=300, bbox_inches='tight')\n",
    "        chart_base64 = base64.b64encode(chart_buffer.getvalue()).decode()\n",
    "        plt.close(fig)\n",
    "\n",
    "        return chart_base64\n",
    "    \n",
    "    def generate_report(self, cdr3_data, generated_data, match_results):\n",
    "        \"\"\"Write the HTML detection report (and a PDF copy when weasyprint is available).\n",
    "\n",
    "        Args:\n",
    "            cdr3_data: DataFrame of input samples. Its length is the sample\n",
    "                total and its ``CDR3`` column is shown for every sample row.\n",
    "            generated_data: generation results; kept in the signature for\n",
    "                callers but not read by this method.\n",
    "            match_results: list of match dicts as returned by\n",
    "                ``match_against_pathogens``.\n",
    "\n",
    "        Returns:\n",
    "            str: name of the HTML file written to the current directory.\n",
    "        \"\"\"\n",
    "        print(\"=== Generating Detection Report ===\")\n",
    "\n",
    "        # Generate charts\n",
    "        chart_base64 = self.generate_visualizations(match_results, cdr3_data)\n",
    "\n",
    "        # Statistics\n",
    "        total_samples = len(cdr3_data)\n",
    "        positive_samples = len({m['sample_id'] for m in match_results})\n",
    "\n",
    "        pathogen_summary = {}\n",
    "        for match in match_results:\n",
    "            pathogen_name = match['pathogen_name']\n",
    "            if pathogen_name not in pathogen_summary:\n",
    "                pathogen_summary[pathogen_name] = {\n",
    "                    'count': 0,\n",
    "                    'type': match['pathogen_type'],\n",
    "                    'samples': set(),\n",
    "                    'avg_similarity': 0,\n",
    "                    'similarities': [],\n",
    "                    'generated_epitopes': set()  # generated epitopes matched to this pathogen\n",
    "                }\n",
    "            pathogen_summary[pathogen_name]['count'] += 1\n",
    "            pathogen_summary[pathogen_name]['samples'].add(match['sample_id'])\n",
    "            pathogen_summary[pathogen_name]['similarities'].append(match['similarity'])\n",
    "            pathogen_summary[pathogen_name]['generated_epitopes'].add(match['generated_peptide'])\n",
    "\n",
    "        # Calculate average similarity and convert sets to lists\n",
    "        for pathogen in pathogen_summary:\n",
    "            similarities = pathogen_summary[pathogen]['similarities']\n",
    "            pathogen_summary[pathogen]['avg_similarity'] = np.mean(similarities)\n",
    "            pathogen_summary[pathogen]['sample_count'] = len(pathogen_summary[pathogen]['samples'])\n",
    "            # Convert to a list so the template can iterate over it\n",
    "            pathogen_summary[pathogen]['generated_epitopes'] = list(pathogen_summary[pathogen]['generated_epitopes'])\n",
    "\n",
    "        # HTML template\n",
    "        html_template = Template(\"\"\"\n",
    "<!DOCTYPE html>\n",
    "<html lang=\"en\">\n",
    "<head>\n",
    "    <meta charset=\"UTF-8\">\n",
    "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n",
    "    <title>Pathogen Detection Report</title>\n",
    "    <style>\n",
    "        body {\n",
    "            font-family: 'Arial', 'Helvetica', sans-serif;\n",
    "            line-height: 1.6;\n",
    "            margin: 0;\n",
    "            padding: 20px;\n",
    "            background-color: #f5f5f5;\n",
    "        }\n",
    "        .container {\n",
    "            max-width: 1400px;\n",
    "            margin: 0 auto;\n",
    "            background: white;\n",
    "            padding: 30px;\n",
    "            border-radius: 10px;\n",
    "            box-shadow: 0 0 20px rgba(0,0,0,0.1);\n",
    "        }\n",
    "        .header {\n",
    "            text-align: center;\n",
    "            border-bottom: 3px solid #4CAF50;\n",
    "            padding-bottom: 20px;\n",
    "            margin-bottom: 30px;\n",
    "        }\n",
    "        .header h1 {\n",
    "            color: #2c3e50;\n",
    "            margin: 0;\n",
    "            font-size: 2.5em;\n",
    "        }\n",
    "        .header .subtitle {\n",
    "            color: #7f8c8d;\n",
    "            font-size: 1.2em;\n",
    "            margin-top: 10px;\n",
    "        }\n",
    "        .summary {\n",
    "            display: grid;\n",
    "            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));\n",
    "            gap: 20px;\n",
    "            margin-bottom: 30px;\n",
    "        }\n",
    "        .summary-card {\n",
    "            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n",
    "            color: white;\n",
    "            padding: 20px;\n",
    "            border-radius: 10px;\n",
    "            text-align: center;\n",
    "        }\n",
    "        .summary-card.positive {\n",
    "            background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%);\n",
    "        }\n",
    "        .summary-card.negative {\n",
    "            background: linear-gradient(135deg, #4ecdc4 0%, #44a08d 100%);\n",
    "        }\n",
    "        .summary-card h3 {\n",
    "            margin: 0 0 10px 0;\n",
    "            font-size: 1.1em;\n",
    "        }\n",
    "        .summary-card .number {\n",
    "            font-size: 2.5em;\n",
    "            font-weight: bold;\n",
    "        }\n",
    "        .chart-container {\n",
    "            text-align: center;\n",
    "            margin: 30px 0;\n",
    "        }\n",
    "        .chart-container img {\n",
    "            max-width: 100%;\n",
    "            border-radius: 10px;\n",
    "            box-shadow: 0 4px 8px rgba(0,0,0,0.1);\n",
    "        }\n",
    "        .section {\n",
    "            margin: 30px 0;\n",
    "        }\n",
    "        .section h2 {\n",
    "            color: #2c3e50;\n",
    "            border-left: 4px solid #4CAF50;\n",
    "            padding-left: 15px;\n",
    "            margin-bottom: 20px;\n",
    "        }\n",
    "        .pathogen-table, .sample-table {\n",
    "            width: 100%;\n",
    "            border-collapse: collapse;\n",
    "            margin: 20px 0;\n",
    "            background: white;\n",
    "            box-shadow: 0 2px 5px rgba(0,0,0,0.1);\n",
    "            font-size: 0.9em;\n",
    "        }\n",
    "        .pathogen-table th, .pathogen-table td,\n",
    "        .sample-table th, .sample-table td {\n",
    "            border: 1px solid #ddd;\n",
    "            padding: 10px;\n",
    "            text-align: left;\n",
    "            vertical-align: top;\n",
    "        }\n",
    "        .pathogen-table th, .sample-table th {\n",
    "            background-color: #4CAF50;\n",
    "            color: white;\n",
    "            font-weight: bold;\n",
    "        }\n",
    "        .pathogen-table tr:nth-child(even),\n",
    "        .sample-table tr:nth-child(even) {\n",
    "            background-color: #f9f9f9;\n",
    "        }\n",
    "        .pathogen-table tr:hover,\n",
    "        .sample-table tr:hover {\n",
    "            background-color: #f5f5f5;\n",
    "        }\n",
    "        .positive-badge {\n",
    "            background-color: #ff6b6b;\n",
    "            color: white;\n",
    "            padding: 4px 8px;\n",
    "            border-radius: 4px;\n",
    "            font-size: 0.9em;\n",
    "        }\n",
    "        .negative-badge {\n",
    "            background-color: #4ecdc4;\n",
    "            color: white;\n",
    "            padding: 4px 8px;\n",
    "            border-radius: 4px;\n",
    "            font-size: 0.9em;\n",
    "        }\n",
    "        .bacteria { background-color: #fff3cd; }\n",
    "        .virus { background-color: #d1ecf1; }\n",
    "        .fungus { background-color: #d4edda; }\n",
    "        .sequence {\n",
    "            font-family: monospace;\n",
    "            font-size: 0.85em;\n",
    "            background-color: #f8f9fa;\n",
    "            padding: 2px 4px;\n",
    "            border-radius: 3px;\n",
    "            margin: 1px;\n",
    "            display: inline-block;\n",
    "        }\n",
    "        .epitope-list {\n",
    "            max-height: 100px;\n",
    "            overflow-y: auto;\n",
    "            border: 1px solid #ddd;\n",
    "            padding: 5px;\n",
    "            background-color: #f8f9fa;\n",
    "            border-radius: 3px;\n",
    "        }\n",
    "        .parameters {\n",
    "            background-color: #e9ecef;\n",
    "            padding: 15px;\n",
    "            border-radius: 5px;\n",
    "            margin: 15px 0;\n",
    "            display: grid;\n",
    "            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));\n",
    "            gap: 10px;\n",
    "        }\n",
    "        .param-item {\n",
    "            background: white;\n",
    "            padding: 10px;\n",
    "            border-radius: 5px;\n",
    "            text-align: center;\n",
    "        }\n",
    "        .param-value {\n",
    "            font-size: 1.5em;\n",
    "            font-weight: bold;\n",
    "            color: #2c3e50;\n",
    "        }\n",
    "        .generated-peptide {\n",
    "            color: #007bff;\n",
    "            font-weight: bold;\n",
    "        }\n",
    "        .matched-peptide {\n",
    "            color: #28a745;\n",
    "        }\n",
    "    </style>\n",
    "</head>\n",
    "<body>\n",
    "    <div class=\"container\">\n",
    "        <div class=\"header\">\n",
    "            <h1>Pathogen Detection Report</h1>\n",
    "            <div class=\"subtitle\">{{ report_time }}</div>\n",
    "        </div>\n",
    "\n",
    "        <div class=\"summary\">\n",
    "            <div class=\"summary-card\">\n",
    "                <h3>Total Samples</h3>\n",
    "                <div class=\"number\">{{ total_samples }}</div>\n",
    "            </div>\n",
    "            <div class=\"summary-card positive\">\n",
    "                <h3>Positive Samples</h3>\n",
    "                <div class=\"number\">{{ positive_samples }}</div>\n",
    "            </div>\n",
    "            <div class=\"summary-card negative\">\n",
    "                <h3>Negative Samples</h3>\n",
    "                <div class=\"number\">{{ negative_samples }}</div>\n",
    "            </div>\n",
    "            <div class=\"summary-card\">\n",
    "                <h3>Positive Rate</h3>\n",
    "                <div class=\"number\">{{ positive_rate }}%</div>\n",
    "            </div>\n",
    "        </div>\n",
    "\n",
    "        <div class=\"parameters\">\n",
    "            <div class=\"param-item\">\n",
    "                <div>Similarity Threshold</div>\n",
    "                <div class=\"param-value\">{{ similarity_threshold }}</div>\n",
    "            </div>\n",
    "            <div class=\"param-item\">\n",
    "                <div>Candidate Peptides</div>\n",
    "                <div class=\"param-value\">{{ num_candidates }}</div>\n",
    "            </div>\n",
    "            <div class=\"param-item\">\n",
    "                <div>Temperature</div>\n",
    "                <div class=\"param-value\">{{ temperature }}</div>\n",
    "            </div>\n",
    "            <div class=\"param-item\">\n",
    "                <div>Top-k Sampling</div>\n",
    "                <div class=\"param-value\">{{ top_k }}</div>\n",
    "            </div>\n",
    "            <div class=\"param-item\">\n",
    "                <div>Pathogen Count</div>\n",
    "                <div class=\"param-value\">{{ pathogen_count }}</div>\n",
    "            </div>\n",
    "        </div>\n",
    "\n",
    "        <div class=\"chart-container\">\n",
    "            <h2>Analysis Results</h2>\n",
    "            <img src=\"data:image/png;base64,{{ chart_base64 }}\" alt=\"Detection Results Charts\">\n",
    "        </div>\n",
    "\n",
    "        {% if pathogen_summary %}\n",
    "        <div class=\"section\">\n",
    "            <h2>Detected Pathogens</h2>\n",
    "            <table class=\"pathogen-table\">\n",
    "                <thead>\n",
    "                    <tr>\n",
    "                        <th>Pathogen Name</th>\n",
    "                        <th>Type</th>\n",
    "                        <th>Samples</th>\n",
    "                        <th>Matches</th>\n",
    "                        <th>Avg Similarity</th>\n",
    "                        <th>Generated Epitopes</th>\n",
    "                    </tr>\n",
    "                </thead>\n",
    "                <tbody>\n",
    "                    {% for pathogen_name, info in pathogen_summary.items() %}\n",
    "                    <tr class=\"{{ info.type.lower() }}\">\n",
    "                        <td><strong>{{ pathogen_name }}</strong></td>\n",
    "                        <td>{{ info.type }}</td>\n",
    "                        <td>{{ info.sample_count }}</td>\n",
    "                        <td>{{ info.count }}</td>\n",
    "                        <td>{{ \"%.3f\"|format(info.avg_similarity) }}</td>\n",
    "                        <td>\n",
    "                            <div class=\"epitope-list\">\n",
    "                                {% for epitope in info.generated_epitopes %}\n",
    "                                <span class=\"sequence\">{{ epitope }}</span>\n",
    "                                {% endfor %}\n",
    "                            </div>\n",
    "                        </td>\n",
    "                    </tr>\n",
    "                    {% endfor %}\n",
    "                </tbody>\n",
    "            </table>\n",
    "        </div>\n",
    "        {% endif %}\n",
    "\n",
    "        <div class=\"section\">\n",
    "            <h2>Sample Results</h2>\n",
    "            <table class=\"sample-table\">\n",
    "                <thead>\n",
    "                    <tr>\n",
    "                        <th>Sample ID</th>\n",
    "                        <th>CDR3 Sequence</th>\n",
    "                        <th>Status</th>\n",
    "                        <th>Pathogen</th>\n",
    "                        <th>Generated Epitope</th>\n",
    "                        <th>Matched Peptide</th>\n",
    "                        <th>Similarity</th>\n",
    "                    </tr>\n",
    "                </thead>\n",
    "                <tbody>\n",
    "                    {% for i in range(total_samples) %}\n",
    "                    {% set sample_matches = matches_by_sample.get(i, []) %}\n",
    "                    {% if sample_matches %}\n",
    "                        {% for match in sample_matches %}\n",
    "                        <tr>\n",
    "                            {% if loop.first %}\n",
    "                            <td rowspan=\"{{ sample_matches|length }}\">{{ i + 1 }}</td>\n",
    "                            <td rowspan=\"{{ sample_matches|length }}\"><span class=\"sequence\">{{ cdr3_data.iloc[i]['CDR3'] }}</span></td>\n",
    "                            <td rowspan=\"{{ sample_matches|length }}\"><span class=\"positive-badge\">Positive</span></td>\n",
    "                            {% endif %}\n",
    "                            <td>{{ match.pathogen_name }}</td>\n",
    "                            <td><span class=\"sequence generated-peptide\">{{ match.generated_peptide }}</span></td>\n",
    "                            <td><span class=\"sequence matched-peptide\">{{ match.matched_peptide }}</span></td>\n",
    "                            <td>{{ \"%.3f\"|format(match.similarity) }}</td>\n",
    "                        </tr>\n",
    "                        {% endfor %}\n",
    "                    {% else %}\n",
    "                    <tr>\n",
    "                        <td>{{ i + 1 }}</td>\n",
    "                        <td><span class=\"sequence\">{{ cdr3_data.iloc[i]['CDR3'] }}</span></td>\n",
    "                        <td><span class=\"negative-badge\">Negative</span></td>\n",
    "                        <td>-</td>\n",
    "                        <td>-</td>\n",
    "                        <td>-</td>\n",
    "                        <td>-</td>\n",
    "                    </tr>\n",
    "                    {% endif %}\n",
    "                    {% endfor %}\n",
    "                </tbody>\n",
    "            </table>\n",
    "        </div>\n",
    "    </div>\n",
    "</body>\n",
    "</html>\n",
    "        \"\"\")\n",
    "\n",
    "        # Prepare template data\n",
    "        matches_by_sample = defaultdict(list)\n",
    "        for match in match_results:\n",
    "            matches_by_sample[match['sample_id']].append(match)\n",
    "\n",
    "        # Guard the percentage against an empty sample table (division by zero)\n",
    "        if total_samples:\n",
    "            positive_rate = round(positive_samples / total_samples * 100, 1)\n",
    "        else:\n",
    "            positive_rate = 0.0\n",
    "\n",
    "        # Take the timestamp once so the report header and the file name agree\n",
    "        report_moment = datetime.now()\n",
    "\n",
    "        template_data = {\n",
    "            'report_time': report_moment.strftime('%Y-%m-%d %H:%M:%S'),\n",
    "            'total_samples': total_samples,\n",
    "            'positive_samples': positive_samples,\n",
    "            'negative_samples': total_samples - positive_samples,\n",
    "            'positive_rate': positive_rate,\n",
    "            'pathogen_summary': pathogen_summary,\n",
    "            'matches_by_sample': matches_by_sample,\n",
    "            'cdr3_data': cdr3_data,\n",
    "            'chart_base64': chart_base64,\n",
    "            'similarity_threshold': self.similarity_threshold,\n",
    "            'num_candidates': self.num_candidates,\n",
    "            'temperature': self.temperature,\n",
    "            'top_k': self.top_k,\n",
    "            'pathogen_count': len(self.pathogens_info)\n",
    "        }\n",
    "\n",
    "        # Generate HTML report\n",
    "        html_content = html_template.render(**template_data)\n",
    "\n",
    "        # Save HTML file\n",
    "        html_filename = f\"pathogen_detection_report_{report_moment.strftime('%Y%m%d_%H%M%S')}.html\"\n",
    "        with open(html_filename, 'w', encoding='utf-8') as f:\n",
    "            f.write(html_content)\n",
    "\n",
    "        print(f\"✓ HTML report generated: {html_filename}\")\n",
    "\n",
    "        # Generate PDF report (if weasyprint is available)\n",
    "        if WEASYPRINT_AVAILABLE:\n",
    "            try:\n",
    "                pdf_filename = html_filename.replace('.html', '.pdf')\n",
    "                weasyprint.HTML(string=html_content).write_pdf(pdf_filename)\n",
    "                print(f\"✓ PDF report generated: {pdf_filename}\")\n",
    "            except Exception as e:\n",
    "                # PDF export is best-effort; the HTML report has already been written\n",
    "                print(f\"⚠️  PDF generation failed: {str(e)}\")\n",
    "\n",
    "        return html_filename\n",
    "    \n",
    "    def run_pipeline(self):\n",
    "        \"\"\"Execute every stage of the detection workflow in order.\n",
    "\n",
    "        Stages: environment setup, proteome download, peptide database build,\n",
    "        loading and validating ``CDR3_MHC.csv``, peptide generation, pathogen\n",
    "        matching, report generation and a console summary.\n",
    "\n",
    "        Returns:\n",
    "            bool: True once the report has been generated. False when\n",
    "            ``setup_environment()`` returns a falsy value, the CSV cannot be\n",
    "            read or lacks the ``CDR3`` / ``MHC`` columns, or no peptide data\n",
    "            was generated.\n",
    "        \"\"\"\n",
    "        banner = \"=\" * 60\n",
    "        print(\"🧬 Starting Immunogenomics Pathogen Detection Pipeline\")\n",
    "        print(banner)\n",
    "\n",
    "        # Stage 1: environment setup\n",
    "        environment_ready = self.setup_environment()\n",
    "        if not environment_ready:\n",
    "            return False\n",
    "\n",
    "        # Stage 2: download pathogen data\n",
    "        self.download_pathogen_proteomes()\n",
    "\n",
    "        # Stage 3: build pathogen database\n",
    "        self.load_pathogen_database()\n",
    "\n",
    "        # Stage 4: load the CDR3 table and check its columns\n",
    "        print(\"=== Loading CDR3-MHC Data ===\")\n",
    "        try:\n",
    "            samples = pd.read_csv('CDR3_MHC.csv')\n",
    "            print(f\"✓ Loaded {len(samples)} CDR3-MHC samples\")\n",
    "\n",
    "            required_columns = ['CDR3', 'MHC']\n",
    "            has_missing_column = any(col not in samples.columns for col in required_columns)\n",
    "            if has_missing_column:\n",
    "                print(f\"❌ Missing required columns. Expected: {required_columns}\")\n",
    "                print(f\"Found columns: {list(samples.columns)}\")\n",
    "                return False\n",
    "\n",
    "        except Exception as err:\n",
    "            print(f\"❌ Error loading CDR3_MHC.csv: {str(err)}\")\n",
    "            return False\n",
    "\n",
    "        # Stage 5: generate peptides\n",
    "        generated = self.generate_peptides_from_cdr3(samples)\n",
    "        if not generated:\n",
    "            print(\"❌ Failed to generate any peptides, detection terminated\")\n",
    "            return False\n",
    "\n",
    "        # Stage 6: pathogen matching\n",
    "        matches = self.match_against_pathogens(generated)\n",
    "\n",
    "        # Stage 7: generate report\n",
    "        report_path = self.generate_report(samples, generated, matches)\n",
    "\n",
    "        # Stage 8: console summary\n",
    "        positive_ids = {m['sample_id'] for m in matches}\n",
    "        print(\"\\n\" + banner)\n",
    "        print(\"🎉 Detection Complete!\")\n",
    "        print(f\"📊 Total samples: {len(samples)}\")\n",
    "        print(f\"🎯 Positive samples: {len(positive_ids)}\")\n",
    "        print(f\"🦠 Match count: {len(matches)}\")\n",
    "        print(f\"📄 Report file: {report_path}\")\n",
    "        print(banner)\n",
    "\n",
    "        return True\n",
    "\n",
    "# Usage example\n",
    "def main():\n",
    "    \"\"\"Build a pipeline with the example settings, run it and print the outcome.\"\"\"\n",
    "    settings = {\n",
    "        'similarity_threshold': 0.65,  # 65% similarity threshold\n",
    "        'num_candidates': 20,          # 20 candidate peptides per CDR3/MHC combination\n",
    "        'temperature': 0.8,            # generation temperature\n",
    "        'top_k': 5,                    # top-k sampling\n",
    "        'verbose': False,              # hide the detailed generation process\n",
    "    }\n",
    "    pipeline = PathogenDetectionPipeline(**settings)\n",
    "\n",
    "    # Run the whole workflow; it returns a success flag\n",
    "    if not pipeline.run_pipeline():\n",
    "        print(\"\\n❌ Pathogen detection pipeline execution failed\")\n",
    "        return\n",
    "\n",
    "    print(\"\\n✅ Pathogen detection pipeline executed successfully!\")\n",
    "    print(\"📁 Please check the generated HTML and PDF report files\")\n",
    "\n",
    "# Entry point: call main() when this code runs as the top-level module, which includes executing this notebook cell.\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "442f3bfb-4c04-4f9e-b7f7-b02ce7de6f52",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "environment": {
   "kernel": "conda-base-py",
   "name": "workbench-notebooks.m128",
   "type": "gcloud",
   "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m128"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel) (Local)",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
