"""Peptide tokenizer ignoring any tokens with equal mass"""

from collections.abc import Iterable

from depthcharge.tokenizers import PeptideTokenizer
from depthcharge.primitives import Peptide

import re

# Lookup table used by ``from_advances_massivekb``: it maps a sequence chunk
# written with a numeric mass shift (presumably MassIVE-KB notation, judging
# by the names in this module) to the bracketed form handed to
# ``Peptide.from_proforma``. Chunks that are not listed here are passed
# through unchanged by the caller.
MSKB_TO_UNIMOD = {
    # Keys that are a bare mass shift; the replacement ends in "-", i.e. it is
    # written as a prefix in front of the first residue.
    "+42.011": "[Acetyl]-",
    "+43.006": "[Carbamyl]-",
    "-17.027": "[Ammonia-loss]-",
    "+43.006-17.027": "[+25.980265]-",  # Not in Unimod
} | {
    # Keys that are a residue letter followed by its mass shift; the
    # replacement keeps the residue and names the modification in brackets.
    "M+15.995": "M[Oxidation]",
    "N+0.984": "N[Deamidated]",
    "Q+0.984": "Q[Deamidated]",
    "C+57.021": "C[Carbamidomethyl]",
    "K+114.043": "K[GG]",
    "S+79.966": "S[Phospho]",
    "T+79.966": "T[Phospho]",
    "Y+79.966": "Y[Phospho]",
}

class MassAwarePeptideTokenizer(PeptideTokenizer):
    """Peptide tokenizer that keeps a single token per distinct residue mass.

    Built on ``depthcharge.tokenizers.PeptideTokenizer``. Before the parent
    tokenizer is initialized, every residue whose mass is exactly equal to the
    mass of a residue seen earlier is dropped from the residue table, and
    ``split`` later substitutes the retained residue for the dropped one.
    Every substitution is printed to the console.

    Instance attributes assigned by this class:
        residues: token -> mass table after duplicate removal.
        expanded_residues: copy of the ``expanded_residues`` argument (empty
            dict when none was given), minus any entries dropped as
            duplicates.
        calibration_factor: the constructor argument, stored unchanged.
        removed_aa: dropped token -> retained token of the same mass.
    """

    def __init__(
        self,
        residues: Iterable[str] | None = None,
        replace_isoleucine_with_leucine: bool = False,
        reverse: bool = False,
        start_token: str | None = None,
        stop_token: str | None = "$",
        expanded_residues: Iterable[str] | None = None,
        calibration_factor: float = 1.0
    ) -> None:
        """Build the de-duplicated residue table, then set up the parent.

        Args:
            residues: Token -> mass entries. Despite the annotation, the value
                is fed to ``dict.update`` and the result is iterated as
                (token, mass) pairs, so a mapping is expected.
            replace_isoleucine_with_leucine: Only controls whether a warning
                is printed; mass duplicates are merged whatever its value.
            reverse: Forwarded to the parent tokenizer.
            start_token: Forwarded to the parent tokenizer.
            stop_token: Forwarded to the parent tokenizer.
            expanded_residues: Additional token -> mass entries, merged after
                ``residues`` (so they win on equal keys). Must provide
                ``.copy()``; the copy must support ``.pop(key, default)``.
            calibration_factor: Stored on the instance; not otherwise used
                inside this class.
        """
        if not replace_isoleucine_with_leucine:
            print("[WARNING]: Replacement of isoleucine and leucine was set to false. This setting is not supported in the MassAwarePeptideTokenizer tokenizer. It will remove any mass conflicts.")

        self.calibration_factor = calibration_factor

        # The table starts empty: only the caller-supplied entries are used.
        combined = {}
        if residues is not None:
            combined.update(residues)
        if expanded_residues is not None:
            combined.update(expanded_residues)
        self.residues = combined

        # Work on a copy so that pruning duplicates never touches the
        # caller's object.
        self.expanded_residues = (
            {} if expanded_residues is None else expanded_residues.copy()
        )

        # Drop every conflicting residue and remember what replaces it.
        self.removed_aa = self.remove_mass_duplicates()

        # ``residues=None``: the filtered table already sits on
        # ``self.residues`` -- presumably the parent builds its vocabulary
        # from that attribute (confirm against depthcharge). The parent's own
        # isoleucine handling is switched off for the duration of its setup.
        super().__init__(
            residues=None,
            replace_isoleucine_with_leucine=False,
            reverse=reverse,
            start_token=start_token,
            stop_token=stop_token,
        )

        # Always reported as enabled once construction is complete.
        self.replace_isoleucine_with_leucine = True

    def remove_mass_duplicates(self):
        """Remove residues whose mass duplicates an earlier residue's mass.

        Walks ``self.residues`` in insertion order. The first token seen for
        a mass is retained; every later token with an exactly equal mass is
        deleted from ``self.residues`` and, when present, from
        ``self.expanded_residues``. A line is printed for each removal, plus
        a warning when the removed token was an expanded residue.

        Returns:
            Dict mapping each removed token to the retained token that has
            the same mass.
        """
        first_token_for_mass = {}
        replacements = {}

        for token, mass in self.residues.items():
            # setdefault returns the token already registered for this mass,
            # or registers (and returns) the current one.
            keeper = first_token_for_mass.setdefault(mass, token)
            if keeper != token:
                replacements[token] = keeper

        # Deletion is deferred until the iteration above has finished.
        for token in replacements:
            del self.residues[token]

        for dropped, keeper in replacements.items():
            print(f"{dropped} will be replaced by {keeper} as they share the same mass")

            # The dropped token must also leave the expanded residues.
            was_expanded = self.expanded_residues.pop(dropped, None) is not None
            if was_expanded:
                print(f"WARNING: {dropped} was in expanded residues (supposdedly not seen during trainng)")

        return replacements

    def split(self, sequence: str) -> list[str]:
        """Split a peptide sequence into tokens free of mass conflicts.

        The sequence is parsed with ``self._parse_peptide`` and split into
        tokens, the token order is reversed when ``self.reverse`` is set, and
        every token that was dropped as a mass duplicate is swapped for the
        token that was retained in its place.
        """
        tokens = self._parse_peptide(sequence).split()
        if self.reverse:
            tokens.reverse()

        return [self.removed_aa.get(token, token) for token in tokens]

def from_advances_massivekb(cls, sequence: str):
    """Parse a sequence written with numeric mass shifts into a ``Peptide``.

    The sequence is cut immediately before every uppercase letter except one
    at the very start, so each chunk is either a leading prefix such as
    ``+42.011`` or one residue letter with whatever follows it (for example
    ``M+15.995``). Chunks listed in ``MSKB_TO_UNIMOD`` are replaced by their
    mapped form, all other chunks are kept as they are, and the rejoined
    string is handed to ``Peptide.from_proforma``.

    Args:
        cls: Unused. Present so the function can be stored as a class
            attribute and called like a method.
        sequence: The peptide sequence to translate.

    Returns:
        Whatever ``Peptide.from_proforma`` returns for the translated string.
    """
    # Zero-width split: requires one preceding character (lookbehind) and an
    # uppercase letter next (lookahead); no characters are consumed.
    chunks = re.split(r"(?<=.)(?=[A-Z])", sequence)

    translated = []
    for chunk in chunks:
        translated.append(MSKB_TO_UNIMOD.get(chunk, chunk))

    proforma = "".join(translated)
    return Peptide.from_proforma(proforma)

class MassAwareMskbPeptideTokenizer(MassAwarePeptideTokenizer):
    """Mass-aware tokenizer whose sequences are parsed by ``from_advances_massivekb``."""
    # A plain function stored on the class is looked up as a method, so the
    # tokenizer instance is passed in the function's first (``cls``) slot.
    _parse_peptide = from_advances_massivekb
