
class BioTokenizer:
    """Map protein / RNA sequences to integer indices and back.

    Each symbol is encoded as its position in the corresponding alphabet.
    Any symbol that is not in the alphabet is encoded as ``len(alphabet)``
    (20 for proteins, 4 for RNA). When decoding, every index outside the
    alphabet range becomes ``'X'`` (protein) or ``'N'`` (RNA).
    """

    def __init__(self):
        self.alphabet_protein = 'ACDEFGHIKLMNPQRSTVWY'  # [X] for unknown token
        self.residue_to_index = {res: idx for idx, res in enumerate(self.alphabet_protein)}
        self.index_to_residue = {idx: res for res, idx in self.residue_to_index.items()}
        self.alphabet_RNA = 'AUGC'
        # Lookup table for RNA bases, mirroring residue_to_index for proteins.
        self._rna_to_index = {base: idx for idx, base in enumerate(self.alphabet_RNA)}

    def encode(self, seq, RNA=False):
        """Encode one sequence as a list of integer indices.

        Args:
            seq: Iterable of symbols, e.g. a string or a list of
                single-character strings.
            RNA: If true, use the RNA alphabet; otherwise the protein one.

        Returns:
            A list with one index per symbol of ``seq``. Symbols that are not
            exactly one alphabet character map to the unknown index
            ``len(alphabet)`` (20 for proteins, 4 for RNA).
        """
        if RNA:
            table = self._rna_to_index
            unknown = len(self.alphabet_RNA)
        else:
            table = self.residue_to_index
            unknown = len(self.alphabet_protein)
        # Exact-match dict lookup: a substring test against the alphabet
        # string would wrongly accept '' or multi-character tokens like 'AC'.
        return [table.get(symbol, unknown) for symbol in seq]

    def decode(self, indices_list, RNA=False):
        """Decode a batch of index sequences back into strings.

        Note that, unlike ``encode``, this operates on a collection of
        sequences rather than a single one.

        Args:
            indices_list: Iterable of index sequences.
            RNA: If true, use the RNA alphabet; otherwise the protein one.

        Returns:
            A list of strings, one per entry of ``indices_list``. Indices
            outside ``[0, len(alphabet))`` decode to ``'X'`` for proteins or
            ``'N'`` for RNA.
        """
        if RNA:
            alphabet, unknown = self.alphabet_RNA, 'N'
        else:
            alphabet, unknown = self.alphabet_protein, 'X'
        size = len(alphabet)
        # Explicit range check so negative indices do not wrap around to the
        # end of the alphabet.
        return [
            ''.join(alphabet[idx] if 0 <= idx < size else unknown for idx in indices)
            for indices in indices_list
        ]