# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

from typing import Dict, List, Optional, Union

from .abstract_tokenizer import MegatronTokenizerTextAbstract


class ByteLevelTokenizer(MegatronTokenizerTextAbstract):
    """
    A byte-level tokenizer that encodes text as UTF-8 bytes with user control over the EOS, BOS,
        and PAD tokens as well as the vocabulary size and a mapping of other special tokens
        to their IDs.
    """

    def __init__(
        self,
        special_tokens: Optional[Union[Dict[str, str], List[str]]] = None,
        vocab_size: int = 512,
        _eos_id: int = 0,
        _pad_id: int = 1,
        _bos_id: int = None,
    ):
        """A byte-level tokenizer that encodes text as UTF-8 bytes.

        This tokenizer treats each byte as a token, with a default vocabulary size of 512
        to accommodate UTF-8 byte values (0-255) plus special tokens. It can handle
        arbitrary text input by encoding it into bytes.

        Args:
            special_tokens: Dictionary or list of special tokens to add to the vocabulary.
                These tokens will be assigned IDs at the end of the vocabulary.
                Defaults to None.
            vocab_size: Size of the vocabulary, should be at least 256 to handle all byte values.
                Special tokens will be added after this size.
                Defaults to 512.
            _eos_id: ID to use for the end-of-sequence token.
                Defaults to 0.
            _pad_id: ID to use for the padding token.
                Defaults to 1.
            _bos_id: ID to use for the beginning-of-sequence token.
                Defaults to None.
        """
        self._eos_id = _eos_id
        self._pad_id = _pad_id
        self._bos_id = _bos_id
        self.special_token_to_id = {
            self.pad_id: self.pad_id,
            self.bos_id: self.bos_id,
            self.eos_id: self.eos_id,
        }
        # Track special byte-tokens at end of vocabulary.
        self.vocab_size = vocab_size if special_tokens is None else vocab_size + len(special_tokens)
        self.special_start = self.vocab_size
        special_tokens = {} if special_tokens is None else special_tokens
        for tok in special_tokens:
            self.special_start -= 1
            self.special_token_to_id[tok] = self.special_start
        self.id_to_special_token = {v: k for k, v in self.special_token_to_id.items()}

    # no distinction between tokens and ids.
    def text_to_tokens(self, text):
        """
        Convert a text to a list of tokens.
        """
        return self.text_to_ids(text)

    def tokens_to_text(self, tokens):
        """
        Convert a list of tokens to a text.
        """
        return self.ids_to_text(tokens)

    def text_to_ids(self, text):
        """
        Convert a text to a list of IDs.
        """
        return list(text.encode('utf-8'))

    def ids_to_text(self, ids):
        """
        Convert a list of IDs to a text.
        """
        # remove special tokens.
        ids = [x for x in ids if x < self.special_start]
        return bytes(ids).decode('utf-8', errors='ignore').rstrip()

    def tokens_to_ids(self, tokens):
        """
        Convert a list of tokens to a list of IDs.
        """
        if isinstance(tokens, str):
            tokens = [tokens]
        ids = []
        for token in tokens:
            ids.append(self.token_to_id(token))
        return ids

    def ids_to_tokens(self, ids):
        """
        Convert a list of IDs to a list of tokens.
        """
        if isinstance(ids, int):
            ids = [ids]
        tokens = []
        for id in ids:
            tokens.append(self.id_to_token(id))
        return tokens

    def token_to_id(self, token):
        """
        Convert a token to its corresponding ID.
        """
        if token in self.special_token_to_id:
            return self.special_token_to_id[token]
        else:
            return token

    def id_to_token(self, id):
        """
        Convert an ID to its corresponding token.
        """
        if id < self.special_start:
            return id
        else:
            return self.id_to_special_token[id]

    def add_special_tokens(self, special_tokens: Union[list, dict]) -> None:
        """Adds special tokens to the tokenizer."""
        raise NotImplementedError("This method is not supported for byte-level tokenizers.")

    @property
    def pad_id(self):
        """
        Get the padding ID.
        """
        return self._pad_id

    @property
    def bos_id(self):
        """
        Get the beginning-of-sequence ID.
        """
        return self._bos_id

    @property
    def eos_id(self):
        """
        Get the end-of-sequence ID.
        """
        return self._eos_id

    @property
    def eod(self):
        """
        Get the end-of-document ID.
        """
        return self._eos_id

    @property
    def unk_id(self):
        """
        Get the unknown ID.
        """
        return 259
