from __future__ import annotations

from . import register
from .base import Tokenizer


@register("transformers")
class TokenizerTransformers(Tokenizer):
    """Tokenizer that delegates to a Hugging Face ``transformers`` tokenizer.

    Instances are normally created through :meth:`build`, which loads the
    tokenizer with ``AutoTokenizer.from_pretrained`` and hands it, together
    with its vocabulary, to the base-class constructor.
    """

    @property
    def unk_idx(self) -> int:
        """Return the index of the unknown token.

        Returns:
            int: ``unk_token_id`` of the wrapped tokenizer.  When the wrapped
                tokenizer defines no unknown token, its ``pad_token_id`` is
                returned instead.
        """
        # NOTE: assumes the base class stores the wrapped tokenizer as
        # `self.tokenizer` -- the constructor is not visible in this file.
        unk_id = self.tokenizer.unk_token_id
        if unk_id is None:
            # Some pretrained tokenizers have no unknown token; keep the
            # previous behaviour (padding index) for those.
            return self.tokenizer.pad_token_id
        return unk_id

    @classmethod
    def build(cls, name_or_path: str, lang: str = "en") -> TokenizerTransformers:
        """Build a tokenizer from a pretrained model name or path.

        Args:
            name_or_path (str): Model name or path passed to
                ``AutoTokenizer.from_pretrained``.
            lang (str): Language code.  Accepted for signature compatibility
                but not used by this tokenizer.

        Returns:
            TokenizerTransformers: This class.
        """
        # Imported lazily so that `transformers` is only required when this
        # tokenizer is actually built.
        from transformers import AutoTokenizer
        from transformers.tokenization_utils import PreTrainedTokenizer

        tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(name_or_path)
        return cls(tokenizer, tokenizer.get_vocab())

    def tokenize(self, line: str) -> list[str]:
        """Tokenize the input line.

        Args:
            line (str): An input line.

        Returns:
            list[str]: The tokens produced by the wrapped tokenizer's
                ``tokenize`` method.
        """
        return self.tokenizer.tokenize(line)
