from __future__ import annotations

import os.path

import cysimdjson
import icu_tokenizer

from .base import Tokenizer


class TokenizerICU(Tokenizer):
    """Tokenizer that lowercases text and splits it with ``icu_tokenizer``.

    The underlying ICU tokenizer object is stored on the class rather than
    on the instance: ``build`` passes ``None`` to the constructor in its
    place, and ``_tokenize`` reads the class-level attribute.
    """

    # Class-level ICU tokenizer, assigned by ``_initialize`` / ``build``.
    tokenizer: icu_tokenizer.Tokenizer

    @property
    def unk_idx(self) -> int:
        """Return the vocabulary index of the unknown token."""
        return self.dictionary[self.UNK_TOKEN]

    @classmethod
    def _initialize(cls, lang: str = "en") -> icu_tokenizer.Tokenizer:
        """Create the ICU tokenizer and store it on the class.

        Args:
            lang (str): Language code passed to ``icu_tokenizer.Tokenizer``.

        Returns:
            icu_tokenizer.Tokenizer: The newly created tokenizer.
        """
        cls.tokenizer = icu_tokenizer.Tokenizer(lang=lang)
        return cls.tokenizer

    @classmethod
    def build(cls, name_or_path: str, lang: str = "en") -> TokenizerICU:
        """Build a tokenizer from a vocabulary directory.

        Args:
            name_or_path (str): Path to the directory that contains a
                ``vocab.json`` vocabulary file.
            lang (str): Language code.

        Returns:
            TokenizerICU: A new instance of this class.
        """
        # Keep the ICU tokenizer on the class (not the instance) for
        # multiprocessing purposes.
        cls.tokenizer = cls._initialize(lang)

        vocab_path = os.path.join(name_or_path, "vocab.json")
        dictionary = cysimdjson.JSONParser().load(vocab_path).export()

        # The unknown token gets the index right after the largest one in the
        # vocabulary. ``default=-1`` makes an empty vocabulary yield index 0
        # instead of raising ``ValueError`` from ``max()``.
        # NOTE(review): an UNK_TOKEN entry already present in vocab.json is
        # overwritten here — confirm this is intended.
        dictionary[cls.UNK_TOKEN] = max(dictionary.values(), default=-1) + 1

        # Pass tokenizer=None for multiprocessing purposes; tokenization uses
        # the class-level tokenizer instead.
        return cls(None, dictionary)

    @classmethod
    def _tokenize(cls, line: str) -> list[str]:
        """Tokenize the input line with the class-level ICU tokenizer.

        The line is stripped of surrounding whitespace and lowercased before
        it is tokenized.

        Args:
            line (str): An input line.

        Returns:
            list[str]: The tokenized line.
        """
        normalized = line.strip().lower()
        return cls.tokenizer.tokenize(normalized)

    def tokenize(self, line: str) -> list[str]:
        """Tokenize the input line.

        Args:
            line (str): An input line.

        Returns:
            list[str]: The tokenized line.
        """
        # Delegates to the classmethod, which reads the class-level tokenizer.
        return self._tokenize(line)
