from __future__ import annotations

import os.path

import cysimdjson
import ipadic
import MeCab

from .base import Tokenizer


class TokenizerMecab(Tokenizer):
    """Word tokenizer backed by a MeCab tagger running with ``-Owakati``.

    The tagger is stored on the class rather than on instances, and
    ``build`` constructs instances with ``tokenizer=None``. The stated reason
    is multiprocessing -- presumably so that instances can be handed to worker
    processes without carrying the MeCab handle (TODO confirm).
    """

    # Class-level tagger. Created eagerly by ``build`` and otherwise lazily on
    # the first ``_tokenize`` call.
    tokenizer: MeCab.Tagger

    @property
    def unk_idx(self) -> int:
        """Return the index stored in the dictionary for ``UNK_TOKEN``."""
        return self.dictionary[self.UNK_TOKEN]

    @classmethod
    def _initialize(cls, mecab_args: str = ipadic.MECAB_ARGS) -> MeCab.Tagger:
        """Create the class-level MeCab tagger and return it.

        Args:
            mecab_args (str): MeCab arguments appended after ``-Owakati``.
                Defaults to ``ipadic.MECAB_ARGS``.

        Returns:
            MeCab.Tagger: The tagger that was just stored on the class.
        """
        cls.tokenizer = MeCab.Tagger(f"-Owakati {mecab_args}")
        return cls.tokenizer

    @classmethod
    def build(cls, name_or_path: str) -> TokenizerMecab:
        """Build a tokenizer from a vocabulary directory.

        Args:
            name_or_path (str): Path to the directory that contains the
                ``vocab.json`` vocabulary file.

        Returns:
            TokenizerMecab: A tokenizer whose dictionary is the loaded
                vocabulary plus an entry for ``UNK_TOKEN``.
        """
        # For multiprocessing purpose: the tagger is kept on the class.
        # ``_initialize`` performs the assignment itself.
        cls._initialize()

        parser = cysimdjson.JSONParser()
        dictionary = parser.load(os.path.join(name_or_path, "vocab.json")).export()
        # The unknown token takes the index right after the largest one in use.
        dictionary[cls.UNK_TOKEN] = max(dictionary.values()) + 1

        # Set tokenizer=None for multiprocessing purpose.
        return cls(None, dictionary)

    @classmethod
    def _tokenize(cls, line: str) -> list[str]:
        """Tokenize the input line with the class-level tagger.

        The tagger is created on demand with the default arguments when it has
        not been set up in the current process (for example, in a worker that
        re-imported this module, or when the instance was not created through
        ``build``).

        Args:
            line (str): An input line.

        Returns:
            list[str]: The tokenized line.
        """
        tagger = getattr(cls, "tokenizer", None)
        if tagger is None:
            tagger = cls._initialize()

        line = line.strip()
        # Drop the trailing whitespace of the parsed output, then split on
        # single spaces.
        return tagger.parse(line).rstrip().split(" ")

    def tokenize(self, line: str) -> list[str]:
        """Tokenize the input line.

        Args:
            line (str): An input line.

        Returns:
            list[str]: The tokenized line.
        """
        return self._tokenize(line)
