import logging
import os
from pathlib import Path

import numpy as np
from transformers import GPTNeoXTokenizerFast, AutoTokenizer

logger = logging.getLogger(__name__)


def get_tokenizer(name: str, tokenizer_dir: str, **kwargs):
    """Load the GPT-NeoX tokenizer, caching it under ``tokenizer_dir / name``.

    If ``tokenizer_dir / name / "tokenizer_config.json"`` exists, the
    tokenizer is loaded from that directory via ``AutoTokenizer``. Otherwise
    it is loaded from the ``"EleutherAI/gpt-neox-20b"`` pretrained identifier
    and saved into that directory for subsequent calls.

    After loading, the tokenizer is given an ``"<eos>"`` EOS token if it has
    none, and two extra attributes (unless already present):

    * ``vocab_size_128`` -- ``len(tokenizer)`` rounded up to a multiple of 128
    * ``vocab_size_8`` -- ``len(tokenizer)`` rounded up to a multiple of 8

    Args:
        name: Tokenizer name. Only ``"neox"`` is supported.
        tokenizer_dir: Base directory for cached tokenizers; created
            (including parents) if it does not exist.
        **kwargs: Accepted for call-site compatibility; currently ignored.

    Returns:
        The loaded tokenizer with the extra attributes described above.

    Raises:
        ValueError: If ``name`` is not ``"neox"``.
    """
    # Validate before touching the filesystem. An explicit raise (rather than
    # ``assert``) keeps the check active under ``python -O``.
    if name != "neox":
        raise ValueError(
            f"Unsupported tokenizer {name!r}; only 'neox' is supported"
        )

    base_dir = Path(tokenizer_dir)
    # exist_ok avoids a crash when several processes create the directory
    # at the same time.
    base_dir.mkdir(parents=True, exist_ok=True)
    cache_dir = base_dir / name

    if (cache_dir / "tokenizer_config.json").exists():
        tokenizer = AutoTokenizer.from_pretrained(cache_dir)
    else:
        tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")
        # The directory may already exist without a config file (e.g. after an
        # interrupted save); tolerate that instead of raising FileExistsError.
        cache_dir.mkdir(parents=True, exist_ok=True)
        tokenizer.save_pretrained(cache_dir)

    if not tokenizer.eos_token:
        tokenizer.add_special_tokens({'eos_token': "<eos>"})

    # Computed after the EOS token may have been added, so the padded sizes
    # reflect the final vocabulary length. Integer ceiling division avoids a
    # float round-trip.
    vocab_len = len(tokenizer)
    if not hasattr(tokenizer, "vocab_size_128"):
        tokenizer.vocab_size_128 = -(-vocab_len // 128) * 128

    if not hasattr(tokenizer, "vocab_size_8"):
        tokenizer.vocab_size_8 = -(-vocab_len // 8) * 8

    return tokenizer
