import os
import json
import logging
import importlib
from omegaconf import OmegaConf, DictConfig
from transformers import AutoTokenizer

def build_tokenizer(cfg: DictConfig):
    """
    Create a tokenizer from a configuration node.

    The loader is chosen by the truthiness of ``cfg.local``: a truthy value
    selects the project-local loader, anything else selects the Hugging Face
    loader. The configuration is forwarded to the chosen loader unchanged.

    Args:
        cfg (DictConfig): Tokenizer configuration.

    Expected `cfg` structure:
    cfg:
        name: "bert-base-uncased"  # HF: passed to `from_pretrained`; local: module name under `auden.tokenizer`
        type: "AutoTokenizer"      # Class name looked up in `transformers` (HF) or in the local module
        local: false               # true -> local loader, false -> Hugging Face loader
        path: ./data/bbpe          # Local only: passed as the single argument to the tokenizer class

    Returns:
        The tokenizer instance produced by the selected loader.
    """
    # Only the selected branch is evaluated, so just one loader name is resolved.
    loader = _load_local_tokenizer if cfg.local else _load_huggingface_tokenizer
    return loader(cfg)

def _load_huggingface_tokenizer(cfg):
    """Resolve a tokenizer class from ``transformers`` by name and load it.

    ``cfg.type`` is looked up as an attribute of the ``transformers`` package,
    and ``cfg.name`` is handed to that class's ``from_pretrained``.

    Args:
        cfg: Configuration exposing ``type`` and ``name`` attributes.

    Returns:
        Whatever ``from_pretrained`` of the resolved class returns.
    """
    hf_package = importlib.import_module("transformers")
    # The class is resolved before logging, so an unknown `type` raises
    # without first emitting a "Loading ..." message.
    resolved_cls = getattr(hf_package, cfg.type)

    logging.info(f"Loading Hugging Face tokenizer: {cfg.name} using {cfg.type}")
    return resolved_cls.from_pretrained(cfg.name)

def _load_local_tokenizer(cfg):
    """Instantiate a project-local tokenizer class from ``auden.tokenizer``.

    The module ``auden.tokenizer.<cfg.name>`` is imported, the attribute named
    ``cfg.type`` is taken from it, and that attribute is called with
    ``cfg.path`` as its only argument.

    Args:
        cfg: Configuration exposing ``name``, ``type`` and ``path`` attributes.

    Returns:
        The object returned by calling the resolved class with ``cfg.path``.
    """
    module_name = f"auden.tokenizer.{cfg.name}"
    logging.info(f"Importing local tokenizer: {module_name}.{cfg.type}")

    # Import happens after the log line; the class is then fetched by name.
    factory = getattr(importlib.import_module(module_name), cfg.type)
    return factory(cfg.path)
