import os
import shutil
import filecmp
import logging
from ..tokenizer.tokenizer_registry import TOKENIZER_REGISTRY

class AutoTokenizer:
    """Factory that builds tokenizer instances from ``TOKENIZER_REGISTRY``."""

    @classmethod
    def from_pretrained(cls, tokenizer_type: str, source: str):
        """Instantiate the tokenizer class registered under *tokenizer_type*.

        Args:
            tokenizer_type: Key looked up in ``TOKENIZER_REGISTRY``.
            source: Passed as the single positional argument to the
                registered tokenizer class.

        Returns:
            Whatever the registered class returns when called with *source*.

        Raises:
            ValueError: If *tokenizer_type* is not present in the registry.
        """
        if tokenizer_type in TOKENIZER_REGISTRY:
            tokenizer_cls = TOKENIZER_REGISTRY[tokenizer_type]
            return tokenizer_cls(source)

        raise ValueError(f"[AutoTokenizer] Unknown tokenizer type: '{tokenizer_type}'")

def _diff_tokenizer_dirs(source_dir: str, target_dir: str, prefix: str = ""):
    """Recursively collect the differences between two directory trees.

    Args:
        source_dir: Directory on the "source" side of the comparison.
        target_dir: Directory on the "target" side of the comparison.
        prefix: Path of the compared directories relative to the tree roots;
            prepended to every reported name.

    Returns:
        A ``(different, only_in_source, only_in_target)`` tuple of lists of
        paths relative to the tree roots.
    """
    cmp = filecmp.dircmp(source_dir, target_dir)

    def relative(names):
        return [os.path.join(prefix, name) for name in names]

    # dircmp.diff_files is based on a shallow os.stat() signature check, so
    # re-compare the common files byte-for-byte. Files that cannot be read
    # are treated as different rather than silently accepted.
    _, mismatched, errored = filecmp.cmpfiles(
        source_dir, target_dir, cmp.common_files, shallow=False
    )
    # common_funny: same name on both sides but different type (e.g. a file
    # vs. a directory) or not stat-able -- also a mismatch.
    different = relative(sorted(set(mismatched) | set(errored) | set(cmp.common_funny)))
    only_in_source = relative(cmp.left_only)
    only_in_target = relative(cmp.right_only)

    # The dircmp attributes above only cover this directory level, so
    # descend into every subdirectory present on both sides.
    for subdir in cmp.common_dirs:
        sub_different, sub_source, sub_target = _diff_tokenizer_dirs(
            os.path.join(source_dir, subdir),
            os.path.join(target_dir, subdir),
            os.path.join(prefix, subdir),
        )
        different.extend(sub_different)
        only_in_source.extend(sub_source)
        only_in_target.extend(sub_target)

    return different, only_in_source, only_in_target


def save_tokenizer(tokenizer_source: str, exp_dir: str) -> None:
    """
    Save the tokenizer to exp_dir/tokenizer.

    Behavior:
    - If tokenizer_source is not a directory → skip
    - If exp_dir/tokenizer does not exist → copy entire tokenizer_source
    - If exp_dir/tokenizer exists → check content match (recursively,
      comparing file contents byte-for-byte):
        - If same → do nothing
        - If different → raise error

    Args:
        tokenizer_source: Path of the tokenizer to save. Anything that is not
            an existing local directory is skipped with an info log.
        exp_dir: Experiment directory; the copy is placed in its
            ``tokenizer`` subdirectory.

    Raises:
        FileExistsError: If exp_dir/tokenizer already exists and its contents
            differ from tokenizer_source.
    """
    target_dir = os.path.join(exp_dir, "tokenizer")

    if not os.path.isdir(tokenizer_source):
        logging.info(f"[Tokenizer] Skipping save: tokenizer_source is not a local directory ({tokenizer_source})")
        return

    if not os.path.exists(target_dir):
        shutil.copytree(tokenizer_source, target_dir)
        logging.info(f"[Tokenizer] Copied tokenizer directory from {tokenizer_source} → {target_dir}")
        return

    # Compare the full trees, not just the top-level entries.
    different, only_in_source, only_in_target = _diff_tokenizer_dirs(tokenizer_source, target_dir)

    if different or only_in_source or only_in_target:
        logging.error(f"[Tokenizer] tokenizer directory already exists at {target_dir}, but differs from source: {tokenizer_source}")
        logging.error(f"  Different files: {different}")
        logging.error(f"  Only in source: {only_in_source}")
        logging.error(f"  Only in target: {only_in_target}")
        raise FileExistsError(f"[Tokenizer] Existing tokenizer at {target_dir} differs from source: {tokenizer_source}")

    logging.info("[Tokenizer] Tokenizer already exists and matches source. Skipping save.")
