# fix_align_direct.py
import os
import shutil

# Force Online.
# This has to happen BEFORE huggingface_hub / transformers are imported:
# huggingface_hub evaluates the offline flags once, at import time, and stores
# the result in a module-level constant. Clearing the variables after the
# import (as this script used to do) leaves the libraries in offline mode.
# TRANSFORMERS_OFFLINE is cleared as well because huggingface_hub treats it as
# an alias for HF_HUB_OFFLINE.
os.environ.pop("HF_HUB_OFFLINE", None)
os.environ.pop("TRANSFORMERS_OFFLINE", None)

from huggingface_hub import hf_hub_download  # noqa: E402  (must follow env cleanup)
from transformers import XLMRobertaTokenizerFast, AutoConfig  # noqa: E402

def _discard_path(path):
    """Delete *path*, whether it is a directory tree, a plain file or a symlink."""
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    else:
        os.remove(path)


def fix_align_direct():
    """Rebuild the local ``xlm-roberta-large`` tokenizer/config folder.

    Steps, all relative to the current working directory:

    1. Move any existing ``xlm-roberta-large`` entry aside and create a fresh
       folder.
    2. Download ``sentencepiece.bpe.model``, ``tokenizer.json`` and
       ``config.json`` from the ``xlm-roberta-large`` Hub repo into it.
    3. Build an ``XLMRobertaTokenizerFast`` directly from the two downloaded
       tokenizer files instead of going through ``from_pretrained``.
    4. Save the tokenizer back into the folder via ``save_pretrained``.
    5. Load the config with ``AutoConfig`` and save it back as well.

    If any step raises, the partially built folder is removed and the
    previous entry (if there was one) is put back, so a failed run (for
    example a network error) does not destroy what was there before. The
    exception is then propagated unchanged. When there was no previous entry,
    the partially built folder is left in place.

    Returns:
        None. Progress is reported on stdout.
    """
    folder_name = "xlm-roberta-large"
    repo_id = "xlm-roberta-large"
    cwd = os.getcwd()

    print("===========================================================")
    print("DIRECT INSTANTIATION FIX FOR ALIGN")
    print(f"Target: {os.path.join(cwd, folder_name)}")
    print("===========================================================\n")

    # 1. Clean and Create Folder
    # Rename instead of deleting up front: the old content is only discarded
    # once the rebuild has fully succeeded. lexists + rename also copes with a
    # plain file or a (possibly dangling) symlink sitting at the target path,
    # which shutil.rmtree would refuse to remove.
    backup_path = None
    if os.path.lexists(folder_name):
        backup_path = f"{folder_name}.bak.{os.getpid()}"
        os.rename(folder_name, backup_path)

    succeeded = False
    try:
        os.makedirs(folder_name, exist_ok=True)

        # 2. Download Essential Files
        print("Downloading raw files...")
        # We use local_dir to put them exactly where we want them
        downloaded = {
            filename: hf_hub_download(
                repo_id,
                filename,
                local_dir=folder_name,
                local_dir_use_symlinks=False,
            )
            for filename in ("sentencepiece.bpe.model", "tokenizer.json", "config.json")
        }

        print("Files downloaded.")

        # 3. Instantiate DIRECTLY (Bypassing from_pretrained bug)
        print("Instantiating Tokenizer via direct paths...")
        # We pass the paths of the specific downloaded files.
        # This skips the logic that tries (and fails) to parse config.json automatically.
        tokenizer = XLMRobertaTokenizerFast(
            vocab_file=downloaded["sentencepiece.bpe.model"],
            tokenizer_file=downloaded["tokenizer.json"],
        )

        # 4. Save Normalized Version
        # Now that we have the object in memory, we save it back.
        # Per the original author, this writes the tokenizer_config.json that
        # OpenCLIP will need.
        print("Saving normalized tokenizer...")
        tokenizer.save_pretrained(folder_name)

        # 5. Handle Config separately
        print("Ensuring config is valid...")
        config = AutoConfig.from_pretrained(f"./{folder_name}")
        config.save_pretrained(folder_name)

        succeeded = True
    finally:
        if backup_path is not None:
            if succeeded:
                # The new folder is complete; the old content is no longer needed.
                _discard_path(backup_path)
            else:
                # Roll back: drop the partial folder and restore the old entry.
                print("Rebuild failed; restoring the previous folder.")
                shutil.rmtree(folder_name, ignore_errors=True)
                os.rename(backup_path, folder_name)

    print("\n===========================================================")
    print("SUCCESS.")
    print("The ALIGN tokenizer is repaired.")
    print("===========================================================")

# Script entry point: run the repair only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    fix_align_direct()