from lmkit.sparse.fragment_mapper import compile_smarts, find_fragments_and_tokens
from lmkit.tools.compat import load_tokenizer

# Keyword arguments for load_tokenizer, gathered in one place so the
# model-specific paths are easy to spot and swap.
tokenizer_options = {
    "tokenizer_path": "models/transformer_sm/tokenizer.json",
    "generation_config_file": "models/transformer_sm/generation_config.json",
    "trunc_length": None,
}

# Tokenizer handed to find_fragments_and_tokens further down.
tok = load_tokenizer(**tokenizer_options)

# SMARTS patterns keyed by a human-readable fragment label.
# NOTE(review): the labels presumably come back as the keys of the
# "fragments" mapping in the search result -- confirm against compile_smarts.
smarts_by_label = {
    "phenyl": "c1ccccc1",
    "ester_strict": "[CX3](=O)[OX2H0][#6]",
    "carboxylic_acid": "[CX3](=O)[OX2H1]",
}

# Compiled query objects passed to find_fragments_and_tokens.
queries = compile_smarts(smarts_by_label)

# Example input molecule (aspirin) as a SMILES string.
smi = "CC(=O)OC1=CC=CC=C1C(=O)O"

# Run the fragment search and token mapping for the molecule.
res = find_fragments_and_tokens(smi, queries, tokenizer=tok)

# (printed label, segment key) pairs, reported in this order for every
# matched segment.
SEGMENT_REPORT_FIELDS = (
    ("  idxs:", "token_indices"),
    ("  ids :", "token_ids"),
    ("  enc :", "token_texts_from_encoding"),
    ("  dec :", "token_texts_decoded"),
)

# Walk every fragment -> occurrence -> segment and dump its token details.
for frag_label, occurrences in res["fragments"].items():
    for occurrence in occurrences:
        segments = occurrence["original"]["segments"]
        for segment in segments:
            # Header line: fragment label followed by the segment text.
            print(frag_label, segment["text"])
            for label, key in SEGMENT_REPORT_FIELDS:
                print(label, segment[key])
            print(
                "  match?",
                segment["decoded_match"],
                "mismatch_at:",
                segment["mismatch_at"],
            )
            # If the model input has a BOS token at position 0, the indices
            # above need shifting by one to line up with it, e.g.:
            #   aligned = [i + 1 for i in segment["token_indices"]]
