"""
Debug tokenizer decoder configuration to see why special tokens are missing.
"""

from tokenizers import Tokenizer

# Load the serialized tokenizer; the relative path is resolved against the
# current working directory.
tokenizer = Tokenizer.from_file("tokenizer_32k.json")

# Report the decoder component attached to the loaded tokenizer.
print("=" * 80, "Tokenizer Decoder Configuration", "=" * 80, sep="\n")
print("Decoder: {}".format(tokenizer.decoder))
print()

# Sample ID sequence shared by the per-ID, full-sequence, and manual checks.
test_ids = [1, 641, 22, 6401, 2, 641, 21, 703, 206, 694, 0]
print(f"Token IDs: {test_ids}")
print()

# Decode every ID on its own and print it beside its raw vocabulary entry so
# the two can be compared line by line.
print("Individual token decoding:")
for tid in test_ids:
    token = tokenizer.id_to_token(tid)
    # id_to_token() returns None for an ID outside the vocabulary, and None
    # rejects the ``:20s`` format spec; show a placeholder instead of crashing.
    label = token if token is not None else "<not in vocab>"
    decoded = tokenizer.decode([tid])
    print(f"  ID {tid:4d} -> token: '{label:20s}' -> decoded: '{decoded}'")

print()
print("="*80)
print("Full sequence decode:")
print("="*80)
# decode() skips special tokens unless skip_special_tokens=False is passed
# (the parameter defaults to True), so print both variants side by side to
# show exactly what the default drops.
full_decoded = tokenizer.decode(test_ids)
print(f"Result: '{full_decoded}'")
full_with_special = tokenizer.decode(test_ids, skip_special_tokens=False)
print(f"Result (skip_special_tokens=False): '{full_with_special}'")
print()

# Rebuild the text straight from the vocabulary, bypassing decode(), so the
# output contains whatever string the vocabulary holds for each ID.
print("="*80)
print("Manual reconstruction from vocab:")
print("="*80)
# id_to_token() returns None for IDs outside the vocabulary; substitute a
# visible placeholder rather than failing on str + None.
vocab_tokens = (tokenizer.id_to_token(tid) for tid in test_ids)
manual = "".join(
    token if token is not None else f"<id {tid} not in vocab>"
    for tid, token in zip(test_ids, vocab_tokens)
)
print(f"Result: '{manual}'")
