
import os
import json
from transformers import AutoTokenizer

import re

# Tokenizer instance created once when the module is executed; it is the
# object handed to get_token_count further down in this file.
# NOTE(review): the argument looks like a local checkpoint directory — confirm
# it exists on the machine running this script.
tokenizer = AutoTokenizer.from_pretrained("user/models/Qwen2.5-32B-Instruct")

def get_token_count(tokenizer, text):
    """Return the number of items ``tokenizer.encode(text)`` produces.

    ``encode`` is called with its default arguments and nothing is removed
    from its result before counting.

    Args:
        tokenizer: Any object exposing an ``encode(text)`` method whose
            result supports ``len()``.
        text: The string to encode.

    Returns:
        The length of the encoded sequence.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)



# Path to a JSONL results file. Within this file it is only referenced by the
# commented-out loop at the bottom, which reads it line by line.
file = "user/user1/lm-evaluation-harness/pivotal_step/results/reasoning_baseline/aime/dpsk-distill-qwen2.5-32b_baseline.jsonl"


# Literal phrases that split_by_reflection joins into one regex alternation
# and uses as split points.
#
# A regex alternation takes the first alternative that matches at a position,
# so a phrase that extends another one ("Wait a moment" vs. "Wait") is listed
# before the shorter phrase; with the shorter one first, the longer phrase
# could never match. Each phrase appears exactly once.
REFLECTION_WORDS = [
    "Wait, that's wrong",
    "Wait a moment",
    "Wait",
    "Alternatively",
    "Actually",
    "On second thought",
    "However",
    "That can't be right",
    "Hold on",
    "Let's reconsider",
    "I realize",
    "It might be better",
    "Let's double-check",
    "Upon reconsideration",
    "Another approach",
    "Thinking again",
    "Let me check",
]



def split_by_reflection(text, words=None):
    """Split ``text`` into segments at every occurrence of a reflection phrase.

    The phrases are matched literally (each one is regex-escaped) and
    case-sensitively. Because the pattern is wrapped in a capturing group,
    every matched phrase is kept as its own segment between the surrounding
    pieces of text. Each segment is stripped of leading/trailing whitespace
    and segments that are empty after stripping are dropped.

    Args:
        text: The string to split.
        words: Optional iterable of literal phrases to split on. When omitted
            or ``None``, the module-level ``REFLECTION_WORDS`` is used.

    Returns:
        A list of non-empty, stripped segments in their original order.
    """
    if words is None:
        words = REFLECTION_WORDS

    # Drop empty phrases and duplicates, then put the longest phrases first.
    # Alternation takes the first alternative that matches, so without this
    # ordering "Wait" would shadow "Wait a moment" and split it in two.
    phrases = sorted(
        dict.fromkeys(w for w in words if w),
        key=len,
        reverse=True,
    )

    # An empty alternation would match the empty string at every position and
    # shred the text into single characters; treat it as "nothing to split on".
    if not phrases:
        whole = text.strip()
        return [whole] if whole else []

    pattern = "|".join(map(re.escape, phrases))
    stripped_pieces = (piece.strip() for piece in re.split(f"({pattern})", text))
    return [piece for piece in stripped_pieces if piece]


# Smoke test executed at module level: split a placeholder string, print how
# many segments came out, then print the token count of each segment.
text = "xxxxxxxxx"
parts = split_by_reflection(text)

print(len(parts))
for p in parts:
    segment_token_count = get_token_count(tokenizer, p)
    print(segment_token_count)

# with open(file, "r") as f:
#     for line in f:
#         item = json.loads(line)
#         print('problem_id:', item["problem_id"])
#         print('prefix_token:', get_token_count(tokenizer, item["prefix"]))


