from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import sys
import os
from datasets import load_dataset

# Find the prompt in the training parquet whose first message tokenizes to the
# largest number of tokens, then print that prompt and its token count.

# Expose only GPU index 3 to CUDA-aware libraries used by this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# `~` is a shell convention; Python APIs receive it verbatim. Without expansion
# `from_pretrained` does not see a local directory and falls back to treating
# the string as a Hub repo id, so expand the home directory explicitly.
model_name = os.path.expanduser(
    "~/overthinking/code/distill_data/ckps/r1-gsmtrain-budget"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Same `~` handling for the dataset file so the path resolves regardless of
# how the loader treats unexpanded home-directory prefixes.
data_path = os.path.expanduser(
    "~/datasets/STILL-3-Preview-RL-Data-HF/data/train_budget.parquet"
)
dataset = load_dataset('parquet', data_files=data_path, split='train')

max_len = 0
max_prompt = ""
for item in dataset:
    # Only the first entry of the prompt list is measured; the length is
    # whatever `tokenizer.encode` returns with its default settings.
    token_len = len(tokenizer.encode(item['prompt'][0]['content']))
    # Strict comparison: on ties the earliest prompt is kept.
    if token_len > max_len:
        max_len = token_len
        # Keep the whole prompt entry, not just the measured content string.
        max_prompt = item['prompt']

print(max_prompt)
print(max_len)
