import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import re
from tqdm import tqdm
from transformers import AutoTokenizer
import random
import shutil
import time

# Handed to AutoTokenizer.from_pretrained() in check_apply_chat_template.
# NOTE(review): the value is an absolute filesystem path, so this presumably
# points at a local checkpoint directory -- confirm it exists where this runs.
MODEL_NAME = "/root/user/models/Qwen3-Coder-30B-A3B-Instruct"

def load_jsonl(in_file: str) -> List[Any]:
    """Read a JSON Lines file into a list of decoded records.

    Args:
        in_file: Path of a UTF-8 encoded file holding one JSON document per
            line.

    Returns:
        One decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    datas = []
    with open(in_file, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # Blank lines (e.g. a trailing empty line at end of file) carry no
            # record and would make json.loads raise, so skip them.
            if not line.strip():
                continue
            datas.append(json.loads(line))
    return datas


def save_jsonl(datas, out_file):
    """Write every item of ``datas`` to ``out_file`` as one JSON document per line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. Items are serialized with ``json.dumps`` defaults and
    each is followed by a newline. Progress is reported through ``tqdm``.
    """
    with open(out_file, mode="w", encoding="utf-8") as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in tqdm(datas))


def check_apply_chat_template(trajectories, sample_size=5000):
    """Tokenize a random sample of trajectories and report their token lengths.

    Each sampled trajectory is rendered through the tokenizer's chat template
    (loaded from ``MODEL_NAME``); its token count is printed, and the largest
    count is printed at the end.

    Args:
        trajectories: Sequence of dicts. Every sampled item is read for
            ``"kind"``; items that are not skipped are also read for
            ``"messages"`` and ``"tools"``.
        sample_size: Upper bound on how many trajectories are sampled. The
            whole input is used when it holds fewer items.

    Returns:
        The largest token count seen, or 0 when nothing was tokenized.

    Raises:
        KeyError: If a sampled item lacks one of the keys read from it.
    """
    samples = random.sample(trajectories, min(sample_size, len(trajectories)))
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    max_len = 0
    # Wrap the list itself rather than enumerate(...): tqdm can then read its
    # length and show a total / ETA instead of a bare counter.
    for sample in tqdm(samples):
        if sample["kind"] == "uncompressed_messages":
            continue
        # Ask explicitly for a flat list of token ids. With return_dict=True
        # the call returns a mapping, and len() would count its keys rather
        # than the tokens.
        formatted_ids = tokenizer.apply_chat_template(
            sample["messages"],
            tools=sample["tools"],
            tokenize=True,
            return_dict=False,
        )
        length = len(formatted_ids)
        print(length)
        max_len = max(max_len, length)
    print(f"\n\nmax_len: {max_len}")
    return max_len


if __name__ == "__main__":
    # Script entry point: load the JSONL file below and run the chat-template
    # length check over its records.
    source_path = (
        "src/run_process_data/jsonl_files/"
        "nextjs_github-repos_filtered-with-info_backtranslated.jsonl"
    )
    check_apply_chat_template(load_jsonl(source_path))