import argparse
from pathlib import Path
import json
import numpy as np
from transformers import AutoTokenizer

def compute_stats(tokenizer, jsonl_path):
    """Compute token-length statistics over the records of a JSONL file.

    Each non-blank line is parsed as a JSON object, and its ``"text"`` field
    is encoded with ``tokenizer.encode(..., add_special_tokens=False)``. The
    number of returned tokens is recorded per record.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sized sequence of tokens.
        jsonl_path: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A dict with keys ``"count"``, ``"mean"``, ``"min"``, ``"max"`` and
        ``"median"`` describing the per-record token counts, or ``None`` when
        the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    token_counts = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            obj = json.loads(line)
            text = obj.get("text")
            # A missing or null text is an empty sample. Without this,
            # str(None) would be tokenized as the literal word "None".
            if text is None:
                text = ""
            tokens = tokenizer.encode(str(text), add_special_tokens=False)
            token_counts.append(len(tokens))
    if not token_counts:
        return None
    arr = np.array(token_counts)
    return {
        "count": len(token_counts),
        "mean": float(np.mean(arr)),
        "min": int(np.min(arr)),
        "max": int(np.max(arr)),
        "median": float(np.median(arr)),
    }

def main():
    """Print token statistics for the known splits of every dataset folder.

    Parses ``--data_dir`` and ``--tokenizer`` from the command line, loads the
    tokenizer with ``AutoTokenizer.from_pretrained``, then walks the immediate
    subdirectories of ``--data_dir`` in sorted order. For each subdirectory,
    ``train_full.jsonl`` and ``train_val.jsonl`` are summarized with
    ``compute_stats`` when they exist, and one line per split is printed.

    Exits through ``parser.error`` (status 2) when ``--data_dir`` is not an
    existing directory.
    """
    parser = argparse.ArgumentParser(description="Compute token stats for train_full.jsonl and train_val.jsonl in each dataset folder.")
    parser.add_argument("--data_dir", type=str, required=True, help="Path to folder with dataset subfolders")
    parser.add_argument("--tokenizer", type=str, required=True, help="HuggingFace tokenizer name (e.g. 'gpt2')")
    args = parser.parse_args()

    # Validate the directory before loading the tokenizer, so a mistyped path
    # fails immediately with a usage error instead of a traceback from
    # iterdir() after the tokenizer has already been loaded.
    data_dir = Path(args.data_dir)
    if not data_dir.is_dir():
        parser.error(f"--data_dir is not an existing directory: {data_dir}")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)

    for ds_folder in sorted(data_dir.iterdir()):
        if not ds_folder.is_dir():
            continue
        for split_name in ["train_full.jsonl", "train_val.jsonl"]:
            jsonl_path = ds_folder / split_name
            if not jsonl_path.exists():
                continue
            stats = compute_stats(tokenizer, jsonl_path)
            if stats is None:
                print(f"{ds_folder.name}/{split_name}: No samples found.")
            else:
                print(f"{ds_folder.name}/{split_name}: count={stats['count']} mean={stats['mean']:.2f} min={stats['min']} max={stats['max']} median={stats['median']:.2f}")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
