import os
import pandas as pd
import glob
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm

def analyze_algorithm_files(data_dir="./data/sft", output_path="token_stats.csv"):
    """Summarize per-file token counts under two tokenizers and save them as CSV.

    Every ``algorithm_*.csv`` file directly inside ``data_dir`` is read with
    pandas, and each non-missing entry of its ``text`` column is encoded with
    the ``LiquidAI/LFM2-350M`` and ``HuggingFaceTB/SmolLM2-135M-Instruct``
    tokenizers. For each (file, tokenizer) pair the minimum, maximum, mean and
    ``np.std`` of the encoded lengths are collected; min/max/mean are also
    printed as one row. The lengths are ``len(tokenizer.encode(text))``, so
    whether special tokens are counted depends on each tokenizer's defaults.

    Args:
        data_dir: Directory searched (non-recursively) for ``algorithm_*.csv``.
        output_path: Destination of the summary CSV, written with the columns
            ``algorithm, tokenizer, min, max, avg, std``.

    Returns:
        None. If no input file matches, a message is printed and nothing is
        loaded or written.

    Raises:
        KeyError: If a matched CSV has no ``text`` column.
    """
    csv_files = sorted(glob.glob(f"{data_dir}/algorithm_*.csv"))
    if not csv_files:
        # Bail out before loading any tokenizer: nothing to measure, and an
        # existing results file should not be replaced by an empty one.
        print(f"No algorithm_*.csv files found in {data_dir}")
        return

    tokenizers = [
        ('Liquid', AutoTokenizer.from_pretrained("LiquidAI/LFM2-350M")),
        ('SmolLM', AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")),
    ]
    columns = ['algorithm', 'tokenizer', 'min', 'max', 'avg', 'std']
    prefix = 'algorithm_'

    results = []

    for csv_file in tqdm(csv_files, desc="Processing algorithms"):
        # Strip only the leading prefix and the trailing extension; a plain
        # str.replace would also remove these substrings mid-name.
        stem = os.path.splitext(os.path.basename(csv_file))[0]
        algo = stem[len(prefix):]

        df = pd.read_csv(csv_file)
        # Missing cells are read as NaN floats, which cannot be encoded.
        texts = df['text'].dropna().astype(str).tolist()
        if not texts:
            # min()/max() of an empty list would raise ValueError.
            print(f"Skipping {algo}: no text rows")
            continue

        for name, tokenizer in tokenizers:
            token_counts = [
                len(tokenizer.encode(text))
                for text in tqdm(texts, desc=f"{algo} {name}", leave=False)
            ]

            # Compute each statistic once and reuse it for both outputs.
            stats = {
                'algorithm': algo,
                'tokenizer': name,
                'min': min(token_counts),
                'max': max(token_counts),
                'avg': np.mean(token_counts),
                'std': np.std(token_counts),
            }
            results.append(stats)

            print(f"{algo:<35} {name:<8} {stats['min']:<6} {stats['max']:<6} {stats['avg']:<8.1f}")

    # Explicit columns keep the header row even if every file was skipped.
    df_results = pd.DataFrame(results, columns=columns)
    df_results.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")

# Script entry point: run the analysis with the function's default arguments
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    analyze_algorithm_files()