import argparse
from pathlib import Path
import json
import numpy as np

def compute_similarity_stats(similarity_list):
    """Summarize how often similarity values equal exactly 1.0, per sample.

    For every non-empty sample in ``similarity_list`` the percentage of
    entries equal to ``1.0`` is computed; the returned statistics describe
    the distribution of those per-sample percentages.

    Args:
        similarity_list: Iterable of samples, each a sequence of similarity
            values. Empty samples are ignored.

    Returns:
        A dict with keys ``"min"``, ``"mean"``, ``"median"``, ``"max"``
        (floats, in percent) and ``"count"`` (number of non-empty samples),
        or ``None`` when there is no usable data (the input is empty/None or
        every sample is empty).
    """
    if not similarity_list:
        return None

    # Empty samples are skipped: they carry no information and would cause a
    # division by zero when computing the percentage.
    percents = [
        sum(1 for v in sample if v == 1.0) / len(sample) * 100
        for sample in similarity_list
        if sample
    ]

    # All samples were empty: NumPy reductions raise on zero-size arrays, so
    # report "no data" the same way as for an empty input.
    if not percents:
        return None

    arr = np.array(percents)
    return {
        "min": float(np.min(arr)),
        "mean": float(np.mean(arr)),
        "median": float(np.median(arr)),
        "max": float(np.max(arr)),
        "count": len(arr),
    }

def process_metrics_file(metrics_path):
    """Load a metrics JSON file and return its similarity statistics.

    The file is read as UTF-8 JSON; the value stored under the
    ``"similarity"`` key (an empty list when the key is absent) is passed to
    ``compute_similarity_stats`` and that function's result is returned
    unchanged.
    """
    with open(metrics_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    return compute_similarity_stats(payload.get("similarity", []))

def main():
    """Command-line entry point.

    Walks the sub-directories of ``--raw_values_dir`` in sorted order and,
    for each ``train_metrics.json`` / ``val_metrics.json`` file that exists,
    prints one line with the similarity statistics (or a notice when the
    file yields no similarity data).
    """
    parser = argparse.ArgumentParser(
        description="Compute similarity==1.0 stats for each dataset and split."
    )
    parser.add_argument(
        "--raw_values_dir",
        type=str,
        required=True,
        help="Path to raw_values/{model_name_safe}",
    )
    root = Path(parser.parse_args().raw_values_dir)

    split_names = ("train_metrics.json", "val_metrics.json")
    # Only directories are treated as datasets; plain files are ignored.
    dataset_dirs = (entry for entry in sorted(root.iterdir()) if entry.is_dir())

    for dataset in dataset_dirs:
        for split_name in split_names:
            candidate = dataset / split_name
            if candidate.exists():
                summary = process_metrics_file(candidate)
                label = f"{dataset.name}/{split_name}"
                if summary is None:
                    print(f"{label}: No similarity data.")
                    continue
                print(
                    f"{label}: min={summary['min']:.2f}% mean={summary['mean']:.2f}% "
                    f"median={summary['median']:.2f}% max={summary['max']:.2f}% "
                    f"count={summary['count']}"
                )

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
