# pip install datasets pandas
from datasets import load_dataset
import pandas as pd

# Fetch the LiveBench judgment data via the `datasets` library and expose it
# as a pandas DataFrame. The rest of the script reads the "model", "category"
# and "score" columns from it.
LIVEBENCH_DATASET = "livebench/model_judgment"
LIVEBENCH_SPLIT = "leaderboard"

ds = load_dataset(LIVEBENCH_DATASET, split=LIVEBENCH_SPLIT)
df = ds.to_pandas()

# Display name -> LiveBench model ID as it appears in the "model" column.
# IDs are matched exactly (case-sensitive), so an ID that does not occur in
# the data yields no scores for that display name; adjust entries as needed.
wanted = {
    "Claude-3 Haiku": "claude-3-haiku-20240307",
    "Claude-3 Sonnet": "claude-3-sonnet-20240229",
    "Claude-3 Opus": "claude-3-opus-20240229",
    "Claude-3.5 Sonnet": "claude-3-5-sonnet-20240620",
    "Cohere Command R": "command-r",
    "Cohere Command R+": "command-r-plus",
    "Gemini 1.0 Pro": "gemini-1.0-pro-instruct",
    "Gemini 1.5 Flash": "gemini-1.5-flash-api-0514",
    "Gemini 1.5 Pro": "gemini-1.5-pro-api-0514",
    "GPT-3.5-turbo": "gpt-3.5-turbo-0125",
    "GPT-4-turbo": "gpt-4-turbo-2024-04-09",
    "GPT-4o": "gpt-4o-2024-05-13",
    "Llama 3 8B Instruct": "meta-llama-3-8b-instruct",
    "Llama 3 70B Instruct": "meta-llama-3-70b-instruct",
    "Mistral Large": "mistral-large-2402",
    "Mixtral 8x22B MoE": "mixtral-8x22b-instruct-v0.1",
    "o1-mini": "o1-mini-2024-09-12",
    # o1-preview is a different model from the later full o1 release, so it
    # must point at its own snapshot rather than an "o1-2024-12-17-*" ID.
    # NOTE(review): confirm this ID against the dataset's "model" values.
    "o1-preview": "o1-preview-2024-09-12",
}

# Keep only the judgment rows that belong to the requested models.
subset = df[df["model"].isin(wanted.values())]

# IDs are matched exactly; report any that never occur so a stale or
# misspelled entry in `wanted` is noticed instead of silently dropped.
missing = sorted(set(wanted.values()) - set(subset["model"]))
if missing:
    print("Warning: no rows found for model IDs: " + ", ".join(missing))

# Mean score per (model, category), still on the raw 0..1 scale. When the
# data has a "task" column, average within each task first so that a task
# with many questions does not outweigh the other tasks in its category.
# NOTE(review): this follows the task -> category -> overall averaging that
# LiveBench is understood to use for its leaderboard; confirm upstream.
keys = ["model", "category"]
if "task" in subset.columns:
    cat_mean = (subset.groupby(keys + ["task"])["score"]
                      .mean()
                      .groupby(level=keys)
                      .mean())
else:
    cat_mean = subset.groupby(keys)["score"].mean()

# Long form (one row per model/category) as percentages rounded for display.
by_cat = cat_mean.mul(100).round(1).reset_index()

# Wide form: one row per model, one column per category, not yet rounded so
# that the overall figure is not computed from already-rounded numbers.
wide_pct = (cat_mean.mul(100)
                    .reset_index()
                    .pivot(index="model", columns="category", values="score"))
wide_pct.columns.name = None  # otherwise "category" is printed as a header

# Overall = unweighted mean of the category scores available for the model.
overall = wide_pct.mean(axis=1).round(1).rename("Overall").reset_index()
wide = wide_pct.round(1).reset_index()
out = overall.merge(wide, on="model", how="left")

# Attach display names. Starting from the full `wanted` list (left merge)
# keeps a row for every requested model, including those with no data.
names = pd.DataFrame({
    "Model (your name)": list(wanted),
    "model": list(wanted.values()),
})
out = names.merge(out, on="model", how="left").sort_values("Model (your name)")

# to_string() prints every column regardless of terminal width, and the
# positional index carries no information after sorting, so it is omitted.
print(out.fillna("—").to_string(index=False))
