import datetime
import json
import os
from pathlib import Path

from ale_bench.data import Result, load_problem
from tqdm.auto import tqdm

# The problem-ID lists sit in the parent of this file's directory, one ID per line.
# `Path.read_text` opens and closes the file itself, so no file handle is left
# dangling at import time (the previous bare `open(...).read()` never closed it).
_PROBLEM_ID_DIR = Path(__file__).resolve().parents[1]
PROBLEM_IDS = (_PROBLEM_ID_DIR / "problem_ids.txt").read_text().splitlines()
PROBLEM_IDS_LITE = (_PROBLEM_ID_DIR / "problem_ids_lite.txt").read_text().splitlines()

# Fallback performance per problem ID. `collect_results_baseline` starts from
# this value and only replaces it when a usable private result is available.
MINIMUM_PERFORMANCES = {
    "ahc001": -50,
    "ahc002": -65,
    "ahc003": 234,
    "ahc004": -21,
    "ahc005": 207,
    "ahc006": 116,
    "ahc007": -61,
    "ahc008": 247,
    "ahc009": 42,
    "ahc010": -110,
    "ahc011": -80,
    "ahc012": 123,
    "ahc014": 111,
    "ahc015": -1,
    "ahc016": 56,
    "ahc017": 74,
    "ahc019": 51,
    "ahc020": -70,
    "ahc021": -102,
    "ahc024": 277,
    "ahc025": 467,
    "ahc026": 223,
    "ahc027": 103,
    "ahc028": -42,
    "ahc030": 194,
    "ahc031": 117,
    "ahc032": -155,
    "ahc033": 132,
    "ahc034": 201,
    "ahc035": -200,
    "ahc038": 94,
    "ahc039": 265,
    "ahc040": 145,
    "ahc041": 75,
    "ahc042": 50,
    "ahc044": -80,
    "ahc045": 117,
    "ahc046": 116,
    "future-contest-2022-qual": 178,
    "toyota2023summer-final": 483,
}

# Contest end dates as "YYYY-MM-DD" strings, keyed by contest/problem ID.
_CONTEST_END_DATE_STRINGS = {
    "ahc001": "2021-03-14",
    "ahc002": "2021-04-25",
    "ahc003": "2021-05-30",
    "ahc004": "2021-06-26",
    "ahc005": "2021-08-07",
    "ahc006": "2021-11-14",
    "ahc007": "2021-12-12",
    "ahc008": "2022-02-26",
    "ahc009": "2022-03-26",
    "ahc010": "2022-04-24",
    "ahc011": "2022-06-05",
    "ahc012": "2022-07-03",
    "ahc013": "2022-08-16",
    "ahc014": "2022-10-01",
    "ahc015": "2022-10-30",
    "ahc016": "2022-11-20",
    "ahc017": "2023-02-05",
    "ahc018": "2023-02-26",
    "ahc019": "2023-04-02",
    "ahc020": "2023-06-11",
    "ahc021": "2023-06-25",
    "ahc022": "2023-08-20",
    "ahc023": "2023-09-10",
    "ahc024": "2023-09-24",
    "ahc025": "2023-10-22",
    "ahc026": "2023-11-05",
    "ahc027": "2023-12-10",
    "ahc028": "2024-01-13",
    "ahc029": "2023-12-26",
    "ahc030": "2024-02-19",
    "ahc031": "2024-04-01",
    "ahc032": "2024-04-07",
    "ahc033": "2024-05-27",
    "ahc034": "2024-06-16",
    "ahc035": "2024-07-21",
    "ahc036": "2024-09-02",
    "ahc037": "2024-09-15",
    "ahc038": "2024-10-14",
    "ahc039": "2024-11-10",
    "ahc040": "2024-12-09",
    "ahc041": "2025-01-19",
    "ahc042": "2025-02-02",
    "ahc043": "2025-02-24",
    "ahc044": "2025-03-16",
    "ahc045": "2025-04-07",
    "ahc046": "2025-04-26",
    "rcl-contest-2021-long": "2021-09-12",
    "future-contest-2022-qual": "2021-11-13",
    "toyota2023summer-final": "2023-08-27",
}

# Same mapping with every date parsed into a naive `datetime` (time 00:00:00).
CONTEST_END_DATES = {
    contest_id: datetime.datetime.strptime(end_date, "%Y-%m-%d")
    for contest_id, end_date in _CONTEST_END_DATE_STRINGS.items()
}

# Set of problem IDs tagged as "long".
# NOTE(review): presumably the long-format contests; not referenced elsewhere
# in the visible part of this file - confirm against callers.
LONG_PROBLEM_IDS = {
    "ahc001",
    "ahc003",
    "ahc008",
    "ahc011",
    "ahc014",
    "ahc016",
    "ahc017",
    "ahc019",
    "ahc025",
    "ahc027",
    "ahc030",
    "ahc031",
    "ahc033",
    "ahc038",
    "ahc040",
    "ahc045",
    "future-contest-2022-qual",
}

# Cutoff strings for the reasoning-model names; each one is also registered
# below under "<name>-high".
# NOTE(review): two names here already end in "-high", so the suffixed copies
# come out as "o3-high-high" and "o4-mini-high-high". Kept as-is to preserve
# the existing mapping - confirm whether plain "o3" / "o4-mini" were intended.
_REASONING_MODEL_CUTOFFS = {
    "o1-mini": "2023-10-01",
    "o1": "2023-10-01",
    "o1-pro": "2023-10-01",
    "o3-mini": "2023-10-01",
    "o3-high": "2024-06-01",
    "o4-mini-high": "2024-06-01",
}

_KNOWLEDGE_CUTOFF_STRINGS = {
    "gpt-4o-mini": "2023-10-01",
    "gpt-4o": "2023-10-01",
    "gpt-4.1-nano": "2024-06-01",
    "gpt-4.1-mini": "2024-06-01",
    "gpt-4.1": "2024-06-01",
    **_REASONING_MODEL_CUTOFFS,
    **{f"{name}-high": cutoff for name, cutoff in _REASONING_MODEL_CUTOFFS.items()},
    "gemini-2.5-pro": "2025-02-01",
    "gemini-2.5-pro-thinking": "2025-02-01",
    "gemini-2.5-flash": "2025-02-01",
    "gemini-2.5-flash-thinking": "2025-02-01",
    "claude-3.7-sonnet": "2024-11-01",
    # NOTE(review): differs from the non-thinking entry right above - confirm
    # that 2024-05-01 is intended for the thinking variant.
    "claude-3.7-sonnet-thinking": "2024-05-01",
    "deepseek-v3": "2025-03-24",
    "deepseek-r1": "2025-01-20",
}

# Short model name -> knowledge cutoff as a naive `datetime` (time 00:00:00).
KNOWLEDGE_CUTOFFS = {
    name: datetime.datetime.strptime(cutoff, "%Y-%m-%d")
    for name, cutoff in _KNOWLEDGE_CUTOFF_STRINGS.items()
}

# Per-model pricing and limits for OpenAI models, keyed by the dated model ID.
# Each entry has the same five fields:
#   prompt_tokens / completion_tokens: price per single token, written as
#       <price per 1M tokens> / 1e6 (currency presumably USD - confirm);
#       `calculate_costs_openai` multiplies them by the usage counts.
#   context_window / max_output_tokens: token limits.
#   knowledge_cutoff: "YYYY-MM-DD" string.
OPENAI_MODEL_SPECS = {
    # Normal models
    "gpt-4o-mini-2024-07-18": {
        "prompt_tokens": 0.15 / 1e6,
        "completion_tokens": 0.60 / 1e6,
        "context_window": 128000,
        "max_output_tokens": 16384,
        "knowledge_cutoff": "2023-10-01",
    },
    "gpt-4o-2024-05-13": {
        "prompt_tokens": 5.00 / 1e6,
        "completion_tokens": 15.00 / 1e6,
        "context_window": 128000,
        "max_output_tokens": 4096,
        "knowledge_cutoff": "2023-10-01",
    },
    "gpt-4o-2024-08-06": {
        "prompt_tokens": 2.50 / 1e6,
        "completion_tokens": 10.00 / 1e6,
        "context_window": 128000,
        "max_output_tokens": 16384,
        "knowledge_cutoff": "2023-10-01",
    },
    "gpt-4o-2024-11-20": {
        "prompt_tokens": 2.50 / 1e6,
        # This key was missing, which made `calculate_costs_openai` raise
        # KeyError for this model. Value follows OpenAI's published price for
        # this snapshot (same as gpt-4o-2024-08-06).
        "completion_tokens": 10.00 / 1e6,
        "context_window": 128000,
        "max_output_tokens": 16384,
        "knowledge_cutoff": "2023-10-01",
    },
    "gpt-4.1-nano-2025-04-14": {
        "prompt_tokens": 0.10 / 1e6,
        "completion_tokens": 0.40 / 1e6,
        "context_window": 1047576,
        "max_output_tokens": 32768,
        "knowledge_cutoff": "2024-06-01",
    },
    "gpt-4.1-mini-2025-04-14": {
        "prompt_tokens": 0.40 / 1e6,
        "completion_tokens": 1.60 / 1e6,
        "context_window": 1047576,
        "max_output_tokens": 32768,
        "knowledge_cutoff": "2024-06-01",
    },
    "gpt-4.1-2025-04-14": {
        "prompt_tokens": 2.00 / 1e6,
        "completion_tokens": 8.00 / 1e6,
        "context_window": 1047576,
        "max_output_tokens": 32768,
        "knowledge_cutoff": "2024-06-01",
    },
    # Reasoning models
    "o1-mini-2024-09-12": {
        "prompt_tokens": 1.10 / 1e6,
        "completion_tokens": 4.40 / 1e6,
        "context_window": 128000,
        "max_output_tokens": 65536,
        "knowledge_cutoff": "2023-10-01",
    },
    "o1-2024-12-17": {
        "prompt_tokens": 15.00 / 1e6,
        "completion_tokens": 60.00 / 1e6,
        "context_window": 200000,
        "max_output_tokens": 100000,
        "knowledge_cutoff": "2023-10-01",
    },
    "o3-mini-2025-01-31": {
        "prompt_tokens": 1.10 / 1e6,
        "completion_tokens": 4.40 / 1e6,
        "context_window": 200000,
        "max_output_tokens": 100000,
        "knowledge_cutoff": "2023-10-01",
    },
    "o1-pro-2025-03-19": {
        "prompt_tokens": 150.00 / 1e6,
        "completion_tokens": 600.00 / 1e6,
        "context_window": 200000,
        "max_output_tokens": 100000,
        "knowledge_cutoff": "2023-10-01",
    },
    "o3-2025-04-16": {
        "prompt_tokens": 10.00 / 1e6,
        "completion_tokens": 40.00 / 1e6,
        "context_window": 200000,
        "max_output_tokens": 100000,
        "knowledge_cutoff": "2024-06-01",
    },
    "o4-mini-2025-04-16": {
        "prompt_tokens": 1.10 / 1e6,
        "completion_tokens": 4.40 / 1e6,
        "context_window": 200000,
        "max_output_tokens": 100000,
        "knowledge_cutoff": "2024-06-01",
    },
}


def calculate_costs_openai(response_result: dict, model: str) -> tuple[float, dict]:
    """Price a single OpenAI response.

    Args:
        response_result: Response payload; its ``"usage"`` entry must provide
            ``"prompt_tokens"`` and ``"completion_tokens"`` counts.
        model: Key into ``OPENAI_MODEL_SPECS`` selecting the per-token prices.

    Returns:
        ``(cost, usage)`` where ``cost`` is the prompt cost plus the completion
        cost and ``usage`` is the response's usage mapping, returned untouched.

    Raises:
        KeyError: If the model or a required usage/price field is missing.
    """
    usage = response_result["usage"]
    prompt_cost = usage["prompt_tokens"] * OPENAI_MODEL_SPECS[model]["prompt_tokens"]
    completion_cost = usage["completion_tokens"] * OPENAI_MODEL_SPECS[model]["completion_tokens"]
    total_cost = prompt_cost + completion_cost
    return total_cost, usage


# Per-model pricing and limits for Google GenAI models, keyed by model version.
# Every row below is
#   (model, prompt tiers, candidates tiers, context window, max output tokens,
#    knowledge cutoff)
# and is expanded into the dict shape `calculate_costs_google_genai` reads.
# A tier is `(start, end, price)`; candidates tiers of the 2.5 models carry a
# fourth element, which the calculator uses when thinking tokens are reported.
# Prices are written as <price per 1M tokens> / 1e6, i.e. per single token.
GOOGLE_GENAI_MODEL_SPECS = {
    model_version: {
        "prompt_token": prompt_tiers,
        "candidates_token": candidates_tiers,
        "context_window": context_window,
        "max_output_tokens": max_output_tokens,
        "knowledge_cutoff": knowledge_cutoff,
    }
    for (
        model_version,
        prompt_tiers,
        candidates_tiers,
        context_window,
        max_output_tokens,
        knowledge_cutoff,
    ) in (
        (
            "gemini-1.5-flash-8b-exp-0827",
            [(0, 128000, 0.0375 / 1e6), (128000, 1048576, 0.075 / 1e6)],
            [(0, 128000, 0.15 / 1e6), (128000, 1048576, 0.30 / 1e6)],
            1048576, 8192, "",
        ),
        (
            "gemini-1.5-flash-8b-001",
            [(0, 128000, 0.0375 / 1e6), (128000, 1048576, 0.075 / 1e6)],
            [(0, 128000, 0.15 / 1e6), (128000, 1048576, 0.30 / 1e6)],
            1048576, 8192, "",
        ),
        (
            "gemini-1.5-flash-001",
            [(0, 128000, 0.075 / 1e6), (128000, 1048576, 0.15 / 1e6)],
            [(0, 128000, 0.30 / 1e6), (128000, 1048576, 0.60 / 1e6)],
            1048576, 8192, "",
        ),
        (
            "gemini-1.5-flash-002",
            [(0, 128000, 0.075 / 1e6), (128000, 1048576, 0.15 / 1e6)],
            [(0, 128000, 0.30 / 1e6), (128000, 1048576, 0.60 / 1e6)],
            1048576, 8192, "",
        ),
        (
            "gemini-1.5-pro-001",
            [(0, 128000, 1.25 / 1e6), (128000, 2097152, 2.50 / 1e6)],
            [(0, 128000, 5.00 / 1e6), (128000, 2097152, 10.00 / 1e6)],
            2097152, 8192, "",
        ),
        (
            "gemini-1.5-pro-002",
            [(0, 128000, 1.25 / 1e6), (128000, 2097152, 2.50 / 1e6)],
            [(0, 128000, 5.00 / 1e6), (128000, 2097152, 10.00 / 1e6)],
            2097152, 8192, "",
        ),
        (
            "gemini-2.0-flash-lite-001",
            [(0, 1048576, 0.075 / 1e6)],
            [(0, 1048576, 0.30 / 1e6)],
            1048576, 8192, "2024-09-01",
        ),
        (
            "gemini-2.0-flash-001",
            [(0, 1048576, 0.10 / 1e6)],
            [(0, 1048576, 0.40 / 1e6)],
            1048576, 8192, "2024-09-01",
        ),
        (
            "gemini-2.5-pro-preview-03-25",
            [(0, 200000, 1.25 / 1e6), (200000, 1048576, 2.50 / 1e6)],
            [(0, 200000, 10.00 / 1e6, 10.00 / 1e6), (200000, 1048576, 15.00 / 1e6, 15.00 / 1e6)],
            1048576, 65536, "2025-02-01",
        ),
        (
            "gemini-2.5-flash-preview-04-17",
            [(0, 1048576, 0.15 / 1e6)],
            [(0, 1048576, 0.60 / 1e6, 3.50 / 1e6)],
            1048576, 65536, "2025-02-01",
        ),
        (
            "models/gemini-2.5-flash-preview-04-17",
            [(0, 1048576, 0.15 / 1e6)],
            [(0, 1048576, 0.60 / 1e6, 3.50 / 1e6)],
            1048576, 65536, "2025-02-01",
        ),
    )
}


def calculate_costs_google_genai(response_result: dict, model: str) -> tuple[float, dict]:
    """Price a single Google GenAI response from its ``usage_metadata``.

    Prompt tokens are charged tier by tier: every tier whose start lies below
    the prompt size contributes the part of the prompt that falls inside it.
    Candidate (output) tokens are charged at one flat price, taken from the
    first candidates tier whose upper bound is not exceeded by the *prompt*
    token count.

    Args:
        response_result: Response payload containing ``"usage_metadata"`` with
            ``"prompt_token_count"`` and ``"candidates_token_count"``; the
            ``"thoughts_token_count"`` field is optional.
        model: Key into ``GOOGLE_GENAI_MODEL_SPECS``.

    Returns:
        ``(cost, usage)`` where ``usage`` is the unmodified usage metadata.

    Raises:
        KeyError: If the model or a required usage field is missing.
    """
    usage = response_result["usage_metadata"]
    prompt_token_count = usage["prompt_token_count"]
    candidates_token_count = usage["candidates_token_count"]
    # Not every response carries this field; a missing key means "no thinking"
    # instead of a KeyError.
    thoughts_token_count = usage.get("thoughts_token_count")
    if candidates_token_count is None:
        # Dump the usage for inspection and count the output as empty.
        print(usage)
        candidates_token_count = 0
    used_thinking = thoughts_token_count is not None and thoughts_token_count > 0

    spec = GOOGLE_GENAI_MODEL_SPECS[model]
    cost = 0
    for tier_start, tier_end, tier_price, *_ in spec["prompt_token"]:
        if prompt_token_count > tier_start:
            cost += (min(tier_end, prompt_token_count) - tier_start) * tier_price
    for tier in spec["candidates_token"]:
        if prompt_token_count <= tier[1]:
            # Four-element tiers hold a separate price for responses that used
            # thinking; three-element tiers have only the regular price, so
            # fall back to it instead of raising IndexError.
            # NOTE(review): `thoughts_token_count` only selects the price; the
            # thinking tokens themselves are not added to the billed amount.
            # Confirm whether `candidates_token_count` already includes them.
            if used_thinking and len(tier) > 3:
                cost += candidates_token_count * tier[3]
            else:
                cost += candidates_token_count * tier[2]
            break
    return cost, usage


# Per-model pricing and limits for Anthropic models. Each row is
#   (model, input price per 1M tokens, output price per 1M tokens,
#    context window, max output tokens, knowledge cutoff)
# and the stored prices are per single token (row price / 1e6).
ANTHROPIC_MODEL_SPECS = {
    model_name: {
        "input_tokens": input_price_per_million / 1e6,
        "output_tokens": output_price_per_million / 1e6,
        "context_window": context_window,
        "max_output_tokens": max_output_tokens,
        "knowledge_cutoff": knowledge_cutoff,
    }
    for (
        model_name,
        input_price_per_million,
        output_price_per_million,
        context_window,
        max_output_tokens,
        knowledge_cutoff,
    ) in (
        ("claude-3-5-sonnet-20240620", 3.00, 15.00, 200000, 8192, "2024-05-01"),
        ("claude-3-5-haiku-20241022", 0.80, 4.00, 200000, 8192, "2024-07-01"),
        ("claude-3-5-sonnet-20241022", 3.00, 15.00, 200000, 8192, "2024-05-01"),
        ("claude-3-7-sonnet-20250219", 3.00, 15.00, 200000, 64000, "2024-11-01"),
        ("anthropic/claude-3.7-sonnet", 3.00, 15.00, 200000, 64000, "2024-11-01"),
    )
}


def calculate_costs_anthropic(response_result: dict, model: str) -> tuple[float, dict]:
    """Price a single Anthropic response.

    The usage mapping may name its counters either ``input_tokens`` /
    ``output_tokens`` or ``prompt_tokens`` / ``completion_tokens``; both
    spellings are read (missing ones count as 0) and added together.

    Args:
        response_result: Response payload containing a ``"usage"`` mapping.
        model: Key into ``ANTHROPIC_MODEL_SPECS``.

    Returns:
        ``(cost, usage)`` where ``usage`` is the unmodified usage mapping.
    """
    usage = response_result["usage"]
    input_tokens = sum(usage.get(field, 0) for field in ("input_tokens", "prompt_tokens"))
    output_tokens = sum(usage.get(field, 0) for field in ("output_tokens", "completion_tokens"))
    input_cost = input_tokens * ANTHROPIC_MODEL_SPECS[model]["input_tokens"]
    output_cost = output_tokens * ANTHROPIC_MODEL_SPECS[model]["output_tokens"]
    return input_cost + output_cost, usage


# Per-model pricing and limits for DeepSeek models. Keys are either a plain
# model name or a `(model, provider)` tuple; `calculate_costs_deepseek` picks
# the tuple form whenever a provider is given. Each row is
#   (key, prompt price per 1M tokens, completion price per 1M tokens,
#    context window, max output tokens, knowledge cutoff)
# and the stored prices are per single token (row price / 1e6).
DEEPSEEK_MODEL_SPECS = {
    spec_key: {
        "prompt_tokens": prompt_price_per_million / 1e6,
        "completion_tokens": completion_price_per_million / 1e6,
        "context_window": context_window,
        "max_output_tokens": max_output_tokens,
        "knowledge_cutoff": knowledge_cutoff,
    }
    for (
        spec_key,
        prompt_price_per_million,
        completion_price_per_million,
        context_window,
        max_output_tokens,
        knowledge_cutoff,
    ) in (
        ("deepseek-chat", 0.27, 1.10, 64000, 8000, "2025-03-24"),
        ("deepseek-reasoner", 0.55, 2.19, 64000, 8000, "2025-01-20"),
        (("deepseek/deepseek-chat-v3-0324", "Lambda"), 0.34, 0.88, 163840, 163840, "2025-03-24"),
        (("deepseek/deepseek-chat-v3-0324", "DeepInfra"), 0.30, 0.88, 163840, 163840, "2025-03-24"),
        (("deepseek/deepseek-r1", "Lambda"), 0.54, 2.18, 163840, 163840, "2025-01-20"),
        (("deepseek/deepseek-r1", "DeepInfra"), 0.50, 2.18, 163840, 163840, "2025-01-20"),
    )
}


def calculate_costs_deepseek(response_result: dict, model: str, provider: str | None = None) -> tuple[float, dict]:
    """Price a single DeepSeek response.

    Args:
        response_result: Response payload; its ``"usage"`` entry must provide
            ``"prompt_tokens"`` and ``"completion_tokens"`` counts.
        model: Model name.
        provider: Optional provider name. When given, prices are looked up
            under the ``(model, provider)`` key of ``DEEPSEEK_MODEL_SPECS``;
            otherwise under ``model`` alone.

    Returns:
        ``(cost, usage)`` where ``usage`` is the unmodified usage mapping.

    Raises:
        KeyError: If the selected key or a required usage field is missing.
    """
    usage = response_result["usage"]
    spec_key = model if provider is None else (model, provider)
    prompt_cost = usage["prompt_tokens"] * DEEPSEEK_MODEL_SPECS[spec_key]["prompt_tokens"]
    completion_cost = usage["completion_tokens"] * DEEPSEEK_MODEL_SPECS[spec_key]["completion_tokens"]
    return prompt_cost + completion_cost, usage


def add_dict(dict1: dict, dict2: dict) -> dict:
    """Merge two mappings, combining the values of shared keys.

    Keys present in only one input are copied over. For a key present in both:
    if either value is ``None`` the other value wins; nested dicts are merged
    recursively; anything else is combined with ``+``. Neither input is
    modified.

    Args:
        dict1: Left operand.
        dict2: Right operand.

    Returns:
        A new dict covering the union of both key sets.

    Raises:
        AssertionError: If a shared key holds two non-``None`` values of
            different types.
    """
    merged = {}
    for key in dict1.keys() | dict2.keys():
        # Keys found on one side only are taken over verbatim.
        if key not in dict2:
            merged[key] = dict1[key]
            continue
        if key not in dict1:
            merged[key] = dict2[key]
            continue

        left, right = dict1[key], dict2[key]
        if left is None:
            merged[key] = right
        elif right is None:
            merged[key] = left
        else:
            assert type(left) == type(right), f"Type mismatch for key {key}: {type(left)} vs {type(right)}"
            merged[key] = add_dict(left, right) if isinstance(left, dict) else left + right
    return merged


def aggregate_usage(llm_response_dir: Path, turn1: bool) -> tuple[int, float, dict]:
    """Sum cost and token usage over the ``response_*.json`` files of a directory.

    Files are visited in sorted path order. Each response is priced by the
    calculator matching its ``"model"`` field (OpenAI, Anthropic, DeepSeek) or,
    when there is no ``"model"``, its ``"model_version"`` field (Google GenAI).

    Args:
        llm_response_dir: Directory holding the response JSON files.
        turn1: When true, stop after the first response file.

    Returns:
        ``(response_count, total_cost, total_usage)``; for a directory without
        response files this is ``(0, 0, {})``.

    Raises:
        ValueError: If a response names an unknown model or no model at all.
    """
    response_count = 0
    total_cost = 0
    total_usage = {}
    for response_file in sorted(llm_response_dir.glob("response_*.json")):
        # Context manager so the handle is closed even when parsing fails;
        # the previous `json.load(path.open())` leaked one handle per file.
        with response_file.open("r") as response_fp:
            response = json.load(response_fp)
        model = response.get("model", None)
        model_version = response.get("model_version", None)
        if model is not None:
            provider = response.get("provider", None)
            if model in OPENAI_MODEL_SPECS:
                cost, usage = calculate_costs_openai(response, model)
            elif model in ANTHROPIC_MODEL_SPECS:
                cost, usage = calculate_costs_anthropic(response, model)
            elif model in DEEPSEEK_MODEL_SPECS or (model, provider) in DEEPSEEK_MODEL_SPECS:
                cost, usage = calculate_costs_deepseek(response, model, provider)
            else:
                raise ValueError(f"Unknown model: {model}")
        elif model_version is not None:
            if model_version in GOOGLE_GENAI_MODEL_SPECS:
                cost, usage = calculate_costs_google_genai(response, model_version)
            else:
                raise ValueError(f"Unknown model: {model_version} ({llm_response_dir})")
        else:
            raise ValueError(f"Unknown model: {response}")
        response_count += 1
        total_cost += cost
        total_usage = add_dict(total_usage, usage)
        if turn1:
            break  # Only one response in turn1
    return response_count, total_cost, total_usage


def collect_results_baseline(base_dir: Path, extract_turn1: bool = False) -> dict:
    """Collect per-problem results for every experiment directory in ``base_dir``.

    For each sub-directory and each ID in ``PROBLEM_IDS`` this reads the LLM
    logs, code/LLM histories, the run log and the session file, and prices the
    LLM usage via ``aggregate_usage``. The performance starts at
    ``MINIMUM_PERFORMANCES[problem_id]`` and is replaced by the private
    result's performance only when ``private_result_<id>.json`` exists and

    * the log has more than one line and ``base_dir``'s name contains
      ``"four_hours"`` (the last code is also dropped when the log reports a
      rerun of the private evaluation), or
    * the log has at most one line and either ``extract_turn1`` is false or
      exactly one response file exists.

    Args:
        base_dir: Directory whose sub-directories are individual experiments.
        extract_turn1: Evaluate only the first LLM response of each problem.

    Returns:
        Mapping from experiment directory name to a list of per-problem result
        dicts. ``"private_result"`` is ``None`` for problems that have no
        private result file.

    Raises:
        AssertionError: If a required log/history/session artifact is missing.
    """
    results = {}
    for exp_dir in tqdm(base_dir.iterdir()):
        if not exp_dir.is_dir():
            continue
        # Bound once per experiment so it is defined even if PROBLEM_IDS is empty.
        exp_name = exp_dir.name
        experiment_result = []
        for problem_id in PROBLEM_IDS:
            llm_log_dir = exp_dir / f"llm_log_{problem_id}"
            codes_history_path = exp_dir / f"codes_history_{problem_id}.json"
            llm_history_path = exp_dir / f"llm_history_{problem_id}.json"
            log_path = exp_dir / f"log_{problem_id}.txt"
            private_result_path = exp_dir / f"private_result_{problem_id}.json"
            session_path = exp_dir / f"session_{problem_id}.json"
            assert llm_log_dir.is_dir(), f"Missing `llm_log` directory for {problem_id} ({exp_name})"
            assert codes_history_path.is_file(), f"Missing `codes_history` file for {problem_id} ({exp_name})"
            assert llm_history_path.is_file(), f"Missing `llm_history` file for {problem_id} ({exp_name})"
            assert log_path.is_file(), f"Missing `log` file for {problem_id} ({exp_name})"
            # The private result file is optional and therefore not asserted.
            assert session_path.is_file(), f"Missing `session` file for {problem_id} ({exp_name})"
            llm_log_files = list(llm_log_dir.glob("response_*.json"))
            if not llm_log_files and "deepseek-v3" not in exp_name:
                print(f"Empty response file for {problem_id} ({exp_name})")
            response_count, total_cost, total_usage = aggregate_usage(llm_log_dir, turn1=extract_turn1)
            # `read_text` closes each file; `json.load(path.open())` leaked the handle.
            codes_history = json.loads(codes_history_path.read_text())
            llm_history = json.loads(llm_history_path.read_text())
            log_text = log_path.read_text()
            session_info = json.loads(session_path.read_text())
            performance = MINIMUM_PERFORMANCES[problem_id]
            # Reset per problem: otherwise a problem without a private result
            # would raise NameError or silently reuse the previous problem's data.
            private_result = None
            if private_result_path.is_file():
                private_result = json.loads(private_result_path.read_text())
                if len(log_text.splitlines()) > 1:
                    if "four_hours" in base_dir.name:
                        performance = private_result["performance"]
                        if "Private evaluation reran." in log_text:
                            codes_history = codes_history[:-1]  # Remove the last code because of the timeout
                    else:
                        print(f"More than 1 line in log file for {problem_id} ({exp_name})")  # Fill with minimum performance
                elif not extract_turn1 or len(llm_log_files) == 1:
                    performance = private_result["performance"]  # Not turn1 setting or AC with first response
            result_dict = {
                "problem_id": problem_id,
                "performance": performance,
                "private_result": private_result["private_result"] if private_result is not None else None,
                "response_count": response_count,
                "total_cost": total_cost,
                "total_usage": total_usage,
                "codes_history": codes_history,
                "llm_history": llm_history,
                "session_resource_usage": session_info["current_resource_usage"],
                "session_action_log": session_info["action_log"],
                "experiment_started_at": datetime.datetime.fromtimestamp(session_info["session_started_at"]),
            }
            experiment_result.append(result_dict)
        results[exp_name] = experiment_result
    return results


def collect_results_openhands(base_dir: Path) -> tuple[str, list[dict]]:
    """Collect per-problem results of an OpenHands run.

    ``base_dir`` must be named ``experiments_<model_name>`` and contain one
    ``<model_name>-<problem_id>_cpp20`` directory per ID in ``PROBLEM_IDS``.
    The cost of a problem is the sum of the ``"cost"`` fields of all JSON files
    in its ``llm_completions`` directory. A message is printed when the
    trajectory is empty or its last step is not a ``finish`` tool call.

    Args:
        base_dir: Root directory of the run.

    Returns:
        ``(f"{model_name}_cpp20", results)`` with one result dict per problem.

    Raises:
        ValueError: If ``base_dir``'s name does not split into exactly two
            parts on ``"_"``.
        AssertionError: If the name prefix is wrong or a required directory or
            file is missing.
    """
    experiment_str, model_name = base_dir.name.split("_")
    assert experiment_str == "experiments", f"Invalid experiment string: {base_dir.name}"
    experiment_result = []
    for problem_id in PROBLEM_IDS:
        exp_dir = base_dir / f"{model_name}-{problem_id}_cpp20"
        assert exp_dir.is_dir(), f"Missing directory for {problem_id} ({model_name})"
        assert (exp_dir / "file_store").is_dir() and (exp_dir / "llm_completions").is_dir(), f"Missing `file_store` or `llm_completions` directory for {problem_id} ({model_name})"
        llm_completions = list((exp_dir / "llm_completions").glob("*.json"))
        response_count = len(llm_completions)
        # `read_text` closes every file; the previous `json.load(open(...))`
        # leaked one handle per completion.
        total_cost = 0
        for llm_completion in llm_completions:
            total_cost += json.loads(llm_completion.read_text())["cost"]
        submission_file = exp_dir / "workspace" / "solution.cpp"
        assert submission_file.is_file(), f"Missing `solution.cpp` file for {problem_id} ({model_name})"
        codes_history = json.loads((exp_dir / f"codes_history_{problem_id}.json").read_text())
        assert (exp_dir / f"log_{problem_id}.txt").is_file(), f"Missing `log` file for {problem_id} ({model_name})"
        private_result = json.loads((exp_dir / f"private_result_{problem_id}.json").read_text())
        session_info = json.loads((exp_dir / f"session_{problem_id}.json").read_text())
        trajectory = json.loads((exp_dir / "trajectory.json").read_text())
        # An empty trajectory has no final step, so it is reported as invalid
        # instead of failing with IndexError on `trajectory[-1]`.
        if (
            not trajectory
            or "tool_call_metadata" not in trajectory[-1]
            or "function_name" not in trajectory[-1]["tool_call_metadata"]
            or trajectory[-1]["tool_call_metadata"]["function_name"] != "finish"
        ):
            print(f"Invalid trajectory for {problem_id} ({model_name})")
        result_dict = {
            "problem_id": problem_id,
            "performance": private_result["performance"],
            "private_result": private_result["private_result"],
            "response_count": response_count,
            "total_cost": total_cost,
            "codes_history": codes_history,
            "session_resource_usage": session_info["current_resource_usage"],
            "session_action_log": session_info["action_log"],
            "experiment_started_at": datetime.datetime.fromtimestamp(session_info["session_started_at"]),
            "trajectory": trajectory,
        }
        experiment_result.append(result_dict)
    return f"{model_name}_cpp20", experiment_result


def collect_results_fishylene(base_dir: Path, pricing_model: str = "gemini-2.5-pro-preview-03-25") -> dict:
    """Collect per-problem results of a run laid out as ``<base_dir>/<problem_id>/``.

    ``base_dir`` must be named ``<condition>_<model_name>``. For every ID in
    ``PROBLEM_IDS_LITE`` the problem directory has to contain ``history``,
    ``states`` and ``usage`` directories plus ``best.cpp``, ``log.txt`` and
    ``private_result_<problem_id>.json``. Each JSON file under ``usage`` is
    treated as one response's usage metadata and priced with
    ``calculate_costs_google_genai``.

    Args:
        base_dir: Root directory of the run.
        pricing_model: Key of ``GOOGLE_GENAI_MODEL_SPECS`` used to price the
            usage files. Defaults to the model that was previously hard-coded.

    Returns:
        ``{f"{model_name}_cpp20": results}`` with one result dict per problem.

    Raises:
        ValueError: If ``base_dir``'s name does not split into exactly two
            parts on ``"_"``.
        AssertionError: If a required directory or file is missing.
    """
    # Only the model name is used; the condition part is intentionally ignored.
    _condition_str, model_name = base_dir.name.split("_")
    experiment_result = []
    for problem_id in PROBLEM_IDS_LITE:
        exp_dir = base_dir / problem_id
        assert exp_dir.is_dir(), f"Missing directory for {problem_id} ({model_name})"
        assert (exp_dir / "history").is_dir(), f"Missing `history` directory for {problem_id} ({model_name})"
        assert (exp_dir / "states").is_dir(), f"Missing `states` directory for {problem_id} ({model_name})"
        assert (exp_dir / "usage").is_dir(), f"Missing `usage` directory for {problem_id} ({model_name})"
        assert (exp_dir / "best.cpp").is_file(), f"Missing `best.cpp` file for {problem_id} ({model_name})"
        assert (exp_dir / "log.txt").is_file(), f"Missing `log` file for {problem_id} ({model_name})"
        assert (exp_dir / f"private_result_{problem_id}.json").is_file(), f"Missing `private_result` file for {problem_id} ({model_name})"
        # `read_text` closes each file; `json.load(path.open())` leaked the handle.
        private_result = json.loads((exp_dir / f"private_result_{problem_id}.json").read_text())
        usages = list((exp_dir / "usage").glob("*.json"))
        total_usage, total_cost = {}, 0
        for usage_file in usages:
            usage_metadata = json.loads(usage_file.read_text())
            cost, usage = calculate_costs_google_genai({"usage_metadata": usage_metadata}, pricing_model)
            total_usage = add_dict(total_usage, usage)
            total_cost += cost
        result_dict = {
            "problem_id": problem_id,
            "performance": private_result["performance"],
            "private_result": private_result["private_result"],
            "response_count": len(usages),
            "total_cost": total_cost,
            "total_usage": total_usage,
        }
        experiment_result.append(result_dict)
    return {f"{model_name}_cpp20": experiment_result}


def extract_lite_version_result(results: Result) -> Result:
    """Reduce a full result to its "lite" subset of cases.

    Results with fewer than 1000 cases are returned unchanged (the same
    object). Otherwise a new ``Result`` is built that keeps the first
    ``len(case_results) // 10`` cases and copies ``allow_score_non_ac`` and
    ``resource_usage`` from the input.
    """
    case_count = len(results.case_results)
    if case_count >= 1000:
        return Result(
            allow_score_non_ac=results.allow_score_non_ac,
            resource_usage=results.resource_usage,
            case_results=results.case_results[:case_count // 10],
        )
    # Short contests
    return results


def calculate_lite_performance(problem_id: str, full_result: dict) -> int:
    """Compute the performance a full result would get on the lite case set.

    The raw result dict is validated into a ``Result``, trimmed with
    ``extract_lite_version_result``, ranked against the problem's standings,
    and the resulting performance rank is mapped to a performance value.

    Args:
        problem_id: Problem to load via ``load_problem``.
        full_result: Raw result data accepted by ``Result.model_validate``.

    Returns:
        The value of ``rank_performance_map.get_performance`` for the new
        performance rank.
    """
    # Only the standings and the rank->performance map of the loaded problem are used.
    # NOTE(review): the meaning of the second positional argument (`True`) is
    # not visible here - confirm against `load_problem`.
    _, _, standings, rank_to_performance, _ = load_problem(problem_id, True)
    validated_result = Result.model_validate(full_result)
    lite_result = extract_lite_version_result(validated_result)
    _, performance_rank, _ = standings.get_new_rank(lite_result)
    return rank_to_performance.get_performance(performance_rank)
