import json
import os
import re
from pathlib import Path

import pandas as pd

from utils import file_utils


def parse_timings_for_experiment(exp_dir: Path) -> dict[str, float]:
    """
    Sum the recorded task timings found in an experiment directory.

    Every 'timings.json' returned by ``file_utils.find_files`` for *exp_dir*
    (called with ``upwards=False, downwards=True``) is loaded. For each
    top-level key of the form "task_<id>:<subkey>" whose value is a list,
    the list is summed and added to the running total for <subkey>, so
    totals accumulate across task ids and across files.

    Files that cannot be read or parsed, files whose top-level JSON value is
    not an object, and lists that cannot be summed are reported with
    ``print`` and skipped rather than aborting the aggregation.

    Args:
        exp_dir: Experiment directory handed to ``file_utils.find_files``.

    Returns:
        A dictionary mapping each <subkey> to its summed time, with keys in
        alphabetical order.
    """
    # Compiled once instead of being looked up for every key of every file.
    # Example: "task_263:AGENT:call_llm" --> group key is "AGENT:call_llm".
    task_key_pattern = re.compile(r"^task_\d+:(.+)$")

    timing_files = file_utils.find_files(exp_dir, "timings.json", upwards=False, downwards=True)
    results: dict[str, float] = {}

    for timing_file in timing_files:
        try:
            # Explicit encoding so decoding does not depend on the platform locale.
            with open(timing_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {timing_file}: {e}")
            continue

        # A valid JSON file may still hold a list/number/string at top level;
        # without this guard `data.items()` would raise AttributeError.
        if not isinstance(data, dict):
            print(
                f"Skipping {timing_file}: expected a JSON object at top level, "
                f"got {type(data).__name__}"
            )
            continue

        for key, values in data.items():
            # Only list-valued entries carry timing samples.
            if not isinstance(values, list):
                continue

            match = task_key_pattern.match(key)
            if match is None:
                continue

            try:
                total_time = sum(values)
            except TypeError as e:
                # Non-numeric entries (e.g. null or strings) cannot be summed;
                # skip this key instead of losing every other timing.
                print(f"Skipping non-numeric timings for {key!r} in {timing_file}: {e}")
                continue

            subkey = match.group(1)
            results[subkey] = results.get(subkey, 0.0) + total_time

    # Return a new dictionary sorted by key.
    return dict(sorted(results.items()))


def parse_all_experiments(runs_dir: str, out_dir: str) -> None:
    """
    Aggregate the timings of every experiment under *runs_dir* into one CSV.

    Experiment directories are those returned by
    ``file_utils.get_dirs_with_file(runs_dir, "args.json")``. Each one is
    parsed with ``parse_timings_for_experiment`` and becomes one row of a
    pandas DataFrame; timing keys missing for an experiment are filled with 0.
    The result is written to "aggregated_timings.csv" inside *out_dir*, which
    is created if needed. If no experiment directory is found, a message is
    printed and nothing is written.

    Args:
        runs_dir: Root directory searched for experiment directories.
        out_dir: Directory that receives "aggregated_timings.csv".
    """
    exp_dirs = file_utils.get_dirs_with_file(runs_dir, "args.json")
    if not exp_dirs:
        print(f"No experiment directories (with args.json) found in {runs_dir}")
        return

    consolidated = []
    for exp in exp_dirs:
        timings = parse_timings_for_experiment(exp)
        # Build a fresh dict per experiment: appending one shared dict would
        # make every row alias the same object and merge all timings together.
        # The experiment directory is the row identifier; it is placed last so
        # a timing subkey that happens to be called "experiment" cannot
        # overwrite it.
        record = {**timings, "experiment": str(exp)}
        consolidated.append(record)

    # Create a DataFrame from the consolidated data; keys absent from an
    # experiment show up as NaN and are replaced with 0.
    df = pd.DataFrame(consolidated)
    df = df.fillna(0)

    # Keep 'experiment' first, then the timing columns in alphabetical order.
    timing_cols = sorted(col for col in df.columns if col != "experiment")
    df = df[["experiment"] + timing_cols]

    # Create the output directory if it does not exist.
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    csv_path = out_path / "aggregated_timings.csv"
    df.to_csv(csv_path, index=False)

    print(f"Aggregated timings written to {csv_path}")


if __name__ == "__main__":
    # Script entry point: aggregate the runs under the hard-coded debug
    # directory and write the CSV into the hard-coded output directory.
    runs_dir, out_dir = "results/debug", "results/aggregated"
    parse_all_experiments(runs_dir=runs_dir, out_dir=out_dir)
