import argparse
import json
import os
import re
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
from experiments_utils.cleaning_utils import delete_htmls
from experiments_utils.constants import (
    ALL_EXPERIMENTS_DIRNAME,
    DATA_PER_CONFIG_PATH,
    RUNS_DIRNAME,
    SCORES_SUMMARY_PATH,
    SINGLE_RUN_EXECUTION_DATA_FILENAME,
    SINGLE_RUN_SCORES_PER_ROUND_FILENAME,
    SINGLE_RUN_SUMMARY_FILENAME,
)
from experiments_utils.scripts.build_unfinished_task_list import consolidate_unfinished_tasks
from utils.file_utils import (
    add_annotation_to_path,
    copy_move,
    find_files,
    get_args,
    get_attribute_from_dict,
    get_domain_from_args_file,
    get_task_id_from_file_path,
    get_task_ids_from_csv,
    is_bottom_level_dir,
    remove_empty_dirs,
)
from utils.trace_utils import annotate_html_vwa, annotate_vwa_trace


# ==============================================================================
# LINK Helper functions
# ==============================================================================
def get_score_from_csv(csv_path: str, task_id: int):
    """Look up the score recorded for `task_id` in a csv file.

    Args:
        csv_path (str): Path to a csv file with `task_id` and `score` columns.
        task_id (int): Task to look up; it is cast to `int` before comparing.

    Returns:
        The `score` value of the first row whose `task_id` matches, or None when
        `csv_path` is empty, the file does not exist, or no row matches.
    """
    if not csv_path or not os.path.exists(csv_path):
        return None

    table = pd.read_csv(csv_path)
    wanted_id = int(task_id)

    # Compare on integer ids so that ids stored as floats/strings still match.
    matching_scores = table.loc[table["task_id"].astype(int) == wanted_id, "score"]
    return None if matching_scores.empty else matching_scores.iloc[0]


def get_critique_round_scores(
    json_file_path: str,
    round_idx: int = 1,
) -> dict[int, dict[str, float]]:
    """Collect, per task, the first score recorded for round `round_idx` of the critic-executor loop.

    Args:
        json_file_path (str): Path to a JSON file shaped as
            {task_id: {"scores": [{state_idx, score, round}, ...]}}.
        round_idx (int, optional): Round number to select. Defaults to 1.

    Returns:
        dict: {task_id: {"score": score}}. Tasks without a "scores" entry, or
        without any entry for `round_idx`, are left out.

    # NOTE: only the first entry matching `round_idx` is used for each task, and
    # the keys are the task ids exactly as stored in the JSON file (i.e. strings).
    """
    with open(json_file_path, "r") as handle:
        per_task_data = json.load(handle)

    no_match = object()
    round_scores = {}
    for task_id, task_data in per_task_data.items():
        if "scores" not in task_data:
            continue

        # Stop at the first entry of the requested round, ignoring later ones.
        entry = next(
            (candidate for candidate in task_data["scores"] if candidate["round"] == round_idx),
            no_match,
        )
        if entry is not no_match:
            round_scores[task_id] = {"score": entry["score"]}

    return round_scores


def update_all_scores_df(
    all_scores_df: pd.DataFrame,
    agent_results_df: pd.DataFrame,
    agent_name: str,
    domain: str,
    overwrite=True,
):
    """Update `all_scores_df` with the scores of one agent configuration on one domain.

    `all_scores_df` holds one row per (domain, task_id) and one column per agent
    configuration. Duplicated task ids in `agent_results_df` are reduced to their
    maximum score.

    Args:
        all_scores_df (pd.DataFrame): Summary table to update. It may be modified in
            place, so callers should always use the returned DataFrame.
        agent_results_df (pd.DataFrame): Results with `task_id` and `score` columns.
        agent_name (str): Name of the column that receives the scores.
        domain (str): Domain the results belong to.
        overwrite (bool, optional): If True, rows of `domain` whose task id is absent
            from `agent_results_df` get NaN in the `agent_name` column.

    Returns:
        pd.DataFrame: The updated summary table.
    """

    # Ensure basic columns exist. Placeholders use an explicit object dtype because
    # numeric scores are later written into these columns.
    for col in ["domain", "task_id"]:
        if col not in all_scores_df.columns:
            all_scores_df[col] = pd.Series("", index=all_scores_df.index, dtype=object)

    # If the DataFrame is completely empty, create the base columns.
    if all_scores_df.empty:
        all_scores_df = pd.DataFrame(columns=["domain", "task_id", agent_name])

    # Create a column for the agent configuration if it doesn't exist.
    if agent_name not in all_scores_df.columns:
        all_scores_df[agent_name] = pd.Series("", index=all_scores_df.index, dtype=object)

    # One score per task: the maximum among duplicates.
    aggregated_scores = agent_results_df.groupby("task_id")["score"].max()
    scored_task_ids = set(aggregated_scores.index)

    # Split the scored tasks into those already listed for this domain and new ones.
    domain_mask = all_scores_df["domain"] == domain
    existing_task_ids = set(all_scores_df.loc[domain_mask, "task_id"])
    new_task_ids = scored_task_ids - existing_task_ids
    common_task_ids = scored_task_ids & existing_task_ids

    # Update the agent column of the rows that already exist.
    if common_task_ids:
        common_mask = domain_mask & all_scores_df["task_id"].isin(common_task_ids)
        all_scores_df.loc[common_mask, agent_name] = all_scores_df.loc[common_mask, "task_id"].map(aggregated_scores)

    # Append one row per new task id.
    if new_task_ids:
        new_rows = pd.DataFrame(
            {
                "task_id": aggregated_scores.index,
                "domain": domain,
                agent_name: aggregated_scores.values,
            }
        )
        # Copy so that the column assignments below do not write into a slice.
        new_rows = new_rows[new_rows["task_id"].isin(new_task_ids)].copy()

        # Columns of other agent configurations are left blank in the new rows.
        for col in all_scores_df.columns:
            if col not in new_rows.columns:
                new_rows[col] = ""

        # Ensure the column order matches
        new_rows = new_rows[all_scores_df.columns]
        all_scores_df = pd.concat([all_scores_df, new_rows], ignore_index=True)

    # Remove old scores for `agent_name` from `all_scores_df`, if they exist.
    if overwrite:
        # Recompute the domain mask: rows may have been appended above, so the mask
        # computed earlier no longer lines up with the DataFrame's index.
        domain_mask = all_scores_df["domain"] == domain
        stale_mask = domain_mask & ~all_scores_df["task_id"].isin(aggregated_scores.index)
        all_scores_df.loc[stale_mask, agent_name] = np.nan

    return all_scores_df


# ===============================================================================
# LINK Directory organization
# ===============================================================================


def mirror_folder_structure(
    single_run_dir: str | Path, save_dir: str | Path | None = None, key_file: str = "args.json"
) -> dict[str, str]:
    """Create a folder structure mirroring the folder structure of `run` directories identified by `key_file`.

    Args:
        single_run_dir (str): the path to a `run` directory with results and trace files.
        save_dir (str, optional): the path to the directory to create the mirrored folder structure.
                                  If None, the structure will be created in the parent directory of `single_run_dir`.
        key_file (str, optional): the file to use to identify the subfolders. Defaults to "args.json".

    Returns:
        dict[str, str]: A dictionary mapping folders created in `save_dir` to the subfolders they were mirrored from.

    Example:
    in:
       shopping  <--- `save_dir`
         results
           shopping_2025-02-02_12-55-23  <--- `single_run_dir`
             args.json
             htmls
             api_data
               critique
    out:
       shopping
         htmls   <--- mirrored from `shopping_2025-02-02_12-55-23/htmls`
         api_data
           critique
             conversation
         results

    """
    if not save_dir:
        save_dir = os.path.dirname(single_run_dir)

    src_to_tgt_dirs = {}
    for root, dirs, files in os.walk(single_run_dir):
        if key_file in files:
            # Recreate the folder structure on the root directory
            start_path = root  # Path to start recreating from (exclusive)

            for root_dir, dirs, files in os.walk(start_path):
                if root_dir == root:
                    continue

                # Calculate the target directory path
                target_dir = os.path.join(save_dir, os.path.relpath(root_dir, start_path))

                # If is bottom-level directory, append to list
                if is_bottom_level_dir(root_dir):
                    os.makedirs(target_dir, exist_ok=True)  # Create target directory if it doesn't exist
                    src_to_tgt_dirs[root_dir] = target_dir
            # Exit after processing the first folder with 'args.json'
            return src_to_tgt_dirs
    return src_to_tgt_dirs


def create_organize_folder_structure(
    all_experiments_folder,
    key_file="args.json",
    runs_dirname=RUNS_DIRNAME,
    exclude_dirs=None,
):
    """Create a common dir structure for all experiments and consolidate trace files per domain.

    The expected layout is `all_experiments_folder/<model>/<config>/...`. For every run
    (a directory holding `key_file`) found below a config directory:
      1. the run directory is moved to `<config>/<domain>/<runs_dirname>/` if it is not there yet;
      2. the run's sub-folders are mirrored into `<config>/<domain>/`;
      3. the trace files of the run are annotated and moved into the mirrored folders.
    Finally, `remove_empty_dirs` is applied to `all_experiments_folder`.

    Args:
        all_experiments_folder (str): the path to the dir containing all experiments.
        key_file (str, optional): the file to identify directories with experiment data for one run.
        runs_dirname (str, optional): the name of the dir containing the runs for an experiment.
        exclude_dirs (list, optional): substrings; a config dir whose resolved path contains
            any of them is skipped. Defaults to None (nothing is excluded).
    """
    # `None` instead of a mutable `[]` default, which would be shared across calls.
    exclude_dirs = list(exclude_dirs) if exclude_dirs else []

    # Iterate over each model folder
    for model_dir in Path(all_experiments_folder).glob("*"):  # gpt4o, gemini-1.5-flash, ...
        if not model_dir.is_dir():
            continue

        # Iterate over each Agent config folder
        for config_dir in model_dir.glob("*"):  # mem=text-t=1-u-a_critic=2p, etc
            if not config_dir.is_dir():
                continue

            resolved_config_dir = str(config_dir.resolve())
            if any(excluded in resolved_config_dir for excluded in exclude_dirs):
                continue

            # Get `args.json` files from all runs for this experiment
            args_files = find_files(config_dir, key_file, upwards=False, downwards=True)
            if len(args_files) == 0:
                continue

            # Resolve the domain of every run before any directory is moved.
            domains = [get_domain_from_args_file(args_file) for args_file in args_files]

            # Create/Fill domain folders and 'runs' folders with each run of this particular experiment
            for domain, arg_file in zip(domains, args_files):
                domain_dir = config_dir / domain
                runs = domain_dir / runs_dirname
                # Creates `domain_dir` as well, since missing parents are created.
                os.makedirs(runs, exist_ok=True)

                # If dir with run data isn't in `domain_dir/runs_dirname`, move it there
                single_run_dir = Path(arg_file).parent
                if single_run_dir not in runs.iterdir():
                    single_run_dir = copy_move(
                        single_run_dir,
                        runs / single_run_dir.name,
                        mode="move",
                        merge_dirs=False,
                        copy_move_only_new=False,
                        overwrite_file=False,
                    )

                # Mirror folder structure from a `run` dir
                src_to_tgt_dirs = mirror_folder_structure(
                    single_run_dir=single_run_dir,
                    save_dir=domain_dir,
                    key_file=key_file,
                )
                if not src_to_tgt_dirs:
                    continue

                # Move trace files from the `run` dirs to the dirs in the parent directory
                # Example: move all 'htmls' scattered through `runs` to `domain_dir/htmls`
                move_trace_files(
                    src_to_tgt_dirs.keys(),
                    src_to_tgt_dirs.values(),
                    annotate_traces=True,
                    domain=domain,
                )
    remove_empty_dirs(all_experiments_folder)


def move_trace_files(
    source_dirs,
    target_dirs,
    annotate_traces=True,
    single_run_summary_filename=SINGLE_RUN_SUMMARY_FILENAME,
    domain="",
    parallel=True,
):
    """Move the trace files of each source directory into its paired target directory.

    Only regular files for which `get_task_id_from_file_path` yields a task id are
    handled. All files are listed first and moved afterwards, either through a
    thread pool (`parallel=True`) or one by one. Errors raised for a file are
    printed and do not stop the processing of the remaining files.

    Args:
        source_dirs: Iterable of directories holding trace files.
        target_dirs: Iterable of destination directories, paired with `source_dirs`.
        annotate_traces (bool, optional): If True, each trace is annotated with its score,
            source directory and domain before being moved. Traces without a score go to
            a destination path annotated with "failed/".
        single_run_summary_filename (str, optional): Name of the summary csv, searched
            upwards from each source directory, from which the scores are read.
        domain (str, optional): Domain written in the annotation.
        parallel (bool, optional): Whether to process the files with a thread pool.
    """
    ann_template = f"SCORE:{{score:.0f}}\nSOURCE:{{source}}\nDOMAIN:{domain}"
    pin_annotations = [True, False, False]

    def process_trace(trace_file, tgt_dir, src_dir, summary_csv):
        """Annotate (optionally) and move a **single** trace file."""
        task_id = get_task_id_from_file_path(trace_file)
        destination = Path(tgt_dir) / trace_file.name

        if annotate_traces:
            score = get_score_from_csv(summary_csv, int(task_id)) if summary_csv else None
            if score is None:
                # No score available: send the trace to the "failed/" location.
                destination = Path(add_annotation_to_path(destination, "failed/", add_to_end=False))
                score = np.nan
            annotate_vwa_trace(
                trace_path=str(trace_file),
                annotations=ann_template.format(score=score, source=src_dir).split("\n"),
                pin_annotations=pin_annotations,
                overwrite=True,
                write_to_file=True,
            )

        os.makedirs(destination.parent, exist_ok=True)
        copy_move(
            trace_file,
            destination,
            mode="move",
            merge_dirs=True,
            overwrite_file=False,
            copy_move_only_new=False,
        )

    # Collect every (file, target, source, summary csv) job before moving anything.
    pending = []
    for src_dir, tgt_dir in zip(source_dirs, target_dirs):
        summary_csv = None
        if annotate_traces:
            found = find_files(src_dir, single_run_summary_filename, upwards=True, downwards=False)
            if found:
                summary_csv = found[0]
            else:
                print(f"No {single_run_summary_filename} found for {src_dir}. Marking all traces as failed.")

        pending.extend(
            (entry, tgt_dir, src_dir, summary_csv)
            for entry in Path(src_dir).iterdir()
            if entry.is_file() and get_task_id_from_file_path(entry)
        )

    if not parallel:
        for job in pending:
            try:
                process_trace(*job)
            except Exception as e:
                print(f"Error processing file: {e}")
        return

    with ThreadPoolExecutor() as pool:
        submitted = [pool.submit(process_trace, *job) for job in pending]
        for finished in as_completed(submitted):
            try:
                finished.result()
            except Exception as e:
                print(f"Error processing file: {e}")


# ===============================================================================
# LINK Parsing functions for data of single experiment runs
# ===============================================================================


def parse_scores_per_round(
    scores_per_round_json_path: str,
    single_run_summary_filename: str = SINGLE_RUN_SUMMARY_FILENAME,
    round_idx=0,
) -> pd.DataFrame | None:
    """Build the no-critique scores DataFrame of a run from its scores-per-round JSON file.

    The scores of round `round_idx` are read from the JSON file and then made
    compatible with the run's summary csv through `compatibilize_crit_nocrit_scores`.

    Args:
        scores_per_round_json_path (str): Path to the scores-per-round JSON file of a run.
        single_run_summary_filename (str, optional): Name of the summary csv expected
            in the same directory as the JSON file.
        round_idx (int, optional): Round whose scores are extracted. Defaults to 0.

    Returns:
        pd.DataFrame | None: The no-critique scores, or None when the run has no args,
        no `agents_configs`, no agent config containing "critique_agent", or no summary csv.
    """
    run_dir = os.path.dirname(scores_per_round_json_path)
    summary_csv = Path(run_dir) / single_run_summary_filename
    run_args = get_args(run_dir)

    # Skip runs that cannot provide critique data.
    if not run_args:
        return None
    agents_configs = run_args.get("agents_configs", None)
    if not agents_configs:
        return None
    if not any("critique_agent" in agent for agent in agents_configs):
        return None
    if not os.path.exists(summary_csv):
        return None

    # One row per task, with the task id moved from the index into a `task_id` column.
    round_scores = get_critique_round_scores(scores_per_round_json_path, round_idx=round_idx)
    no_critique_df = (
        pd.DataFrame.from_dict(round_scores, orient="index").reset_index().rename(columns={"index": "task_id"})
    )

    # Align the no-critique scores with the tasks scored in the summary csv.
    return compatibilize_crit_nocrit_scores(
        crit_scores_df=pd.read_csv(summary_csv),
        no_crit_scores_df=no_critique_df,
    )


def parse_summary_data_file(summary_data_csv_path: str) -> pd.DataFrame | None:
    """Parse the summary data file.

    Args:
        summary_data_csv_path (str): the path to the summary data file.

    Returns:
        pd.DataFrame | None: the parsed summary data.
    """
    if not os.path.exists(summary_data_csv_path):
        return None

    df = pd.read_csv(summary_data_csv_path)
    df["task_id"] = df["task_id"].astype(int)

    return df


def parse_execution_data_file(
    execution_data_csv_path: str, single_run_summary_filename: str = SINGLE_RUN_SUMMARY_FILENAME
) -> pd.DataFrame | None:
    """Load the execution data csv of a run and enrich it with run-level information.

    Added columns:
      - `source`: directory containing the execution data file;
      - `captioning_model_device`, `model_name`, `provider`, `agent`: read from the
        run's args (only when args are found);
      - `n_failed_executions`: `n_tasks` minus the number of task ids found in the
        run's summary csv (only when task ids are found).

    Args:
        execution_data_csv_path (str): Path to the execution data csv file.
        single_run_summary_filename (str, optional): Name of the run's summary csv,
            expected in the same directory.

    Returns:
        pd.DataFrame | None: The enriched execution data, or None if the file does not exist.
    """
    if not os.path.exists(execution_data_csv_path):
        return None

    run_dir = os.path.dirname(execution_data_csv_path)
    execution_df = pd.read_csv(execution_data_csv_path)
    execution_df["source"] = run_dir

    run_args = get_args(run_dir)
    if run_args:
        # (column to create, attribute looked up in the run's args)
        # NOTE(review): `agent` is filled from "model_name" - confirm this is intended.
        column_to_attribute = (
            ("captioning_model_device", "captioning_model_device"),
            ("model_name", "model_name"),
            ("provider", "provider"),
            ("agent", "model_name"),
        )
        for column, attribute in column_to_attribute:
            execution_df[column] = get_attribute_from_dict(attribute, run_args)

    executed_task_ids = get_task_ids_from_csv(Path(run_dir) / single_run_summary_filename)
    if executed_task_ids:
        execution_df["n_failed_executions"] = execution_df["n_tasks"] - len(executed_task_ids)

    return execution_df


def parse_timings(timings_json_file: str) -> pd.DataFrame | None:
    timing_file = Path(timings_json_file)
    if not timing_file.is_file():
        return None

    with open(timing_file, "r") as f:
        timings_data = json.load(f)

    # Dictionary to accumulate timings per task id.
    # Key: task_id (as string), Value: dict with {metric: total_time}
    results = {}

    for key, values in timings_data.items():
        if not isinstance(values, list):
            continue
        # Use regex to extract task_id and metric.
        # For example, a key like "shopping-task_263:AGENT:call_llm" will capture:
        #    group(1) -> "263" (the task id)
        #    group(2) -> "AGENT:call_llm" (the metric)
        match = re.match(r".*-(\d+):(.+)$", key)
        if match:
            task_id = match.group(1)
            metric = match.group(2).strip()
            total_time = sum(values)
            if task_id not in results:
                results[task_id] = {}
            # If the same metric appears more than once for a task, sum the times.
            results[task_id][metric] = results[task_id].get(metric, 0.0) + total_time

    # If no valid timings were found, return None.
    if not results:
        return None

    # Build a DataFrame with one row per task.
    rows = []
    for task_id, metrics in results.items():
        row = {"task_id": int(task_id)}
        row.update(metrics)
        rows.append(row)

    # Sort dataframe columns
    sorted_columns = sorted(rows[0].keys())
    return pd.DataFrame(rows)[sorted_columns]


# ===============================================================================
# LINK Consolidation of data given runs_dir with all runs
# ===============================================================================
def consolidate_unfinished_in_config_dir(runs: str | Path, save_dir: str | Path, save_to_config_dir: bool = True):
    """Consolidate the unfinished tasks found in `runs` through `consolidate_unfinished_tasks`.

    Args:
        runs (str | Path): Path to the directory containing experiment runs.
        save_dir (str | Path): Directory passed to `consolidate_unfinished_tasks` as destination.
        save_to_config_dir (bool, optional): If True and `save_dir` has an entry whose path
            contains `RUNS_DIRNAME` (i.e. it looks like a domain folder), the parent of
            `save_dir` is used as destination instead. Defaults to True.
    """
    if save_to_config_dir:
        save_dir = Path(save_dir)
        entry_paths = [str(entry) for entry in save_dir.iterdir()]

        # A folder holding a `runs` entry is a domain folder: save one level above it.
        if any(RUNS_DIRNAME in entry_path for entry_path in entry_paths):
            save_dir = save_dir.parent

    consolidate_unfinished_tasks(runs, save_dir)


def consolidate_runs_data(
    runs: str | Path,
    filename_pattern: str,
    parse_data_func: Callable,
    save_dir: str = "",
    out_file_template: str = "consolidated_{out_file_no_ext}.csv",
):
    """Parse the `filename_pattern` file of every run in `runs` and save them as a single csv.

    Args:
        runs (str | Path): Directory containing the runs of ONE experiment configuration.
        filename_pattern (str): Name of the per-run data file to look for below `runs`.
        parse_data_func (Callable): Called with the path of each data file; results that
            are not a `pd.DataFrame` (e.g. None) are skipped.
        save_dir (str, optional): Directory where the csv is written. Defaults to the
            parent directory of `runs`.
        out_file_template (str, optional): Template of the output file name;
            `{out_file_no_ext}` is replaced by the stem of `filename_pattern`.

    Raises:
        NotImplementedError: If the common path of the data files found is not `runs`.
    """
    # Get all files to consolidate in the runs with the given data_filename
    data_files = find_files(runs, filename_pattern, upwards=False, downwards=True)

    if not data_files:
        print(f"No `{filename_pattern}` files found in `{runs}`. Skipping consolidation.")
        return

    # Enforce that `runs` contains data for only one experiment configuration.
    common_path = os.path.commonpath(data_files) if len(data_files) > 1 else runs
    if Path(common_path) != Path(runs):
        raise NotImplementedError(
            f"The given folder {runs} seem to contain runs from multiple experiments. "
            "In the future, this function will do a best effort consolidation of data from these experiments and save on the corresponding parent folders or 'save_dir/parent_dir_basename'. "
            "However, we advise to change your code to apply this function atomically, that is, apply it to the runs of ONE experiment at a time."
        )

    # Without an explicit `save_dir`, save next to the `runs` directory. (Taking the
    # dirname of the empty `save_dir` would silently write to the working directory.)
    if not save_dir:
        save_dir = Path(runs).parent

    parsed_frames = []
    for data_file in data_files:
        # Enforce folder runs to contain args.json; else, run is skipped.
        # Obs.: get_args returns empty dict if no args.json is found.
        if not get_args(os.path.dirname(data_file)):
            continue

        parsed_df = parse_data_func(data_file)
        if isinstance(parsed_df, pd.DataFrame):
            parsed_frames.append(parsed_df)

    # Nothing could be parsed: no file is written.
    if not parsed_frames:
        return

    # Concatenate once, instead of growing a DataFrame inside the loop.
    consolidated_df = pd.concat(parsed_frames, ignore_index=True)
    if "task_id" in consolidated_df.columns:
        consolidated_df = consolidated_df.sort_values(by="task_id")

    out_file_no_ext = Path(filename_pattern).stem
    consolidated_df.to_csv(Path(save_dir) / out_file_template.format(out_file_no_ext=out_file_no_ext), index=False)


def compatibilize_crit_nocrit_scores(
    crit_scores_df: pd.DataFrame,
    no_crit_scores_df: pd.DataFrame,
) -> pd.DataFrame:
    """Restrict/extend the no-critique scores so that they cover the tasks scored in Critic mode.

    The tasks "scored in Critic mode" are the rows of `crit_scores_df` with a non-null `score`.
      (i) no-critique rows of tasks that were not scored in Critic mode are dropped;
      (ii) tasks scored in Critic mode but absent from the no-critique scores are added,
           using the Critic-mode values of the columns both DataFrames share.

    Neither input DataFrame is modified.

    Args:
        crit_scores_df (pd.DataFrame): Critic-mode scores, with `task_id` and `score` columns.
        no_crit_scores_df (pd.DataFrame): No-critique scores, with a `task_id` column.

    Returns:
        pd.DataFrame: The adjusted no-critique scores, with integer `task_id` as a regular
        column and the same columns as `no_crit_scores_df`.
    """
    # Work on copies (`assign` returns a new DataFrame) so that the caller's data is untouched.
    crit_df = crit_scores_df.assign(task_id=crit_scores_df["task_id"].astype(int))
    crit_df = crit_df.dropna(subset=["score"]).set_index("task_id")

    no_crit_df = no_crit_scores_df.assign(task_id=no_crit_scores_df["task_id"].astype(int))
    no_crit_df = no_crit_df.set_index("task_id")

    # (i) Maintain in no-critique scores only tasks that were executed in Critic mode (i.e., non-failed executions)
    no_crit_df = no_crit_df[no_crit_df.index.isin(crit_df.index)]

    # (ii) If `crit_df` has a score, but `no_crit_df` does not, add it.
    # Can happen when max actions exceeded.
    # Boolean selection keeps the row order of `crit_df`, making the output deterministic.
    additional_rows = crit_df[~crit_df.index.isin(no_crit_df.index)]
    if not additional_rows.empty:
        # Keep only the columns shared by both DataFrames; the columns that exist only in
        # the no-critique scores end up as NaN for the added rows after the concat.
        common_cols = no_crit_df.columns.intersection(additional_rows.columns)
        additional_rows = additional_rows.reindex(columns=common_cols)
        no_crit_df = pd.concat([no_crit_df, additional_rows])

    return no_crit_df.reset_index()


# ========================================================================
# LINK Main script
# ========================================================================
def consolidate_all_experiments(
    all_experiments_folder,
    functions,
    runs_dirname,
    scores_summary_path,
    data_per_config_filename,
    single_run_summary_filename,
    single_run_scores_round_filename,
    update_scores_summary=True,
    exclude_dirs=None,
):
    """Script to organize folder structure and consolidate data from all experiments in a single folder.
    It pieces together all functions in this module for this purpose.

    The expected layout is `all_experiments_folder/<model>/<config>/<domain>/<runs_dirname>/...`.
    For every model/config/domain, `functions` are applied to the runs folder and the resulting
    `consolidated_*` files are gathered into: a per-config `scores_summary.csv`, the global
    scores summary (`scores_summary_path`) and the global data per config (`data_per_config_filename`).

    Args:
        all_experiments_folder (str): Path to the directory containing all experiments.
        functions (list): Callables applied to each runs folder, called as
            `function(domain_folder / runs_dirname, save_dir=domain_folder)`.
        runs_dirname (str): Name of the directory containing the runs.
        scores_summary_path (str): Path to the file containing the scores summary.
        data_per_config_filename (str): Path to the file containing the data per config.
        single_run_summary_filename (str): Name of the per-run summary file; its consolidated
            version is expected as `consolidated_<single_run_summary_filename>`.
        single_run_scores_round_filename (str): Name of the per-run scores-per-round file; its
            consolidated version is expected as `consolidated_<stem>.csv`.
        update_scores_summary (bool, optional): Whether to update and save the scores summary.
        exclude_dirs (list, optional): substrings; a config dir whose resolved path contains
            any of them is skipped. Defaults to None (nothing is excluded).
    """
    # `None` instead of a mutable `[]` default, which would be shared across calls.
    exclude_dirs = list(exclude_dirs) if exclude_dirs else []

    # Create folder structure
    create_organize_folder_structure(
        all_experiments_folder=all_experiments_folder,
        key_file="args.json",
        runs_dirname=runs_dirname,
        exclude_dirs=exclude_dirs,
    )

    # Summary of scores of all experiments configs (only used if `update_scores_summary`).
    all_scores_df = pd.DataFrame()
    if update_scores_summary and os.path.exists(scores_summary_path):
        all_scores_df = pd.read_csv(scores_summary_path)

    # Results of every model-config-domain, concatenated once at the end.
    all_results = []

    # Iterate over subdirectories in experiments_folder
    for model_folder in Path(all_experiments_folder).iterdir():  # gpt4o, gemini-1.5-flash, ...
        if not model_folder.is_dir():
            continue

        # Iterate over subdirectories in model_folder
        for config_folder in model_folder.iterdir():  # base-stateaware, flash-base_stateaware, ...
            if not config_folder.is_dir():
                continue

            # Skip if config_folder is in exclude_dirs
            resolved_config_folder = str(config_folder.resolve())
            if any(excluded in resolved_config_folder for excluded in exclude_dirs):
                continue

            # Get args files from all runs for this experiment
            args_files = find_files(config_folder, "args.json", upwards=False, downwards=True)
            if len(args_files) == 0:
                continue

            # Get domains from all runs for this experiment
            domains = set(get_domain_from_args_file(args_file) for args_file in args_files)

            base_config_name = f"{model_folder.name}/{config_folder.name}"
            no_crit_config_name = f"{base_config_name}_no_crit"

            # Results of this config only. Collected explicitly so that the per-config summary
            # never depends on name matching (which could pick rows of other configs).
            config_results = []

            # Iterate domains
            for domain in domains:  # shopping, gitlab, ...
                domain_folder = config_folder / domain
                if not domain_folder.is_dir():
                    print(f"Domain folder not found: {domain_folder} for {args_files}")
                    continue

                # Call functions to consolidate data from `runs`
                for function in functions:
                    function(domain_folder / runs_dirname, save_dir=domain_folder)

                # Without e.g. a `consolidated_summary.csv` file there is nothing to record
                # for this model-config-domain.
                consolidated_data_csv_path = Path(domain_folder) / f"consolidated_{single_run_summary_filename}"
                if not consolidated_data_csv_path.exists():
                    continue
                results_df = pd.read_csv(consolidated_data_csv_path)

                if update_scores_summary:
                    all_scores_df = update_all_scores_df(all_scores_df, results_df, base_config_name, domain)

                scores_per_round_csv_path = Path(domain_folder) / Path(
                    f"consolidated_{single_run_scores_round_filename}"
                ).with_suffix(".csv")

                config_label = base_config_name
                if scores_per_round_csv_path.exists():
                    if update_scores_summary:
                        no_crit_scores_df = compatibilize_crit_nocrit_scores(
                            crit_scores_df=results_df,
                            no_crit_scores_df=pd.read_csv(scores_per_round_csv_path),
                        )
                        all_scores_df = update_all_scores_df(
                            all_scores_df, no_crit_scores_df, no_crit_config_name, domain
                        )
                    # NOTE(review): the rows of `results_df` (scores WITH critique) are labeled
                    # with the `_no_crit` name whenever a scores-per-round file exists. This is
                    # kept because consumers of the saved csv files may rely on these labels,
                    # but it looks unintended - confirm.
                    config_label = no_crit_config_name

                # Update data per config with the current agent's data
                results_df["domain"], results_df["config_name"] = domain, config_label
                config_results.append(results_df)

            if not config_results:
                continue
            all_results.extend(config_results)

            config_summary_df = pd.concat(config_results, ignore_index=True)
            if not config_summary_df.empty:
                # Sort the summary by domain and task_id before saving
                config_summary_df = config_summary_df.sort_values(by=["domain", "task_id"])
                # Add column with full path to trace file
                config_summary_df["trace_path"] = config_summary_df.apply(
                    lambda row: str(config_folder / row["domain"] / "htmls" / f"render_{row['task_id']}.html"),
                    axis=1,
                )

                config_scores_summary_path = config_folder / "scores_summary.csv"
                config_summary_df.to_csv(config_scores_summary_path, index=False)
                print(f"Scores summary for config {base_config_name} saved to: {config_scores_summary_path}")

    # Save summary of scores and data per config
    if update_scores_summary:
        all_scores_df.to_csv(scores_summary_path, index=False)
        print(f"All scores summary saved to: {scores_summary_path}")

    if not all_results:
        # Nothing to sort or save; an existing file is left untouched.
        print(f"No data per config found in {all_experiments_folder}. Skipping: {data_per_config_filename}")
        return

    data_per_config_df = pd.concat(all_results, ignore_index=True)
    data_per_config_df.sort_values(by=["config_name", "domain", "task_id"], inplace=True)
    data_per_config_df.to_csv(data_per_config_filename, index=False)
    print(f"All data per config saved to: {data_per_config_filename}")


def _str_to_bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw string,
    so every non-empty value (including ``"False"`` and ``"0"``) becomes
    ``True``. This converter interprets the text instead.

    Args:
        value: The raw command-line token (or an already-parsed bool).

    Returns:
        bool: The parsed boolean value.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, matching what ``bool("")`` returned before.
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value (e.g. True/False), got {value!r}.")


def parse_args():
    """Parse the command-line arguments of the consolidation script.

    Returns:
        argparse.Namespace: Namespace with the attributes ``sanity_check`` (bool),
        ``exclude_dirs`` (list of str), ``update_scores_summary`` (bool) and
        ``save_to_analysis`` (bool).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--s", "--sanity_check", action="store_true", help="Run sanity check on experiments.", dest="sanity_check"
    )
    parser.add_argument(
        "--e",
        "--exclude_dirs",
        nargs="+",
        default=["zzIgnore", "zzOld"],
        help="List of directories to exclude from consolidation.",
        dest="exclude_dirs",
    )
    parser.add_argument(
        "--u",
        "--update_scores_summary",
        # Not ``type=bool``: that would turn "--u False" into True.
        type=_str_to_bool,
        default=True,
        help="Flag to indicate whether to update the scores summary.",
        dest="update_scores_summary",
    )
    parser.add_argument(
        "--sa",
        "--save_to_analysis",
        # Not ``type=bool``: that would turn "--sa False" into True.
        type=_str_to_bool,
        default=True,
        help="Flag to indicate whether to save the data to the analysis spreadsheet.",
        dest="save_to_analysis",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    # Override arguments when a tracer (e.g. an interactive debugger) is active:
    # skip both the spreadsheet update and the sanity check.
    debugger_attached = sys.gettrace() is not None
    if debugger_attached:
        args.save_to_analysis = False
        args.sanity_check = False

    if args.sanity_check:
        print("Sanity check enabled. Creating hashes for initial dir.")
        # Imported lazily so the checker is only loaded when requested.
        from utils.file_utils import SanityChecker

        sanity_checker = SanityChecker(num_processes=-1)
        sanity_checker.set_original_hashes(ALL_EXPERIMENTS_DIRNAME)

    def _runs_data_step(pattern, parser):
        """Bind `consolidate_runs_data` to one filename pattern and its parser."""
        return partial(consolidate_runs_data, filename_pattern=pattern, parse_data_func=parser)

    # Functions handed to `consolidate_all_experiments`, in this order.
    funs = [
        delete_htmls,
        _runs_data_step(SINGLE_RUN_SUMMARY_FILENAME, parse_summary_data_file),
        _runs_data_step(SINGLE_RUN_SCORES_PER_ROUND_FILENAME, parse_scores_per_round),
        _runs_data_step(SINGLE_RUN_EXECUTION_DATA_FILENAME, parse_execution_data_file),
        partial(consolidate_unfinished_in_config_dir, save_to_config_dir=True),
        _runs_data_step("timings.json", parse_timings),
    ]

    consolidate_all_experiments(
        all_experiments_folder=ALL_EXPERIMENTS_DIRNAME,
        functions=funs,
        runs_dirname=RUNS_DIRNAME,
        scores_summary_path=SCORES_SUMMARY_PATH,
        data_per_config_filename=DATA_PER_CONFIG_PATH,
        single_run_summary_filename=SINGLE_RUN_SUMMARY_FILENAME,
        single_run_scores_round_filename=SINGLE_RUN_SCORES_PER_ROUND_FILENAME,
        update_scores_summary=args.update_scores_summary,
        exclude_dirs=args.exclude_dirs,
    )

    if args.sanity_check:
        # Hash the directory again after consolidation and run the check.
        print("Sanity chek step: Creating hashes for final dir.")
        sanity_checker.set_new_hashes(ALL_EXPERIMENTS_DIRNAME)
        sanity_checker.sanity_check()

    # Run the sibling update_analysis_spreadsheet.py script with the same
    # interpreter; check_call raises if it exits with a non-zero status.
    if args.save_to_analysis:
        this_dir = os.path.dirname(__file__)
        script_path = os.path.join(this_dir, "update_analysis_spreadsheet.py")
        subprocess.check_call([sys.executable, script_path, SCORES_SUMMARY_PATH])


# scores_shopping = parse_scores_per_round(
#     scores_per_round_json_path="self_consistency/scores_per_round_shopping.json",
#     single_run_summary_filename="summary_data_shopping.csv",
# )

# scores_classifieds = parse_scores_per_round(
#     scores_per_round_json_path="self_consistency/scores_per_round_classifieds.json",
#     single_run_summary_filename="summary_data_classifieds.csv",
# )


# # To csv
# scores_shopping.to_csv("scores_shopping.csv", index=False)
# scores_classifieds.to_csv("scores_classifieds.csv", index=False)
