import datasets
import pandas as pd

# Never truncate cell contents when DataFrames are printed. The fully-qualified
# option name is used because pandas resolves abbreviated names by pattern
# matching, which can stop working if a similarly named option is added later.
pd.set_option("display.max_colwidth", None)

# Base directory under which the validation result files are read and written.
OUTPUT_DIR = "output_gaia"


import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub. The token is read from the
# HF_TOKEN environment variable when it is set and non-empty, so no secret has
# to be written into this file; the original placeholder remains the fallback.
login(os.environ.get("HF_TOKEN") or "YOUR_HF_TOKEN")

# Load the validation split of the GAIA benchmark and rename its columns to
# the names used by the rest of this script.
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns(
    {"Question": "question", "Final answer": "true_answer", "Level": "task"}
)
eval_df = pd.DataFrame(eval_ds)

import glob

# Destination of the merged answers; it lives next to the per-run files, so it
# has to be excluded when those files are collected.
answer_file_path = f"{OUTPUT_DIR}/validation/answers.jsonl"

run_files = [
    path
    for path in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl")
    if "answers.jsonl" not in path
]

# Stack all run files into one frame, drop the timing columns and persist the
# merged records as JSON lines.
result_df = pd.concat([pd.read_json(path, lines=True) for path in run_files])
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df.to_json(answer_file_path, lines=True, orient="records")

import re
from collections import Counter

from scripts.evaluation.gaia_scorer import check_close_call, question_scorer

def _score_prediction(row):
    """Score one result row's prediction against its true answer."""
    return question_scorer(row["prediction"], row["true_answer"])


def _flag_close_call(row):
    """Run the close-call check for one row, given its `is_correct` value."""
    return check_close_call(row["prediction"], row["true_answer"], row["is_correct"])


# `is_correct` must be filled first: the close-call check reads it.
result_df["is_correct"] = result_df.apply(_score_prediction, axis=1)
result_df["is_near_correct"] = result_df.apply(_flag_close_call, axis=1)

# Number of intermediate steps recorded for each row.
result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    """Return the attachment file extension for the given question text.

    The evaluation question is located by substring containment: the first
    row of ``eval_df`` whose ``question`` occurs inside ``question`` is used.

    Returns:
        ``"Not found"`` when no evaluation question is contained in
        ``question``; ``"None"`` when the matched ``file_name`` is not a
        non-empty string; otherwise the part of the file name after its last
        dot.
    """
    is_contained = eval_df["question"].apply(lambda known: known in question)
    candidates = eval_df.loc[is_contained, "file_name"]
    if len(candidates) == 0:
        return "Not found"

    first_file = candidates.values[0]
    if not isinstance(first_file, str) or len(first_file) == 0:
        return "None"
    return first_file.split(".")[-1]


# Tag every result row with the file extension of its question's attachment
# ("Not found" / "None" when there is no match or no file name).
result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    """Count the call-like names appearing in ``code``.

    A name is any run of word characters directly followed by ``(``; only
    names for which ``str.islower()`` is true are counted.

    Returns:
        A ``Counter`` mapping each such name to its number of occurrences.
    """
    called_names = re.findall(r"\b(\w+)\(", code)
    return Counter(name for name in called_names if name.islower())


def sum_tool_calls(steps):
    """Add up the tool-call counts of every step that has an ``llm_output``.

    Steps without an ``"llm_output"`` key are ignored.

    Returns:
        A ``Counter`` with the combined counts over all steps.
    """
    tally = Counter()
    llm_outputs = (step["llm_output"] for step in steps if "llm_output" in step)
    for llm_text in llm_outputs:
        tally += extract_tool_calls(llm_text)
    return tally


# For each result row, total the tool-call counts over its intermediate steps.
result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

def get_thoughts(x):
    """Flatten a run's intermediate steps into a single transcript string.

    Args:
        x: Sequence of step dicts. The first one must hold the task text
            under ``"task"``; each later one is expected to hold
            ``"llm_output"`` plus either ``"observation"`` or ``"error"``.

    Returns:
        The task text followed, for every later step, by its LLM output and
        its observation or error. ``None`` when ``x`` cannot be indexed or
        its first step has no ``"task"`` entry.
    """
    try:
        output = x[0]["task"]
        for y in x[1:]:
            try:
                if "observation" in y:
                    output += y["llm_output"] + "\nObservation:" + y["observation"]
                else:
                    # "\n" replaces the former "\E", which is an invalid escape
                    # sequence and glued the error onto the LLM output.
                    output += y["llm_output"] + "\nError:" + str(y["error"])
            except (KeyError, IndexError, TypeError):
                # Best effort: a malformed step (missing key, non-string
                # value) is left out instead of aborting the transcript.
                continue
        return output
    except (KeyError, IndexError, TypeError):
        # Empty, missing or otherwise unusable step list.
        return None


# Render each row's intermediate steps as one transcript string.
result_df["thoughts"] = result_df["intermediate_steps"].apply(get_thoughts)

# Agent version(s) under analysis.
version = "react_code_deepseek-r1-1.5b-qwen-distill-fp16"
list_versions = [version]

print("\n\n\n\n")

# Keep only the rows produced by the selected agent version(s).
is_selected_agent = result_df["agent_name"].isin(list_versions)
sel_df = result_df.loc[is_selected_agent].reset_index(drop=True)
print(sel_df["agent_name"].value_counts())

# A question answered several times by the same agent is counted once.
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
print(sel_df.groupby("agent_name")[["task"]].value_counts())

# 165 is the hard-coded row count regarded as a complete run.
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)

# After de-duplication a question may appear at most once per selected version.
max_occurrences = sel_df["question"].value_counts().max()
assert max_occurrences == len(list_versions), "Some questions are duplicate!"


# Rows scored as wrong but flagged as a close call. This is a bare expression,
# so its result is discarded when the file runs as a script (presumably left
# over from interactive/notebook use — TODO confirm).
sel_df.loc[
    (sel_df["is_correct"] == False) & (sel_df["is_near_correct"] == True),
    ["question", "prediction", "true_answer"],
]

# Mean of `is_correct` per agent, rounded to three decimals.
average_scores = sel_df.groupby("agent_name")[["is_correct"]].mean().round(3)
print("Average score:", average_scores)

# Per agent and task: mean correctness, mean close-call rate, mean number of
# steps and number of questions (reported under the column name "count").
per_task_columns = ["is_correct", "is_near_correct", "count_steps", "question"]
per_task_aggregations = {
    "is_correct": "mean",
    "is_near_correct": "mean",
    "count_steps": "mean",
    "question": "count",
}
per_task_stats = sel_df.groupby(["agent_name", "task"])[per_task_columns].agg(
    per_task_aggregations
)
print(per_task_stats.rename(columns={"question": "count"}))

# Running statistics per agent, in row order: the expanding mean of
# `is_correct`, plus an expanding count of `is_near_correct` that is renamed to
# "index" and used as the position of the row within its agent group.
# The `axis` keyword of `expanding` is deprecated since pandas 2.1; 0 was its
# default, so leaving it out does not change the result. `rename` already
# returns a new frame, so no explicit copy is needed.
cumulative_df = (
    sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
    .expanding(min_periods=1, method="single")
    .agg({"is_correct": "mean", "is_near_correct": "count"})
    .reset_index()
    .rename(columns={"is_near_correct": "index"})
)
# The count starts at 1; shift it so that it is a 0-based position.
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    """Return the first 50 characters of the question a cumulative row refers to.

    The question is looked up in ``sel_df`` among the rows of
    ``row["agent_name"]``, at position ``row["index"]``.

    Returns:
        The truncated question text, or ``""`` if the lookup raises.
    """
    try:
        belongs_to_agent = sel_df["agent_name"] == row["agent_name"]
        agent_questions = sel_df.loc[belongs_to_agent, "question"]
        return agent_questions.iloc[row["index"]][:50]
    except Exception:
        return ""


# Attach to every cumulative row the first 50 characters of its question
# ("" when the lookup fails).
cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)


# Questions of `version` whose text contains "Eva". This is a bare expression,
# so its result is discarded when the file runs as a script.
sel_df.loc[
    (sel_df["agent_name"] == version) & (sel_df["question"].str.contains("Eva")),
    "question",
]

# Questions attempted by `version` that no agent in sel_df answered correctly;
# filled further down.
unsolved_questions = []


def was_question_solved(question):
    """Tell whether any agent in ``sel_df`` answered ``question`` correctly.

    For each agent only the first row matching the question is inspected.

    Returns:
        ``True`` as soon as one agent's first matching row has
        ``is_correct`` equal to 1, otherwise ``False``.
    """
    for agent_name in sel_df["agent_name"].unique():
        is_attempt = (sel_df["agent_name"] == agent_name) & (
            sel_df["question"] == question
        )
        attempts = sel_df.loc[is_attempt]
        if len(attempts) == 0:
            continue
        if attempts["is_correct"].iloc[0] == 1:
            return True
    return False


# Append, in order of first appearance, every distinct question attempted by
# `version` that was not solved by any agent.
unsolved_questions.extend(
    attempted
    for attempted in sel_df.loc[sel_df["agent_name"] == version, "question"].unique()
    if not was_question_solved(attempted)
)

# From the solved one of the files, print numbers of L1 L2 L3 solved
# print statistics

# Sum every comma-separated number found in costs.txt.
with open("costs.txt", "r") as f:
    costs = f.readlines()

# Flatten the lines into one list of raw, still unparsed tokens.
cost = [token for line in costs for token in line.split(",")]

final = 0.0
for c in cost:
    try:
        final += float(c)
    except ValueError:
        # Not a number (e.g. a blank trailing field): echo the token and keep
        # summing. Only parse failures are tolerated here; anything else,
        # including Ctrl-C, propagates.
        print(c)

print(final)
