"""
This is just the evaluation code which the Berkeley Function Calling Leaderboard uses.
"""

import ast
import json
import logging
from typing import Any, Dict, List, cast, get_args

from inspect_ai import Task, dataset, task
from inspect_ai.dataset import Sample
from inspect_ai.model import ChatMessageAssistant
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import (
    Generate,
    Solver,
    TaskState,
    solver,
)
from inspect_ai.tool import ToolInfo, ToolParam, ToolParams  # type: ignore
from inspect_ai.util import JSONType

logger = logging.getLogger(__name__)

DATASET_PATH = "gorilla-llm/Berkeley-Function-Calling-Leaderboard"


@solver
def bfcl_solver() -> Solver:
    """Build a solver that registers the sample's tools and generates once.

    The returned solver converts every entry of ``state.metadata["tools"]``
    with ``create_tool_info_from_dict``, appends the results to
    ``state.tools`` and then awaits ``generate`` with ``tool_calls="none"``.
    """

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        converted: List[ToolInfo] = [
            create_tool_info_from_dict(spec) for spec in state.metadata["tools"]
        ]
        state.tools.extend(converted)  # type: ignore
        return await generate(state, tool_calls="none")

    return solve


@task
def bfcl(solver: Solver | list[Solver] = bfcl_solver()) -> Task:
    """Create the task over the ``exec_simple`` config of the BFCL dataset.

    Args:
        solver: Solver (or list of solvers) to run; defaults to ``bfcl_solver()``.

    Returns:
        A ``Task`` combining the loaded dataset, ``solver`` and ``bfcl_scorer()``.
    """
    # main branch does not load cleanly into an HF dataset so we use a PR branch which fixes it
    # see https://huggingface.co/datasets/gorilla-llm/Berkeley-Function-Calling-Leaderboard/discussions/15
    pinned_revision = "1bf8bbc3c0e35d04d00339c223a3fd653aa195ac"

    samples = dataset.hf_dataset(
        DATASET_PATH,
        name="exec_simple",
        split="train",
        revision=pinned_revision,
        sample_fields=record_to_sample,
    )
    return Task(dataset=samples, solver=solver, scorer=bfcl_scorer())


@scorer([accuracy()])
def bfcl_scorer() -> Scorer:
    """Build a scorer that compares the model's single tool call to the target.

    A sample is scored ``"C"`` only when the conversation holds exactly one
    assistant message, that message carries exactly one tool call, and both
    the call's function name and its arguments equal the values stored in
    ``state.metadata["target_obj"]``. Every other outcome is scored ``"I"``.
    """

    async def score(state: TaskState, target: Target) -> Score:
        replies = [
            msg for msg in state.messages if isinstance(msg, ChatMessageAssistant)
        ]
        reply_count = len(replies)

        # Guard clauses: anything but exactly one assistant message is incorrect.
        if reply_count == 0:
            return Score(value="I", answer="No assistant message")
        if reply_count != 1:
            return Score(
                value="I",
                answer=f"Expected just 1 assistant message, got {reply_count}",
            )

        tool_calls = replies[0].tool_calls
        if tool_calls is None or len(tool_calls) != 1:
            return Score(value="I", answer=f"tool calls: {repr(tool_calls)}")

        expected = state.metadata["target_obj"]
        call = tool_calls[0]

        args_match = call.arguments == expected["arguments"]
        function_match = call.function == expected["function"]
        logger.info(
            f"args: {call.arguments} == {expected['arguments']}\nfunction: {call.function} == {expected['function']}"
        )

        if args_match and function_match:
            verdict = "C"
        else:
            verdict = "I"

        rendered_call = tool_call_to_string(call.function, call.arguments)
        return Score(value=verdict, answer=repr(rendered_call))

    return score


def record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert one dataset record into a ``Sample``.

    The record must hold exactly one question and one ground-truth call
    string. The ground truth is parsed with ``parse_target``; its readable
    form becomes the sample target, while the parsed call and the record's
    ``"function"`` entry are stored in the sample metadata under
    ``"target_obj"`` and ``"tools"``.

    Args:
        record: Raw record with ``"question"``, ``"ground_truth"`` and
            ``"function"`` keys.

    Returns:
        The ``Sample`` built from the record.

    Raises:
        AssertionError: If the record does not contain exactly one question
            or exactly one ground truth.
    """
    questions = record["question"]
    assert len(questions) == 1
    ground_truths = record["ground_truth"]
    assert len(ground_truths) == 1

    call = parse_target(ground_truths[0])
    display_target = tool_call_to_string(call["function"], call["arguments"])

    # The dataset mixes tuples and lists. A JSON round-trip turns every tuple
    # into a list, which keeps the later equality comparison simple.
    normalized_call = json.loads(json.dumps(call))

    sample_metadata = {"tools": record["function"], "target_obj": normalized_call}
    return Sample(
        input=questions[0],
        target=display_target,
        metadata=sample_metadata,
    )


def get_type(bfcl_type: str | None) -> JSONType | None:
    if bfcl_type is None:
        return None

    if bfcl_type == "dict":
        return "object"

    if bfcl_type == "float":
        bfcl_type = "number"

    if bfcl_type == "tuple":
        bfcl_type = "array"

    assert bfcl_type in get_args(JSONType), f"Invalid type: {bfcl_type}"
    return cast(JSONType, bfcl_type)


def create_tool_param(param_dict: Dict[str, Any] | None) -> ToolParam | None:
    """Recursively convert a parameter schema dict into a ``ToolParam``.

    Args:
        param_dict: Schema dictionary for one parameter, or ``None``.

    Returns:
        The converted ``ToolParam``, or ``None`` when ``param_dict`` is ``None``.
    """
    if param_dict is None:
        return None

    # Nested object schema: convert every property that is not None.
    nested_schemas = param_dict.get("properties")
    properties = None
    if nested_schemas:
        properties = {}
        for prop_name, prop_schema in nested_schemas.items():
            if prop_schema is not None:
                properties[prop_name] = create_tool_param(prop_schema)

    # Array schema: convert the element description when one is present.
    item_schema = param_dict.get("items")
    items = create_tool_param(item_schema) if item_schema else None

    return ToolParam(
        type=get_type(param_dict.get("type")),
        description=param_dict.get("description"),
        default=param_dict.get("default"),
        enum=param_dict.get("enum"),
        items=items,
        properties=properties,  # type: ignore
        additionalProperties=param_dict.get("additionalProperties"),
        required=param_dict.get("required"),
    )


def create_tool_info_from_dict(tool_dict: Dict[str, Any]) -> ToolInfo:
    """
    Build a ToolInfo from a tool specification dictionary.

    Args:
        tool_dict: Dictionary with the tool's ``"name"``, ``"description"``
            and ``"parameters"`` entries

    Returns:
        ToolInfo instance describing the tool

    Raises:
        AssertionError: If the parameters are missing, have no properties, or
            declare a property named ``"additionalProperties"``
    """
    # create_tool_param returns None for None, so a missing "parameters" key
    # ends up as None here and is rejected by the first assertion.
    parameters = create_tool_param(tool_dict.get("parameters"))

    assert parameters is not None
    assert parameters.properties is not None
    assert "additionalProperties" not in parameters.properties

    required_names = parameters.required or []
    tool_params = ToolParams(
        properties=parameters.properties,
        required=required_names,
    )

    return ToolInfo(
        name=tool_dict["name"],
        description=tool_dict["description"],
        parameters=tool_params,
    )


def tool_call_to_string(function_name: str, arguments: dict[str, Any]) -> str:
    """Render a tool call as ``name(key=value, ...)`` for display.

    Values are formatted with their ``str`` form, so string values appear
    without quotes.

    Args:
        function_name: Name of the called function.
        arguments: Keyword arguments of the call, rendered in dict order.

    Returns:
        The call formatted as a single string.
    """
    rendered_pairs = [f"{name}={value}" for name, value in arguments.items()]
    joined = ", ".join(rendered_pairs)
    return f"{function_name}({joined})"


def parse_target(target: str) -> dict[str, Any]:
    """Parse a ground-truth call string such as ``f(a=1, b=[2, 3])``.

    The string must be a single expression statement that calls a bare
    function name using keyword arguments only. Each keyword value is
    evaluated with ``ast.literal_eval``. Parsing is best-effort: a keyword
    whose value is not a literal, or a ``**mapping`` unpacking (which has no
    keyword name), is logged as a warning and left out of the result.

    Args:
        target: Source text of the expected function call.

    Returns:
        A dict with the called name under ``"function"`` and the successfully
        parsed keyword arguments under ``"arguments"``.

    Raises:
        SyntaxError: If ``target`` is not valid Python source.
        AssertionError: If ``target`` is not exactly one call to a bare name
            without positional arguments.
    """
    parsed = ast.parse(target, mode="single")
    assert len(parsed.body) == 1
    body = parsed.body[0]
    assert isinstance(body, ast.Expr)
    call = body.value
    assert isinstance(call, ast.Call)
    assert isinstance(call.func, ast.Name)
    assert call.args == []
    assert call.keywords is not None

    arguments: dict[str, Any] = {}
    for kw in call.keywords:
        if kw.arg is None:
            # `**mapping` has no keyword name; storing it under a None key
            # would break the str-keyed contract of the returned dict.
            logging.getLogger(__name__).warning(
                "Skipping ** unpacking in target %r: arguments must be named",
                target,
            )
            continue
        try:
            arguments[kw.arg] = ast.literal_eval(kw.value)
        except (ValueError, SyntaxError) as e:
            logging.getLogger(__name__).warning(
                "Failed to parse %s: %s", kw.arg, e
            )

    return {"function": call.func.id, "arguments": arguments}