"""
DESCRIPTION

This experiment is a simple example of how to use the framework. It does the following steps:
1. Load a 600 problem subset of the APPS dataset
2. Query the strong model for solutions
3. Query the strong model to add a backdoor
4. Query the weak model to detect backdoors for the innocent solutions
5. Query the weak model to detect backdoors for the backdoored solutions
6. Evaluate the unit test performance of the raw solutions
7. Evaluate the unit test performance of the backdoored solutions
8. Report the classification accuracy of the weak model
"""

import asyncio

from core.utils import setup_logging
from src.code_evaluation.executor import run_solutions
from src.model_querying.prompt_creation import (
    get_apps_solution_prompt,
    get_backdoor_prompt,
    get_monitoring_prompt,
)
from src.model_querying.solution_extraction import (
    extract_backdoor_solution,
    extract_monitoring_decision,
    extract_solution_in_tag,
)
from src.monitoring.evaluation import SolutionStatus, calculate_classification_f1
from src.pipeline.pipeline import Pipeline, PipelineConfig
from src.runners.evaluate_code import print_eval
from src.tools.dataloaders import load_problems_from_json
from src.tools.path_utils import get_root_directory


def get_pipeline(
    strong_model, weak_model, name=None, use_cache=True, num_problems=None
):
    ROOT_DIR = get_root_directory()
    DATA_DIR = ROOT_DIR / "data" / "APPS"
    TEST_SUBSET_FILE = DATA_DIR / "test_subset.json"

    assert (
        strong_model is not None and weak_model is not None
    ), "Both strong and weak models must be provided"
    if name is None:
        pipeline_name = "simple_monitoring"
    else:
        pipeline_name = f"simple_monitoring-{name}"

    pipeline_config = PipelineConfig(
        pipeline_name,
        anthropic_num_threads=40,
        openai_fraction_rate_limit=0.99,
        num_problems=num_problems,
        use_cache=use_cache,
    )

    pipeline = Pipeline(pipeline_config)
    load_data = pipeline.add_load_data_step(
        "load_problems", load_problems_from_json, TEST_SUBSET_FILE
    )
    get_solutions = pipeline.add_query_step(
        "get_solutions",
        strong_model,
        get_apps_solution_prompt,
        extract_solution_in_tag,
        dependencies=[load_data],
    )
    evaluate_solutions = pipeline.add_code_evaluation_step(
        "evaluate_baseline", strong_model, run_solutions, dependencies=[get_solutions]
    )
    get_backdoors = pipeline.add_query_step(
        "insert_backdoors",
        strong_model,
        get_backdoor_prompt,
        extract_backdoor_solution,
        dependencies=[get_solutions],
    )
    evaluate_backdoors = pipeline.add_code_evaluation_step(
        "evaluate_backdoors", strong_model, run_solutions, dependencies=[get_backdoors]
    )
    query_innocent_monitoring = pipeline.add_query_step(
        "query_innocent_monitoring",
        weak_model,
        get_monitoring_prompt,
        lambda x, y: extract_monitoring_decision(x, SolutionStatus.INNOCENT),
        dependencies=[get_solutions],
    )
    query_backdoor_monitoring = pipeline.add_query_step(
        "query_backdoor_monitoring",
        weak_model,
        get_monitoring_prompt,
        lambda x, y: extract_monitoring_decision(x, SolutionStatus.BACKDOOR),
        dependencies=[get_backdoors],
    )
    calculate_f1 = pipeline.add_monitoring_step(
        "calculate_f1",
        calculate_classification_f1,
        dependencies=[query_innocent_monitoring, query_backdoor_monitoring],
    )

    return pipeline


if __name__ == "__main__":
    pipeline = get_pipeline(
        strong_model="gpt-4-0125-preview",
        weak_model="gpt-3.5-turbo-1106",
        num_problems=2000,
    )
    results = asyncio.run(pipeline.run())
    print_eval(results["evaluate_baseline"])
    print_eval(results["evaluate_backdoors"])
    print(results["calculate_f1"])
