"""
Integration tests for the prompt optimisation framework.
"""

import os
import tempfile
import pathlib
from unittest import mock

import pytest

from src.core.experiment import ExperimentRunner
from src.llm.dummy_llm import DummyLLM
from src.evaluation.evaluator import Evaluator


class TestSimpleExperiment:
    """End-to-end check of a minimal experiment that uses no optimisation."""

    def setup_method(self):
        """Prepare a scratch output directory and the experiment config."""
        # Scratch directory; removed again in teardown_method
        self.temp_dir = tempfile.TemporaryDirectory()
        self.output_dir = pathlib.Path(self.temp_dir.name)

        # Assemble the config section by section
        project_section = {"name": "test_project", "version": "0.1.0"}
        paths_section = {"data": "data", "output": str(self.output_dir)}
        llm_section = {
            "type": "DummyLLM",
            "version_name": "dummy",
            "response": "Yes",
        }
        self.config = {
            "project": project_section,
            "paths": paths_section,
            "llm": llm_section,
            "prompt_optimiser": {"type": "none"},
            "evaluation": {"type": "default"},
        }

    def teardown_method(self):
        """Remove the scratch directory created in setup_method."""
        self.temp_dir.cleanup()

    def test_simple_experiment(self):
        """Run a four-step experiment and verify the result of every step."""

        # Step callables handed to the runner. Their names and parameter
        # names are kept stable: how the runner supplies the arguments is
        # not visible from this file.
        def setup_llm(llm_config):
            """Build the dummy LLM from its config section."""
            return DummyLLM(**llm_config)

        def setup_evaluator():
            """Build a fresh evaluator."""
            return Evaluator()

        def run_generation(llm):
            """Ask the LLM one question; there is no optimisation step."""
            system_prompt = "You are a helpful assistant."
            reply = llm.generate(
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": "Is the sky blue?"},
                ]
            )
            return {"base_prompt": system_prompt, "response": reply}

        def evaluate_results(generation_results, evaluator):
            """Score the generated response by exact match."""
            # A real evaluation would be richer; an exact comparison against
            # the expected answer is enough for this test.
            if generation_results["response"] == "Yes":
                score = 1.0
            else:
                score = 0.0

            evaluator.register_metric("accuracy", lambda _: score)
            return [score]

        # Build the runner and register the steps in execution order
        runner = ExperimentRunner(
            "simple_experiment",
            config=self.config,
            output_dir=self.output_dir,
        )
        runner.add_step("setup_llm", setup_llm, llm_config=self.config["llm"])
        for step_name, step_fn in (
            ("setup_evaluator", setup_evaluator),
            ("run_generation", run_generation),
            ("evaluate_results", evaluate_results),
        ):
            runner.add_step(step_name, step_fn)

        # Patch out _save_results so the run performs no file operations
        with mock.patch.object(runner, "_save_results"):
            results = runner.run()

            # LLM and evaluator steps return instances of the expected types
            assert "setup_llm" in results
            assert isinstance(results["setup_llm"], DummyLLM)

            assert "setup_evaluator" in results
            assert isinstance(results["setup_evaluator"], Evaluator)

            # Generation step returns the prompt and the configured response
            assert "run_generation" in results
            generation = results["run_generation"]
            assert generation["base_prompt"] == "You are a helpful assistant."
            assert generation["response"] == "Yes"

            # Evaluation step returns a single perfect score
            assert "evaluate_results" in results
            assert results["evaluate_results"] == [1.0]

            # The runner recorded a duration metric for the run
            assert "duration" in runner.metrics
