"""
LLM Fine-tuning Experiment Feedback Generation

Provides feedback analysis for LLM fine-tuning experiments, including
model performance evaluation, training metrics analysis, and improvement suggestions.
"""

import json
from typing import Dict

from ftagent.app.finetune.llm.conf import FT_RD_SETTING
from ftagent.core.proposal import (
    Experiment2Feedback,
    ExperimentFeedback,
    HypothesisFeedback,
)
from ftagent.core.scenario import Scenario
from ftagent.log import ftagent_logger as logger
from ftagent.log.utils import dict_get_with_warning
from ftagent.oai.llm_utils import APIBackend
from ftagent.scenarios.finetune.experiment.experiment import FTExperiment
from ftagent.scenarios.finetune.proposal.proposal import FTHypothesis
from ftagent.scenarios.finetune.proposal.trace import FTTrace
from ftagent.utils import convert2bool
from ftagent.utils.agent.tpl import T


class FTExperiment2Feedback(Experiment2Feedback):
    """Generate feedback for LLM fine-tuning experiments.

    Renders a system/user prompt pair from the experiment (an error-analysis
    prompt when an exception is supplied, otherwise a result-evaluation prompt),
    sends it to the LLM backend in JSON mode, and converts the JSON reply into a
    ``HypothesisFeedback``.
    """

    # A train-loss history longer than this is shortened to its first and last
    # halves before being rendered into the prompt (avoids token bloat).
    _MAX_TRAIN_LOSS_ENTRIES = 60

    def __init__(self, scen: Scenario, version: str = "exp_feedback") -> None:
        """
        Args:
            scen: Scenario passed to the base class; its description is rendered
                into the system prompt
            version: Prompt template key used when the experiment succeeded
        """
        super().__init__(scen)
        self.version = version

    @staticmethod
    def _collect_error_info(exp: FTExperiment, exception: Exception) -> str:
        """Return the error text rendered into the error-analysis prompt.

        Prefers the structured feedback (execution, return_checking, code) attached
        to the first sub-workspace that carries a truthy ``feedback`` attribute;
        falls back to ``str(exception)`` when no workspace has one.
        """
        for ws in exp.sub_workspace_list or []:
            runner_feedback = getattr(ws, "feedback", None) if ws else None
            if runner_feedback:
                return f"""## Execution Analysis
{runner_feedback.execution}

## Return Checking
{runner_feedback.return_checking}

## Code Analysis
{runner_feedback.code}"""
        return str(exception)

    @classmethod
    def _split_result(cls, exp_result) -> tuple:
        """Normalise an experiment result into ``(benchmark, training_metrics)``.

        Two input shapes are handled:
          * New format: a dict containing a ``"benchmark"`` key and, optionally,
            ``"training_metrics"`` with a ``"loss_history"`` of ``train``/``eval``
            entries.
          * Legacy format: anything else; the value is treated as the accuracy
            summary itself and no training metrics are reported.

        The stored result is never modified: the loss history is shallow-copied
        before the train entries are shortened.
        """
        if not (isinstance(exp_result, dict) and "benchmark" in exp_result):
            return {"accuracy_summary": exp_result, "error_samples": []}, {}

        benchmark = exp_result.get("benchmark", {})
        # `or` (not a .get default) so that an explicit None value is tolerated too.
        raw_metrics = exp_result.get("training_metrics") or {}
        # Copy before editing: the original dict belongs to the experiment's
        # stored result and must keep its full history.
        loss_history = dict(raw_metrics.get("loss_history") or {"train": [], "eval": []})

        train_losses = loss_history.get("train") or []
        if len(train_losses) > cls._MAX_TRAIN_LOSS_ENTRIES:
            half = cls._MAX_TRAIN_LOSS_ENTRIES // 2
            # Keep the head and the tail of the curve; drop the middle.
            loss_history["train"] = train_losses[:half] + train_losses[-half:]

        if loss_history.get("train") or loss_history.get("eval"):
            return benchmark, {"loss_history": loss_history}
        return benchmark, {}

    def generate_feedback(
        self, exp: FTExperiment, trace: FTTrace | None = None, exception: Exception | None = None
    ) -> ExperimentFeedback:
        """
        Generate comprehensive feedback for LLM fine-tuning experiment.

        Args:
            exp: The experiment to analyze
            trace: Experiment trace (optional); when given, its SOTA benchmark is
                included in the prompt for comparison
            exception: If provided, indicates experiment failed and contains error details

        Returns:
            A ``HypothesisFeedback`` built from the LLM's JSON reply. ``acceptable``
            is True only when ``exception`` is None; ``observations`` carries the
            reply's "Error Type" in the failure case and None otherwise.

        Note: If exception is None, it means training succeeded and we evaluate quality/effectiveness.
              If exception is provided, we analyze the failure cause.
        """
        task_desc = exp.sub_tasks[0].get_task_information()

        if exception is not None:
            # Error case: use error analysis prompt
            version = "exp_feedback_error"
            error_info = self._collect_error_info(exp, exception)

            system_prompt = T(f".prompts:{version}.system").r(
                scenario=self.scen.get_scenario_all_desc(),
            )
            # The workspace may be missing after a failure; fall back to no files.
            workspace = getattr(exp, "experiment_workspace", None)
            workspace_files = workspace.file_dict if workspace is not None else {}
            user_prompt = T(f".prompts:{version}.user").r(
                hypothesis=exp.hypothesis,
                task_desc=task_desc,
                workspace_files=workspace_files,
                error_info=error_info,
            )
        else:
            # Success case: use normal feedback prompt
            version = self.version
            benchmark, training_metrics = self._split_result(exp.experiment_workspace.running_info.result)

            # Get SOTA experiment's benchmark results for comparison
            sota_benchmark = trace.sota_benchmark() if trace else None

            # Baseline score is read from the scenario; None when the attribute is absent.
            baseline_benchmark = getattr(self.scen, "baseline_benchmark_score", None)

            system_prompt = T(f".prompts:{version}.system").r(
                scenario=self.scen.get_scenario_all_desc(),
                has_sota=sota_benchmark is not None,
                force_think_token=FT_RD_SETTING.force_think_token,
            )
            user_prompt = T(f".prompts:{version}.user").r(
                hypothesis=exp.hypothesis,
                task_desc=task_desc,
                workspace_files=exp.experiment_workspace.file_dict,
                execution_time=exp.experiment_workspace.running_info.running_time,
                benchmark=benchmark,
                training_metrics=training_metrics,
                sota_benchmark=sota_benchmark,
                baseline_benchmark=baseline_benchmark,
            )

        resp_dict = json.loads(
            APIBackend().build_messages_and_create_chat_completion(
                user_prompt=user_prompt,
                system_prompt=system_prompt,
                json_mode=True,
                json_target_type=Dict[str, str | bool | int],
            )
        )

        # Extract feedback components
        error_type = resp_dict.get("Error Type") if exception is not None else None
        return HypothesisFeedback(
            code_change_summary=dict_get_with_warning(resp_dict, "Code Summary", "No code summary provided"),
            reason=dict_get_with_warning(resp_dict, "Reason", "No reasoning provided"),
            decision=convert2bool(dict_get_with_warning(resp_dict, "Decision", "no")),
            acceptable=exception is None,  # Only acceptable if no error
            observations=error_type,  # Store error type for history display
        )
