"""Prompt templates for LLM-as-judge evaluation."""

import sys
from pathlib import Path
from typing import TypedDict, Annotated
from langchain_core.prompts import ChatPromptTemplate

# Make the directory four levels above this file importable.
# NOTE(review): presumably that directory is the project root — confirm
# against the repository layout.
root = Path(__file__)
for _level in range(4):
    # Chained `.parent` (rather than `.parents[3]`) so shallow paths stop at
    # the filesystem root instead of raising IndexError.
    root = root.parent
sys.path.append(str(root))


# Base evaluation scores structure
class ComponentEvaluationScores(TypedDict):
    """Generic judge result: three 1-10 component scores, an overall score, and brief reasoning."""

    # Field metadata follows LangChain's TypedDict structured-output convention
    # Annotated[type, default, description]. `...` in the default slot means
    # "required, no default"; with only two arguments the string would be read
    # as the field's default value instead of its description.
    # NOTE(review): assumes this schema is handed to LangChain structured
    # output (which also uses the class docstring as the schema description)
    # — confirm against the caller.
    primary_score: Annotated[int, ..., "Primary evaluation metric score (1-10)"]
    secondary_score: Annotated[int, ..., "Secondary evaluation metric score (1-10)"]
    tertiary_score: Annotated[int, ..., "Tertiary evaluation metric score (1-10)"]
    overall_quality: Annotated[float, ..., "Weighted average of component scores (1-10)"]
    primary_reasoning: Annotated[str, ..., "Brief explanation for primary score (max 50 words)"]
    secondary_reasoning: Annotated[str, ..., "Brief explanation for secondary score (max 50 words)"]
    tertiary_reasoning: Annotated[str, ..., "Brief explanation for tertiary score (max 50 words)"]


# Component-specific evaluation scores
class AnnotationEvaluationScores(TypedDict):
    """Judge scores and brief reasoning for the annotation component."""

    # Field metadata follows LangChain's TypedDict structured-output convention
    # Annotated[type, default, description]. `...` in the default slot means
    # "required, no default"; with only two arguments the string would be read
    # as the field's default value instead of its description.
    # NOTE(review): assumes this schema is handed to LangChain structured
    # output (which also uses the class docstring as the schema description)
    # — confirm against the caller.
    accuracy_score: Annotated[int, ..., "Accuracy of annotation vs ground truth (1-10)"]
    completeness_score: Annotated[int, ..., "Completeness of key events coverage (1-10)"]
    clarity_score: Annotated[int, ..., "Clarity for driving analysis (1-10)"]
    overall_quality: Annotated[float, ..., "Overall annotation quality (1-10)"]
    accuracy_reasoning: Annotated[str, ..., "Brief accuracy explanation (max 50 words)"]
    completeness_reasoning: Annotated[str, ..., "Brief completeness explanation (max 50 words)"]
    clarity_reasoning: Annotated[str, ..., "Brief clarity explanation (max 50 words)"]


class SceneEvaluationScores(TypedDict):
    """Judge scores and brief reasoning for the scene extraction component."""

    # Field metadata follows LangChain's TypedDict structured-output convention
    # Annotated[type, default, description]. `...` in the default slot means
    # "required, no default"; with only two arguments the string would be read
    # as the field's default value instead of its description.
    # NOTE(review): assumes this schema is handed to LangChain structured
    # output (which also uses the class docstring as the schema description)
    # — confirm against the caller.
    extraction_quality: Annotated[int, ..., "Quality of scene decomposition (1-10)"]
    temporal_coherence: Annotated[int, ..., "Logical temporal ordering (1-10)"]
    safety_relevance: Annotated[int, ..., "Focus on driving safety aspects (1-10)"]
    overall_quality: Annotated[float, ..., "Overall scene extraction quality (1-10)"]
    extraction_reasoning: Annotated[str, ..., "Brief extraction quality explanation (max 50 words)"]
    temporal_reasoning: Annotated[str, ..., "Brief temporal coherence explanation (max 50 words)"]
    safety_reasoning: Annotated[str, ..., "Brief safety relevance explanation (max 50 words)"]


class ViolationEvaluationScores(TypedDict):
    """Judge scores and brief reasoning for the violation detection component."""

    # Field metadata follows LangChain's TypedDict structured-output convention
    # Annotated[type, default, description]. `...` in the default slot means
    # "required, no default"; with only two arguments the string would be read
    # as the field's default value instead of its description.
    # NOTE(review): assumes this schema is handed to LangChain structured
    # output (which also uses the class docstring as the schema description)
    # — confirm against the caller.
    detection_accuracy: Annotated[int, ..., "Accuracy of violation identification (1-10)"]
    explanation_quality: Annotated[int, ..., "Quality of violation reasoning (1-10)"]
    legal_consistency: Annotated[int, ..., "Consistency with traffic law (1-10)"]
    overall_quality: Annotated[float, ..., "Overall violation detection quality (1-10)"]
    detection_reasoning: Annotated[str, ..., "Brief detection accuracy explanation (max 50 words)"]
    explanation_reasoning: Annotated[str, ..., "Brief explanation quality reasoning (max 50 words)"]
    legal_reasoning: Annotated[str, ..., "Brief legal consistency explanation (max 50 words)"]


class AccidentEvaluationScores(TypedDict):
    """Judge scores and brief reasoning for the accident assessment component."""

    # Field metadata follows LangChain's TypedDict structured-output convention
    # Annotated[type, default, description]. `...` in the default slot means
    # "required, no default"; with only two arguments the string would be read
    # as the field's default value instead of its description.
    # NOTE(review): assumes this schema is handed to LangChain structured
    # output (which also uses the class docstring as the schema description)
    # — confirm against the caller.
    risk_assessment_accuracy: Annotated[int, ..., "Accuracy of risk evaluation (1-10)"]
    consequence_prediction: Annotated[int, ..., "Quality of outcome prediction (1-10)"]
    context_understanding: Annotated[int, ..., "Consideration of environmental factors (1-10)"]
    overall_quality: Annotated[float, ..., "Overall accident assessment quality (1-10)"]
    risk_reasoning: Annotated[str, ..., "Brief risk assessment explanation (max 50 words)"]
    consequence_reasoning: Annotated[str, ..., "Brief consequence prediction explanation (max 50 words)"]
    context_reasoning: Annotated[str, ..., "Brief context understanding explanation (max 50 words)"]


class AssessmentEvaluationScores(TypedDict):
    """Judge scores and brief reasoning for the driving assessment component."""

    # Field metadata follows LangChain's TypedDict structured-output convention
    # Annotated[type, default, description]. `...` in the default slot means
    # "required, no default"; with only two arguments the string would be read
    # as the field's default value instead of its description.
    # NOTE(review): assumes this schema is handed to LangChain structured
    # output (which also uses the class docstring as the schema description)
    # — confirm against the caller.
    assessment_accuracy: Annotated[int, ..., "Alignment with expert evaluation (1-10)"]
    advice_actionability: Annotated[int, ..., "Practical value of improvement suggestions (1-10)"]
    score_justification: Annotated[int, ..., "How well safety score matches evidence (1-10)"]
    overall_quality: Annotated[float, ..., "Overall assessment quality (1-10)"]
    assessment_reasoning: Annotated[str, ..., "Brief assessment accuracy explanation (max 50 words)"]
    advice_reasoning: Annotated[str, ..., "Brief advice actionability explanation (max 50 words)"]
    justification_reasoning: Annotated[str, ..., "Brief score justification explanation (max 50 words)"]


# Shared system-prompt preamble for every judge prompt in this module: the
# evaluator role, the 1-10 scoring rubric, and general scoring instructions.
# Each component prompt below concatenates its own focus criteria onto this
# string. The text contains no `{placeholder}` fields.
# NOTE: this is runtime prompt text — whitespace (including trailing spaces on
# some lines) is part of the literal, so edit it deliberately.
BASE_SYSTEM_PROMPT = """You are an expert evaluator for autonomous driving systems. 
You MUST provide quantitative scores (1-10 integers) for each evaluation dimension.

SCORING SCALE:
10 = Excellent (95-100% quality)
9 = Very Good (85-94% quality)  
8 = Good (75-84% quality)
7 = Above Average (65-74% quality)
6 = Average (55-64% quality)
5 = Below Average (45-54% quality)
4 = Poor (35-44% quality)
3 = Very Poor (25-34% quality)
2 = Extremely Poor (15-24% quality)
1 = Unacceptable (0-14% quality)

EVALUATION INSTRUCTIONS:
- Compare system output against ground truth carefully
- Provide integer scores (1-10) for each dimension
- Keep reasoning brief (max 50 words each)
- Be objective and consistent in your scoring
- Focus on the specific component being evaluated"""


# Judge prompt for the annotation component.
# System turn: shared rubric + annotation-specific criteria.
# User turn: expects {ground_truth}, {system_output}, {video_id}, {model}.
annotation_judge_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            BASE_SYSTEM_PROMPT + """

You are evaluating ANNOTATION COMPONENT performance. Focus on:
1. ACCURACY: How accurately does the system annotation capture the driving scenario?
2. COMPLETENESS: Does the system annotation include all critical driving events?
3. CLARITY: How clear and understandable is the annotation for driving analysis?""",
        ),
        (
            "user",
            """Compare the system annotation against ground truth and provide scores:

GROUND TRUTH ANNOTATION:
{ground_truth}

SYSTEM ANNOTATION:
{system_output}

VIDEO ID: {video_id}
MODEL: {model}

Provide integer scores (1-10) for accuracy, completeness, and clarity with brief reasoning.""",
        ),
    ]
)


# Judge prompt for the scene extraction component.
# System turn: shared rubric + scene-extraction-specific criteria.
# User turn: expects {ground_truth}, {system_output}, {video_id}, {model}.
scene_judge_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            BASE_SYSTEM_PROMPT + """

You are evaluating SCENE EXTRACTION COMPONENT performance. Focus on:
1. EXTRACTION_QUALITY: How well does the system extract distinct, safety-relevant scenes?
2. TEMPORAL_COHERENCE: Are the extracted scenes in logical temporal order?
3. SAFETY_RELEVANCE: How well do scenes focus on driving safety aspects?""",
        ),
        (
            "user",
            """Compare the system scene extraction against ground truth and provide scores:

GROUND TRUTH SCENES:
{ground_truth}

SYSTEM SCENES:
{system_output}

VIDEO ID: {video_id}
MODEL: {model}

Provide integer scores (1-10) for extraction quality, temporal coherence, and safety relevance with brief reasoning.""",
        ),
    ]
)


# Judge prompt for the violation detection component.
# System turn: shared rubric + violation-detection-specific criteria.
# User turn: expects {ground_truth}, {system_output}, {video_id}, {model}.
violation_judge_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            BASE_SYSTEM_PROMPT + """

You are evaluating VIOLATION DETECTION COMPONENT performance. Focus on:
1. DETECTION_ACCURACY: How accurately does the system identify traffic violations?
2. EXPLANATION_QUALITY: How well does the system explain why violations occurred?
3. LEGAL_CONSISTENCY: Are the identified violations consistent with traffic law?""",
        ),
        (
            "user",
            """Compare the system violation detection against ground truth and provide scores:

GROUND TRUTH VIOLATIONS:
{ground_truth}

SYSTEM VIOLATIONS:
{system_output}

VIDEO ID: {video_id}
MODEL: {model}

Provide integer scores (1-10) for detection accuracy, explanation quality, and legal consistency with brief reasoning.""",
        ),
    ]
)


# Judge prompt for the accident assessment component.
# System turn: shared rubric + accident-assessment-specific criteria.
# User turn: expects {ground_truth}, {system_output}, {video_id}, {model}.
accident_judge_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            BASE_SYSTEM_PROMPT + """

You are evaluating ACCIDENT ASSESSMENT COMPONENT performance. Focus on:
1. RISK_ASSESSMENT_ACCURACY: How accurately does the system assess accident risks?
2. CONSEQUENCE_PREDICTION: How well does the system predict potential accident outcomes?
3. CONTEXT_UNDERSTANDING: Does the system consider relevant environmental factors?""",
        ),
        (
            "user",
            """Compare the system accident assessment against ground truth and provide scores:

GROUND TRUTH ACCIDENTS:
{ground_truth}

SYSTEM ACCIDENTS:
{system_output}

VIDEO ID: {video_id}
MODEL: {model}

Provide integer scores (1-10) for risk assessment accuracy, consequence prediction, and context understanding with brief reasoning.""",
        ),
    ]
)


# Judge prompt for the driving assessment component.
# System turn: shared rubric + driving-assessment-specific criteria.
# User turn: expects {ground_truth}, {system_output}, {video_id}, {model}.
assessment_judge_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            BASE_SYSTEM_PROMPT + """

You are evaluating DRIVING ASSESSMENT COMPONENT performance. Focus on:
1. ASSESSMENT_ACCURACY: How well does the safety assessment match expert evaluation?
2. ADVICE_ACTIONABILITY: How actionable and relevant are the driving improvement suggestions?
3. SCORE_JUSTIFICATION: How well justified is the safety score based on the evidence?""",
        ),
        (
            "user",
            """Compare the system driving assessment against ground truth and provide scores:

GROUND TRUTH ASSESSMENT:
{ground_truth}

SYSTEM ASSESSMENT:
{system_output}

VIDEO ID: {video_id}
MODEL: {model}

Provide integer scores (1-10) for assessment accuracy, advice actionability, and score justification with brief reasoning.""",
        ),
    ]
)


# Registry keyed by component name; each value is the pair
# (judge prompt template, TypedDict score schema) for that component.
COMPONENT_PROMPTS = dict(
    annotation=(annotation_judge_prompt, AnnotationEvaluationScores),
    scene=(scene_judge_prompt, SceneEvaluationScores),
    violation=(violation_judge_prompt, ViolationEvaluationScores),
    accident=(accident_judge_prompt, AccidentEvaluationScores),
    assessment=(assessment_judge_prompt, AssessmentEvaluationScores),
)


def get_component_prompt(component: str):
    """Look up the judge prompt and score schema registered for a component.

    Args:
        component: Key into ``COMPONENT_PROMPTS`` (e.g. ``'annotation'``).

    Returns:
        The ``(prompt_template, score_class)`` pair stored for ``component``.

    Raises:
        ValueError: If ``component`` has no entry in ``COMPONENT_PROMPTS``;
            the message lists the supported names.
    """
    # Happy path first; anything unregistered falls through to the error.
    if component in COMPONENT_PROMPTS:
        return COMPONENT_PROMPTS[component]

    raise ValueError(f"Unsupported component: {component}. Supported: {list(COMPONENT_PROMPTS.keys())}")