"""Unit tests for run_evaluation function."""

import pytest
from unittest.mock import Mock, patch, MagicMock
from src.evaluators.run_evaluation import (
    run_evaluation, 
    _extract_solutions,
    _calculate_feasibility_score,
    _calculate_utility_score,
    _calculate_novelty_score,
    _cosine_similarity,
    _calculate_creativity_score
)
from src.data_models.task_config import TaskConfig
from src.data_models.evaluation_result import EvaluationResult
from typing import List, Dict, Any


class TestRunEvaluation:
    """Tests for the ``run_evaluation`` entry point.

    The LLM client class and the per-dimension scoring helpers are patched,
    so only the orchestration performed by ``run_evaluation`` is exercised.
    """

    def test_run_evaluation_success(self):
        """Every extracted solution yields one populated EvaluationResult."""
        task_config = TaskConfig(
            feasibility_check_points=["Check 1", "Check 2"],
            task_description="Test task",
            known_solutions=["Solution 1", "Solution 2"],
            calibration_anchors=["Anchor 1", "Anchor 2"]
        )

        mock_client = Mock()
        # NOTE(review): _extract_solutions is patched below, so this canned
        # reply is presumably never read - confirm before removing it.
        mock_client.call_llm_model.return_value = '["Canonicalized Solution A", "Canonicalized Solution B"]'

        extracted = [
            ("sol_1_abc123", "Canonicalized Solution A"),
            ("sol_2_def456", "Canonicalized Solution B")
        ]
        # Snapshot taken before the call so the expectations cannot be
        # affected by anything run_evaluation does to the list it receives.
        expected = tuple(extracted)

        # One flat ``with`` instead of a five-level pyramid; each scoring
        # helper returns the same fixed (score, text) pair for every solution.
        with patch('src.evaluators.run_evaluation.LLMAPIClient', return_value=mock_client), \
                patch('src.evaluators.run_evaluation._extract_solutions',
                      return_value=extracted), \
                patch('src.evaluators.run_evaluation._calculate_feasibility_score',
                      return_value=(0.8, "Good feasibility")), \
                patch('src.evaluators.run_evaluation._calculate_utility_score',
                      return_value=(0.9, "Good utility")), \
                patch('src.evaluators.run_evaluation._calculate_novelty_score',
                      return_value=(0.7, "Good novelty")):
            results = run_evaluation("test solution", task_config, "test_run_id", 3)

        assert len(results) == 2
        assert isinstance(results[0], EvaluationResult)

        for result, (solution_id, solution_text) in zip(results, expected):
            assert result.original_solution_id == solution_id
            assert result.individual_solution_text == solution_text
            assert result.feasibility_score == 0.8
            assert result.utility_score == 0.9
            assert result.novelty_score == 0.7
            # (0.8*0.3 + 0.9*0.3 + 0.7*0.4) = 0.79; compared with a tolerance
            # because the weighted sum is computed in floating point.
            assert result.creativity_score == pytest.approx(0.79)

    def test_run_evaluation_invalid_inputs(self):
        """Bad ``solution_text`` / ``task_config`` arguments raise ValueError."""
        task_config = TaskConfig(
            feasibility_check_points=[],
            task_description="Test task",
            known_solutions=[]
        )

        # Non-string solution_text
        with pytest.raises(ValueError, match="solution_text must be a string"):
            run_evaluation(123, task_config, "test_run_id", 3)

        # task_config that is not a TaskConfig instance
        with pytest.raises(ValueError, match="task_config must be a TaskConfig object"):
            run_evaluation("test", "invalid_config", "test_run_id", 3)

        # Empty solution_text
        with pytest.raises(ValueError, match="solution_text cannot be empty"):
            run_evaluation("", task_config, "test_run_id", 3)

    def test_run_evaluation_no_solutions_extracted(self):
        """An empty extraction result makes ``run_evaluation`` raise ValueError."""
        task_config = TaskConfig(
            feasibility_check_points=[],
            task_description="Test task",
            known_solutions=[]
        )

        mock_client = Mock()

        with patch('src.evaluators.run_evaluation.LLMAPIClient', return_value=mock_client), \
                patch('src.evaluators.run_evaluation._extract_solutions', return_value=[]):
            with pytest.raises(ValueError, match="No valid solutions could be extracted"):
                run_evaluation("test solution", task_config, "test_run_id", 3)


class TestExtractSolutions:
    """Tests for the ``_extract_solutions`` helper."""

    @staticmethod
    def _client_replying(reply):
        """Return a mock LLM client whose ``call_gemini`` always yields *reply*."""
        client = Mock()
        client.call_gemini.return_value = reply
        return client

    def test_extract_solutions_success(self):
        """A JSON list reply becomes ``(solution_id, text)`` pairs in order."""
        first_text = "Canonicalized solution 1 description in a couple of sentences"
        second_text = "Canonicalized solution 2 description in a couple of sentences"
        client = self._client_replying(
            '["Canonicalized solution 1 description in a couple of sentences", '
            '"Canonicalized solution 2 description in a couple of sentences"]'
        )

        extracted = _extract_solutions(client, "test text", 3)

        # The LLM must be queried exactly once, with the expected keyword arguments.
        client.call_gemini.assert_called_once()
        llm_kwargs = client.call_gemini.call_args.kwargs
        assert llm_kwargs['model_name'] == "gemini-2.5-pro"
        assert llm_kwargs['temperature'] == 0.0
        # The prompt is expected to carry the canonicalization instructions.
        lowered_prompt = llm_kwargs['prompt'].lower()
        assert "canonicalize" in lowered_prompt
        assert "descriptive paragraph" in lowered_prompt

        assert len(extracted) == 2
        (first_id, first_body), (second_id, second_body) = extracted[0], extracted[1]
        assert first_id.startswith("sol_1_")
        assert first_body == first_text
        assert second_id.startswith("sol_2_")
        assert second_body == second_text

    def test_extract_solutions_invalid_json(self):
        """A reply that is not JSON at all raises with a parse-failure message."""
        client = self._client_replying("invalid json")

        with pytest.raises(Exception, match="Failed to parse LLM response as JSON"):
            _extract_solutions(client, "test text", 3)

    def test_extract_solutions_non_list_response(self):
        """A JSON string (valid JSON, but not a list) is rejected."""
        client = self._client_replying('"not a list"')

        with pytest.raises(Exception, match="LLM response is not a list"):
            _extract_solutions(client, "test text", 3)

    def test_extract_solutions_with_num_final_solutions_limit(self):
        """Only the first ``num_final_solutions`` entries of the reply are kept."""
        client = self._client_replying(
            '["Canonicalized solution 1", "Canonicalized solution 2", '
            '"Canonicalized solution 3", "Canonicalized solution 4", '
            '"Canonicalized solution 5"]'
        )

        # Limit of 2
        limited_to_two = _extract_solutions(client, "test text", 2)

        assert len(limited_to_two) == 2
        assert limited_to_two[0][1] == "Canonicalized solution 1"
        assert limited_to_two[1][1] == "Canonicalized solution 2"

        # Limit of 3 (same client, same canned reply)
        limited_to_three = _extract_solutions(client, "test text", 3)

        assert len(limited_to_three) == 3
        assert limited_to_three[0][1] == "Canonicalized solution 1"
        assert limited_to_three[1][1] == "Canonicalized solution 2"
        assert limited_to_three[2][1] == "Canonicalized solution 3"

    def test_extract_solutions_malformed_json(self):
        """A JSON object (valid JSON, but not a list) is rejected."""
        client = self._client_replying('{"not": "a list"}')

        with pytest.raises(Exception, match="LLM response is not a list"):
            _extract_solutions(client, "test text", 3)


class TestScoringFunctions:
    """Test cases for scoring functions."""
    
    def test_calculate_feasibility_score_success(self):
        """A JSON reply with score and reasoning is returned as a pair."""
        client = Mock()
        client.call_gemini.return_value = '{"score": 0.8, "reasoning": "Good feasibility"}'
        interactions = []

        outcome = _calculate_feasibility_score(
            client, "test solution", ["Check 1", "Check 2"], interactions
        )

        feasibility, explanation = outcome
        assert feasibility == 0.8
        assert explanation == "Good feasibility"
    
    def test_calculate_feasibility_score_fallback(self):
        """A failing LLM call yields the default score instead of raising."""
        client = Mock()
        client.call_gemini.side_effect = Exception("API error")
        interactions = []

        outcome = _calculate_feasibility_score(
            client, "test solution", ["Check 1"], interactions
        )

        feasibility, explanation = outcome
        # 0.5 is the fallback score expected when the LLM call fails.
        assert feasibility == 0.5
        assert "LLM call failed" in explanation
    
    def test_calculate_utility_score_success(self):
        """Utility scoring with an optimal-solution reference in the config."""
        optimal_description = "Optimal solution description for reference"
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            optimal_solutions_description=[optimal_description]
        )

        # Canned LLM reply for the utility evaluation.
        client = Mock()
        client.call_gemini.return_value = '{"score": 8, "reasoning": "Good solution"}'

        interactions = []
        utility, explanation = _calculate_utility_score(
            client, "test solution", config, interactions
        )

        # The raw score of 8 is expected back on a 0-1 scale (8/10 = 0.8).
        assert utility == 0.8
        assert explanation == "Good solution"
        assert len(interactions) == 1

        # The recorded prompt must contain the optimal-solution section ...
        prompt_text = interactions[0]["prompt"]
        assert "**Optimal Solution Reference (Score 10 - Maximum Utility):**" in prompt_text
        assert "Optimal solution description for reference" in prompt_text
        # ... and must not contain a calibration-anchor section.
        assert "**Calibration Anchors (Examples of pre-scored solutions):**" not in prompt_text
    
    def test_calculate_utility_score_without_optimal_solutions(self):
        """Utility scoring when the config lists no optimal solutions."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            optimal_solutions_description=[]
        )

        # Canned LLM reply for the utility evaluation.
        client = Mock()
        client.call_gemini.return_value = '{"score": 6, "reasoning": "Decent solution"}'

        interactions = []
        utility, explanation = _calculate_utility_score(
            client, "test solution", config, interactions
        )

        # The raw score of 6 is expected back on a 0-1 scale (6/10 = 0.6).
        assert utility == 0.6
        assert explanation == "Decent solution"
        assert len(interactions) == 1

        # With no optimal solutions, the reference section must be absent.
        prompt_text = interactions[0]["prompt"]
        assert "**Optimal Solution Reference (Score 10 - Maximum Utility):**" not in prompt_text
    
    def test_calculate_novelty_score_success(self):
        """Novelty scoring returns a score in [0, 1] and the extracted theme."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"]
        )

        client = Mock()
        # Successive call_gemini replies (theme extraction).
        client.call_gemini.side_effect = ["current theme", "known theme 1", "known theme 2"]
        # One embedding vector per theme handed to embed_content.
        current_vec = [0.1, 0.2, 0.3]
        known_vec_1 = [0.4, 0.5, 0.6]
        known_vec_2 = [0.7, 0.8, 0.9]
        client.embed_content.return_value = [current_vec, known_vec_1, known_vec_2]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        # Only the valid range is checked, not the exact value.
        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"
    
    def test_calculate_novelty_score_fallback(self):
        """An embedding failure yields the default score and an empty theme."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1"]
        )

        client = Mock()
        client.embed_content.side_effect = Exception("Embedding error")

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        # 0.5 is the fallback score expected on failure.
        assert novelty == 0.5
        assert extracted_theme == ""
    
    def test_calculate_novelty_score_with_optimal_solutions_concept(self):
        """Known and optimal concepts are embedded in two separate calls."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            known_solutions_concept=["concept1", "concept2"],
            optimal_solutions_concept=["optimal1", "optimal2"]
        )

        client = Mock()
        # Single theme-extraction reply.
        client.call_gemini.side_effect = ["current theme"]

        # First embed_content reply: current theme followed by the known concepts.
        known_batch = [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6],
            [0.7, 0.8, 0.9]
        ]
        # Second embed_content reply: the optimal concepts.
        optimal_batch = [
            [0.2, 0.3, 0.4],
            [0.5, 0.6, 0.7]
        ]
        client.embed_content.side_effect = [known_batch, optimal_batch]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"

        # Exactly two embedding calls are expected (initial + optimal).
        assert client.embed_content.call_count == 2
        embed_calls = client.embed_content.call_args_list

        # Second positional argument of call #1: current theme + known concepts.
        themes_first = embed_calls[0].args[1]
        assert "current theme" in themes_first
        assert "concept1" in themes_first
        assert "concept2" in themes_first

        # Second positional argument of call #2: the optimal concepts.
        themes_second = embed_calls[1].args[1]
        assert "optimal1" in themes_second
        assert "optimal2" in themes_second
    
    def test_calculate_novelty_score_fallback_to_known_solutions_concept(self):
        """With no optimal concepts, only ``known_solutions_concept`` is embedded."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            known_solutions_concept=["concept1", "concept2"],
            optimal_solutions_concept=[]  # deliberately empty
        )

        client = Mock()
        # Single theme-extraction reply.
        client.call_gemini.side_effect = ["current theme"]
        # Embeddings: current theme followed by the two known concepts.
        client.embed_content.return_value = [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6],
            [0.7, 0.8, 0.9]
        ]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"

        # One embedding call, fed the concept strings rather than the raw
        # known_solutions entries.
        client.embed_content.assert_called_once()
        embedded_themes = client.embed_content.call_args.args[1]
        assert "concept1" in embedded_themes
        assert "concept2" in embedded_themes
        assert "known1" not in embedded_themes
        assert "known2" not in embedded_themes
    
    def test_calculate_novelty_score_fallback_to_known_solutions(self):
        """With no known concepts, the raw ``known_solutions`` are embedded."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            known_solutions_concept=[]  # deliberately empty
        )

        client = Mock()
        # Successive call_gemini replies (theme extraction).
        client.call_gemini.side_effect = ["current theme", "known theme 1", "known theme 2"]
        # Embeddings: current theme followed by the two known solutions.
        client.embed_content.return_value = [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6],
            [0.7, 0.8, 0.9]
        ]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"

        # One embedding call whose theme list contains the known solutions.
        client.embed_content.assert_called_once()
        embedded_themes = client.embed_content.call_args.args[1]
        assert "known1" in embedded_themes
        assert "known2" in embedded_themes
    
    def test_calculate_novelty_score_both_empty(self):
        """With no known solutions or concepts, only the current theme is embedded."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=[],
            known_solutions_concept=[]
        )

        client = Mock()
        # Single theme-extraction reply.
        client.call_gemini.side_effect = ["current theme"]
        # A single embedding vector, for the current theme.
        client.embed_content.return_value = [[0.1, 0.2, 0.3]]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        # Scoring is still expected to produce a value in range.
        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"

        # The single embedding call receives a one-element theme list.
        client.embed_content.assert_called_once()
        embedded_themes = client.embed_content.call_args.args[1]
        assert len(embedded_themes) == 1
        assert embedded_themes[0] == "current theme"
    
    def test_calculate_novelty_score_new_normalization_logic(self):
        """Normalization path: known and optimal concepts with distinct unit vectors."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            known_solutions_concept=["known_concept1", "known_concept2"],
            optimal_solutions_concept=["optimal_concept1", "optimal_concept2"]
        )

        client = Mock()
        # Single theme-extraction reply.
        client.call_gemini.side_effect = ["current theme"]

        # First embed_content reply: current theme followed by the known concepts.
        known_batch = [
            [1.0, 0.0, 0.0],
            [0.8, 0.6, 0.0],
            [0.6, 0.8, 0.0]
        ]
        # Second embed_content reply: the optimal concepts.
        optimal_batch = [
            [0.0, 1.0, 0.0],
            [0.0, 0.0, 1.0]
        ]
        client.embed_content.side_effect = [known_batch, optimal_batch]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"

        # Exactly two embedding calls are expected (initial + optimal).
        assert client.embed_content.call_count == 2
        embed_calls = client.embed_content.call_args_list

        # Second positional argument of call #1: current theme + known concepts.
        themes_first = embed_calls[0].args[1]
        assert "current theme" in themes_first
        assert "known_concept1" in themes_first
        assert "known_concept2" in themes_first

        # Second positional argument of call #2: the optimal concepts.
        themes_second = embed_calls[1].args[1]
        assert "optimal_concept1" in themes_second
        assert "optimal_concept2" in themes_second
    
    def test_calculate_novelty_score_division_by_zero_handling(self):
        """Identical embeddings everywhere must give a score of exactly 0.0."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            optimal_solutions_concept=["optimal1", "optimal2"]
        )

        client = Mock()
        # Single theme-extraction reply.
        client.call_gemini.side_effect = ["current theme"]

        # Every embed_content call returns the same three identical vectors,
        # so every pairwise distance is zero - the case that would divide by
        # zero during normalization.
        # NOTE(review): the config sets known_solutions and
        # optimal_solutions_concept but no known_solutions_concept; which
        # texts these vectors stand for depends on the implementation - confirm.
        same_vector = [1.0, 0.0, 0.0]
        client.embed_content.return_value = [
            list(same_vector),
            list(same_vector),
            list(same_vector)
        ]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        # A zero maximum distance is expected to map to a score of 0.0.
        assert novelty == 0.0
        assert extracted_theme == "current theme"
    
    def test_calculate_novelty_score_without_optimal_solutions(self):
        """An empty ``optimal_solutions_concept`` means one embedding call only."""
        config = TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="test task",
            known_solutions=["known1", "known2"],
            optimal_solutions_concept=[]  # deliberately empty
        )

        client = Mock()
        # Single theme-extraction reply.
        client.call_gemini.side_effect = ["current theme"]
        # Embeddings: current theme followed by the two known solutions.
        client.embed_content.return_value = [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6],
            [0.7, 0.8, 0.9]
        ]

        interactions = []
        novelty, extracted_theme = _calculate_novelty_score(
            client, "test solution", config, interactions
        )

        # Plain novelty scoring against known_solutions is expected here.
        assert 0.0 <= novelty <= 1.0
        assert extracted_theme == "current theme"

        # No optimal concepts to embed, so there is no second call.
        assert client.embed_content.call_count == 1
    
    def test_cosine_similarity_success(self):
        """Identical vectors have a cosine similarity of 1."""
        vec1 = [1.0, 0.0, 0.0]
        vec2 = [1.0, 0.0, 0.0]

        similarity = _cosine_similarity(vec1, vec2)

        # Compared with a tolerance: the result comes from floating-point
        # dot products and norms, so exact equality would be fragile.
        assert similarity == pytest.approx(1.0)
    
    def test_cosine_similarity_orthogonal(self):
        """Orthogonal vectors have a cosine similarity of 0."""
        vec1 = [1.0, 0.0, 0.0]
        vec2 = [0.0, 1.0, 0.0]

        similarity = _cosine_similarity(vec1, vec2)

        # Compared with a tolerance rather than exact float equality;
        # pytest.approx applies a small absolute tolerance around 0.0.
        assert similarity == pytest.approx(0.0)
    
    def test_calculate_creativity_score(self):
        """The combined score matches the expected weighted average."""
        # Weighted average: 0.8*0.3 + 0.9*0.3 + 0.7*0.4 = 0.79
        target = 0.79

        combined = _calculate_creativity_score(0.8, 0.9, 0.7)

        deviation = abs(combined - target)
        assert deviation < 0.001


class TestRunEvaluationPreExtractedSolutions:
    """Test cases for run_evaluation with pre-extracted solutions."""
    
    def test_run_evaluation_with_pre_extracted_solutions(self):
        """Pre-extracted solutions are scored directly, bypassing LLM extraction."""
        task_config = TaskConfig(
            feasibility_check_points=["Check 1", "Check 2"],
            task_description="Test task",
            known_solutions=["Solution 1", "Solution 2"],
            calibration_anchors=["Anchor 1", "Anchor 2"]
        )

        mock_client = Mock()

        # Solutions supplied by the caller, each with its novelty theme.
        pre_extracted_solutions = [
            {
                'solution_text': 'Pre-extracted solution 1',
                'original_solution_id': 'sol_1_pre',
                'novelty_theme': 'Pre-existing theme 1'
            },
            {
                'solution_text': 'Pre-extracted solution 2',
                'original_solution_id': 'sol_2_pre',
                'novelty_theme': 'Pre-existing theme 2'
            }
        ]

        # _extract_solutions is patched so that the bypass can be asserted
        # explicitly rather than inferred from the test not crashing.
        with patch('src.evaluators.run_evaluation.LLMAPIClient', return_value=mock_client), \
                patch('src.evaluators.run_evaluation._extract_solutions') as mock_extract, \
                patch('src.evaluators.run_evaluation._calculate_feasibility_score',
                      return_value=(0.8, "Good feasibility")), \
                patch('src.evaluators.run_evaluation._calculate_utility_score',
                      return_value=(0.9, "Good utility")), \
                patch('src.evaluators.run_evaluation._calculate_novelty_score',
                      return_value=(0.7, "Pre-existing theme 1")), \
                patch('src.evaluators.run_evaluation._save_evaluation_intermediate_logs',
                      return_value="test_log.json"):
            results = run_evaluation(
                solution_text="Original solution text",
                task_config=task_config,
                run_id="test_run_id",
                num_final_solutions=3,
                pre_extracted_solutions_with_themes=pre_extracted_solutions
            )

        # One result per supplied solution, in the supplied order.
        assert len(results) == 2
        assert results[0].individual_solution_text == 'Pre-extracted solution 1'
        assert results[0].original_solution_id == 'sol_1_pre'
        assert results[1].individual_solution_text == 'Pre-extracted solution 2'
        assert results[1].original_solution_id == 'sol_2_pre'

        # LLM-based extraction must not have run at all.
        mock_extract.assert_not_called()
    
    def test_run_evaluation_with_pre_extracted_solutions_no_theme(self):
        """A pre-extracted solution lacking ``novelty_theme`` is still evaluated."""
        config = TaskConfig(
            feasibility_check_points=["Check 1", "Check 2"],
            task_description="Test task",
            known_solutions=["Solution 1", "Solution 2"],
            calibration_anchors=["Anchor 1", "Anchor 2"]
        )

        client = Mock()

        # Single supplied solution; the 'novelty_theme' key is left out on purpose.
        supplied = [
            {
                'solution_text': 'Pre-extracted solution 1',
                'original_solution_id': 'sol_1_pre'
            }
        ]

        # All collaborators are patched in one flat ``with`` statement.
        with patch('src.evaluators.run_evaluation.LLMAPIClient', return_value=client), \
                patch('src.evaluators.run_evaluation._calculate_feasibility_score',
                      return_value=(0.8, "Good feasibility")), \
                patch('src.evaluators.run_evaluation._calculate_utility_score',
                      return_value=(0.9, "Good utility")), \
                patch('src.evaluators.run_evaluation._calculate_novelty_score',
                      return_value=(0.7, "LLM extracted theme")), \
                patch('src.evaluators.run_evaluation._save_evaluation_intermediate_logs',
                      return_value="test_log.json"):
            evaluated = run_evaluation(
                solution_text="Original solution text",
                task_config=config,
                run_id="test_run_id",
                num_final_solutions=3,
                pre_extracted_solutions_with_themes=supplied
            )

        assert len(evaluated) == 1
        only_result = evaluated[0]
        assert only_result.individual_solution_text == 'Pre-extracted solution 1'
        assert only_result.original_solution_id == 'sol_1_pre'
    
    def test_run_evaluation_with_empty_pre_extracted_solutions(self):
        """Test that an empty pre-extracted list falls back to LLM extraction.

        ``run_evaluation`` is called with ``pre_extracted_solutions_with_themes=[]``
        while ``_extract_solutions``, the scoring helpers and the log writer
        are patched. The test expects exactly one call to ``_extract_solutions``
        and a single result carrying the text that the patched extractor
        returned.
        """
        config = TaskConfig(
            feasibility_check_points=["Check 1", "Check 2"],
            task_description="Test task",
            known_solutions=["Solution 1", "Solution 2"],
            calibration_anchors=["Anchor 1", "Anchor 2"]
        )

        # What the patched extractor hands back: one (id, text) pair.
        extracted_pairs = [("sol_1_abc123", "LLM extracted solution")]

        with patch('src.evaluators.run_evaluation.LLMAPIClient',
                   return_value=Mock()), \
                patch('src.evaluators.run_evaluation._extract_solutions',
                      return_value=extracted_pairs) as extract_spy, \
                patch('src.evaluators.run_evaluation._calculate_feasibility_score',
                      return_value=(0.8, "Good feasibility")), \
                patch('src.evaluators.run_evaluation._calculate_utility_score',
                      return_value=(0.9, "Good utility")), \
                patch('src.evaluators.run_evaluation._calculate_novelty_score',
                      return_value=(0.7, "LLM extracted theme")), \
                patch('src.evaluators.run_evaluation._save_evaluation_intermediate_logs',
                      return_value="test_log.json"):
            outcome = run_evaluation(
                solution_text="Original solution text",
                task_config=config,
                run_id="test_run_id",
                num_final_solutions=3,
                pre_extracted_solutions_with_themes=[]
            )

            # Fallback path: the extractor must have been used exactly once.
            extract_spy.assert_called_once()

            # The single extracted solution shows up in the results.
            assert len(outcome) == 1
            assert outcome[0].individual_solution_text == 'LLM extracted solution'


class TestCalculateNoveltyScorePreExistingTheme:
    """Test cases for _calculate_novelty_score with pre-existing themes."""

    @staticmethod
    def _build_task_config():
        """Return the TaskConfig used by every test in this class."""
        return TaskConfig(
            feasibility_check_points=["Check 1"],
            task_description="Test task",
            known_solutions=["Solution 1", "Solution 2"],
            known_solutions_concept=["Concept 1", "Concept 2"],
            calibration_anchors=["Anchor 1"]
        )

    @staticmethod
    def _build_llm_client():
        """Return a Mock client whose embed_content yields three fixed vectors."""
        client = Mock()
        client.embed_content.return_value = [
            [0.1, 0.2, 0.3],  # Current theme embedding
            [0.4, 0.5, 0.6],  # Known concept 1 embedding
            [0.7, 0.8, 0.9]   # Known concept 2 embedding
        ]
        return client

    def test_calculate_novelty_score_with_pre_existing_theme(self):
        """Test _calculate_novelty_score using pre-existing theme.

        Passes a non-empty ``pre_existing_novelty_theme`` and checks that the
        same theme is returned, that one "skipped" audit entry is recorded,
        and that ``call_gemini`` on the mock client was never invoked.
        """
        client = self._build_llm_client()
        interactions = []

        _score, returned_theme = _calculate_novelty_score(
            llm_client=client,
            solution_text="Test solution",
            task_config=self._build_task_config(),
            evaluation_llm_interactions=interactions,
            pre_existing_novelty_theme="Pre-existing theme"
        )

        # The supplied theme is handed back untouched.
        assert returned_theme == "Pre-existing theme"

        # Exactly one audit record, flagged as a skipped extraction.
        assert len(interactions) == 1
        entry = interactions[0]
        assert entry["step_name"] == "Evaluation: Novelty Key Phrase Extraction (Skipped - Pre-existing)"
        assert entry["llm_model_name"] == "N/A"
        assert entry["parsed_output"] == "Pre-existing novelty theme used"

        # No theme-extraction call went through call_gemini.
        client.call_gemini.assert_not_called()

    def test_calculate_novelty_score_with_empty_pre_existing_theme(self):
        """Test _calculate_novelty_score with empty pre-existing theme falls back to LLM.

        Passes ``pre_existing_novelty_theme=""`` and checks that ``call_gemini``
        is invoked once, that its return value becomes the theme, and that the
        single audit entry describes the LLM call.
        """
        client = self._build_llm_client()
        client.call_gemini.return_value = "LLM extracted theme"
        interactions = []

        _score, returned_theme = _calculate_novelty_score(
            llm_client=client,
            solution_text="Test solution",
            task_config=self._build_task_config(),
            evaluation_llm_interactions=interactions,
            pre_existing_novelty_theme=""
        )

        # The empty theme triggers one extraction call on the client.
        client.call_gemini.assert_called_once()

        # The value returned by that call is reported as the theme.
        assert returned_theme == "LLM extracted theme"

        # One audit record describing the LLM interaction.
        assert len(interactions) == 1
        entry = interactions[0]
        assert "prompt" in entry
        assert "raw_response" in entry
        assert entry["model"] == "gemini-2.5-pro"
