"""
Comprehensive benchmark framework for evaluating an LLM's ability to generate personalized documents from long conversations.

This module provides benchmarking for:
1. User profile inference from messages
2. User intent capture into structured schema
3. Document generation with proper source citation
4. Document quality assessment using LLM-as-a-judge

Author: GitHub Copilot
Date: September 14, 2025
"""

import json
import logging
import os
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
import re
from difflib import SequenceMatcher
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

# Configure logging.
# NOTE: basicConfig runs at import time, so importing this module sets up root
# logging (INFO level, "time - level - message" lines) unless the root logger
# already has handlers. `logger` is the module-level logger used throughout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class UserProfile:
    """Represents an inferred user profile from messages.

    Built by DocumentGenerationBenchmark.infer_user_profile, which constrains
    the categorical fields (expertise_level, communication_style, tone) to
    fixed candidate lists and substitutes a default for anything else.
    """
    user_id: str  # author identifier the profile was inferred for
    # Free-text professional role. infer_user_profile uses "Unknown" when the
    # user has no messages and "JSON_PARSING_FAILED" / "PARSING_FAILED" as
    # failure sentinels.
    role: str
    expertise_level: str  # novice, intermediate, expert
    communication_style: str  # concise, elaborative, standard, bullet-pointed
    tone: str  # formal, professional, technical, conversational, direct, persuasive, empathetic, accessible
    domain_knowledge: List[str]  # free-text technical/business domains
    project_involvement: List[str]  # free-text inferred responsibilities
    confidence_score: float  # 0-1 score for inference confidence

@dataclass
class IntentSchema:
    """Represents structured intent captured from user query.

    Several field names are kept for backward compatibility while
    DocumentGenerationBenchmark.capture_user_intent fills them from newer
    response keys; the per-field comments below describe what that method
    actually stores.
    """
    document_type: str  # status_report, email, faq
    target_audience: str  # executives, team_members, stakeholders, management, clients, board
    temporal_scope: str  # last_week, past_month, quarter, project_start, ongoing, upcoming, last_two_weeks
    detail_level: str  # summary, detailed, comprehensive, high_level
    # Free-text formatting instruction (e.g. bullet_points, paragraphs,
    # tables_charts, mixed); filled from the "format_instruction" response key.
    format_requirements: str
    tone_preference: str  # formal, technical, conversational, executive, urgent, celebratory, accessible
    # Main sections/topics to cover; filled from the "document_structure" key.
    specific_topics: List[str]
    # Repurposed by capture_user_intent to hold the requested "visual_elements"
    # (charts, tables, ...), not source constraints as the name suggests.
    source_constraints: List[str]

@dataclass
class DocumentCitation:
    """Represents a citation in generated document.

    Ties a piece of cited content back to the conversation message it is
    attributed to.
    """
    message_id: str  # identifier of the cited message
    author: str  # author of the cited message
    timestamp: str  # timestamp of the cited message, kept as a string
    cited_content: str  # the text being cited
    context_relevance: float  # 0-1 score

@dataclass
class GeneratedDocument:
    """Represents the final generated document.

    Bundles the document text with its citations and free-form metadata.
    """
    content: str  # document body text
    citations: List[DocumentCitation]  # citations attached to the document
    metadata: Dict[str, Any]  # free-form generation metadata; schema not fixed here
    generation_timestamp: str  # when the document was generated, kept as a string

@dataclass
class IntentEvaluationResult:
    """Enhanced intent evaluation result with detailed metrics.

    Holds aggregate scores plus per-field precision/recall/F1 breakdowns.
    """
    overall_accuracy: float  # aggregate accuracy across intent fields
    macro_f1_score: float  # macro-averaged F1
    # The three dicts below are presumably keyed by intent field name --
    # confirm against the code that builds this result.
    per_field_precision: Dict[str, float]
    per_field_recall: Dict[str, float]
    per_field_f1: Dict[str, float]
    field_count: int  # number of fields evaluated

@dataclass
class BenchmarkResult:
    """Complete benchmark evaluation result.

    One record per evaluated query, combining the score of every benchmark
    stage with an overall score and a free-form detail payload.
    """
    query_id: str  # identifier of the evaluated query
    user_profile_accuracy: float  # score for the user-profile inference stage
    intent_capture_accuracy: float  # Overall accuracy for backwards compatibility
    intent_evaluation: IntentEvaluationResult  # Enhanced intent metrics
    context_retrieval_accuracy: float  # score for the context-retrieval stage
    citation_accuracy: float  # score for citation correctness
    document_quality_score: float  # score for generated-document quality
    overall_score: float  # combined score; the weighting is defined by the code that builds this result
    detailed_evaluation: Dict[str, Any]  # free-form per-stage details; schema not fixed here

@dataclass
class ContextRetrievalResult:
    """Context retrieval evaluation result.

    Pairs the message IDs selected for a query with the ground-truth IDs and
    the resulting precision/recall/F1.
    """
    query_id: str  # identifier of the evaluated query
    retrieved_message_ids: List[str]  # message IDs selected by retrieval
    ground_truth_message_ids: List[str]  # message IDs expected to be retrieved
    precision: float
    recall: float
    f1_score: float


# Utility functions for improved evaluation
def strict_categorical_match(predicted: str, expected: str) -> float:
    """
    Score two categorical labels with an all-or-nothing comparison.

    The labels are compared after lower-casing and trimming surrounding
    whitespace. A missing or empty label on either side never matches.

    Returns:
        1.0 when the normalized labels are identical, otherwise 0.0.
    """
    if predicted and expected:
        labels_agree = predicted.lower().strip() == expected.lower().strip()
        return float(labels_agree)

    # At least one side is None/empty: nothing to compare.
    return 0.0


def semantic_similarity_score(text1: str, text2: str) -> float:
    """
    Score how similar two short strings are, on a 0-1 scale.

    Despite the name, the measure is purely lexical: both inputs are
    lower-cased and trimmed, then compared by character-level sequence
    similarity and by word-set overlap.

    Returns:
        0.0 if either input is None/empty, 1.0 if the normalized strings are
        identical, otherwise 0.6 * SequenceMatcher ratio + 0.4 * Jaccard
        overlap of the word sets.
    """
    if not (text1 and text2):
        return 0.0

    left = text1.lower().strip()
    right = text2.lower().strip()
    if left == right:
        return 1.0

    # Character-level similarity.
    char_ratio = SequenceMatcher(None, left, right).ratio()

    # Word-level Jaccard overlap (useful for roles/titles that share words).
    left_words = set(re.findall(r'\b\w+\b', left))
    right_words = set(re.findall(r'\b\w+\b', right))
    word_overlap = 0.0
    if left_words and right_words:
        word_overlap = len(left_words & right_words) / len(left_words | right_words)

    return 0.6 * char_ratio + 0.4 * word_overlap


def calculate_fuzzy_match_score(predicted_items: List[str], expected_items: List[str], threshold: float = 0.6) -> float:
    """
    Measure how many expected items are covered by the predicted items.

    An expected item counts as covered when its best
    semantic_similarity_score against any predicted item reaches `threshold`.

    Returns:
        The fraction of expected items that are covered. When nothing is
        expected, returns 1.0 if nothing was predicted either and 0.5 if
        something was predicted. Returns 0.0 when items were expected but
        none were predicted.
    """
    if not expected_items:
        return 0.5 if predicted_items else 1.0
    if not predicted_items:
        return 0.0

    covered = sum(
        1
        for target in expected_items
        if max(semantic_similarity_score(target, candidate) for candidate in predicted_items) >= threshold
    )
    return covered / len(expected_items)


# Common English function words ignored by extract_keywords_from_text.
# Built once at import time instead of on every call.
_KEYWORD_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
    'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
})


def extract_keywords_from_text(text: str) -> List[str]:
    """Extract meaningful keywords from text for comparison.

    The text is lower-cased and split into word tokens; stop words and tokens
    of two characters or fewer are dropped. Order and duplicates are kept.

    Args:
        text: Input text. None or an empty string yields an empty list,
            matching how the other comparison helpers treat missing text.

    Returns:
        The remaining lower-cased tokens, in order of appearance.
    """
    if not text:
        return []

    words = re.findall(r'\b\w+\b', text.lower())
    return [word for word in words if len(word) > 2 and word not in _KEYWORD_STOP_WORDS]


class DocumentGenerationBenchmark:
    """Main benchmark class for document generation evaluation"""
    
    def __init__(self, azure_endpoint: str = None, model_name: str = "gpt-4", api_version: str = "2024-05-01-preview"):
        """Set up the benchmark and its Azure OpenAI client.

        Args:
            azure_endpoint: Azure OpenAI endpoint URL. When omitted (or empty),
                the ENDPOINT_URL environment variable is used, and failing that
                a built-in default endpoint.
            model_name: Deployment/model name sent with every chat request.
            api_version: Azure OpenAI API version for the client.

        The client authenticates through a bearer-token provider built from
        DefaultAzureCredential for the Cognitive Services scope.
        """
        if azure_endpoint:
            self.endpoint = azure_endpoint
        else:
            self.endpoint = os.getenv("ENDPOINT_URL", "https://winsightsboteus2.openai.azure.com/")
        self.model_name = model_name
        self.api_version = api_version

        # Token-based auth: no API key is read or stored here.
        credential = DefaultAzureCredential()
        aad_token_provider = get_bearer_token_provider(
            credential,
            "https://cognitiveservices.azure.com/.default",
        )
        self.client = AzureOpenAI(
            azure_endpoint=self.endpoint,
            azure_ad_token_provider=aad_token_provider,
            api_version=self.api_version,
        )

        logger.info(f"Initialized DocumentGenerationBenchmark with model: {model_name}")
    
    def load_conversation_data(self, file_path: str) -> List[Dict]:
        """Read conversation messages from a JSON file.

        Two layouts are accepted:
        * a JSON list, which is returned unchanged;
        * a JSON object mapping a project phase to its messages (the synthetic
          domain channels format). Messages that have a "content" key are
          copied, tagged with their "project_phase", and returned sorted by
          "timestamp" (missing timestamps sort as the empty string).

        Raises:
            ValueError: if the top-level JSON value is neither a list nor an
                object.
            Exception: any read/parse error is logged and re-raised unchanged.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            if isinstance(payload, list):
                return payload
            if not isinstance(payload, dict):
                raise ValueError(f"Unexpected data format in {file_path}")

            # Flatten {phase: [messages]} into a single tagged list.
            tagged = []
            for phase, entries in payload.items():
                for entry in entries:
                    if "content" not in entry:
                        continue
                    record = dict(entry)
                    record["project_phase"] = phase
                    tagged.append(record)

            tagged.sort(key=lambda record: record.get("timestamp", ""))
            return tagged

        except Exception as e:
            logger.error(f"Error loading conversation data from {file_path}: {e}")
            raise
    
    def load_user_queries(self, file_path: str) -> List[Dict]:
        """Read the generated user queries from a JSON file.

        Returns the parsed JSON value as-is. Any read or parse error is logged
        and re-raised unchanged.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                queries = json.load(handle)
        except Exception as e:
            logger.error(f"Error loading user queries from {file_path}: {e}")
            raise
        return queries
    
    def infer_user_profile(self, messages: List[Dict], user_id: str) -> UserProfile:
        """
        Benchmark 1: Infer user profile from messages only

        Sends up to ten of the user's messages to the chat model, asks for a
        JSON profile, and constrains the categorical fields to the predefined
        candidate lists.

        Args:
            messages: Conversation messages; only those whose "author" equals
                `user_id` are considered.
            user_id: Author identifier to build the profile for.

        Returns:
            The inferred UserProfile. Special cases, all with
            confidence_score 0.0:
            * no messages from the user -> role "Unknown" with neutral defaults;
            * the model's reply is not valid JSON -> role "JSON_PARSING_FAILED";
            * any other failure (request error, unexpected response shape)
              -> role "PARSING_FAILED".
            This method does not raise for model/parsing failures.
        """
        # Filter messages by user
        user_messages = [msg for msg in messages if msg.get("author") == user_id]

        if not user_messages:
            logger.warning(f"No messages found for user {user_id}")
            return UserProfile(
                user_id=user_id,
                role="Unknown",
                expertise_level="intermediate",
                communication_style="standard",
                tone="professional",
                domain_knowledge=[],
                project_involvement=[],
                confidence_score=0.0
            )

        # Prepare message content for analysis.
        # NOTE(review): this takes the FIRST ten messages in the order given.
        # The original comment called them "recent"; if `messages` is sorted
        # oldest-first these are the oldest ones -- confirm the intent.
        message_content = "\n".join(
            f"Message {i+1}: {msg.get('content', '')}"
            for i, msg in enumerate(user_messages[:10])
        )

        profile_prompt = f"""
        Analyze the following messages from user '{user_id}' and infer their professional profile.
        
        Messages:
        {message_content}
        
        Based on these messages, please infer their profile using ONLY the following predefined options:
        
        1. Professional role: Choose the most fitting role from common workplace positions (e.g., Product Manager, Data Analyst, IT Systems Lead, Software Engineer, Project Manager, Business Analyst, etc.)
        
        2. Expertise level: Choose from [novice, intermediate, expert]
        
        3. Communication style: Choose from [concise, elaborative, standard, bullet-pointed]
        
        4. Tone: Choose from [formal, professional, technical, conversational, direct, persuasive, empathetic, accessible]
        
        5. Domain knowledge areas: List relevant technical/business domains
        
        6. Project involvement/responsibilities: List inferred responsibilities 
        
        7. Confidence score (0-1) for your inference
        
        IMPORTANT: You must select exact values from the predefined options for expertise_level, communication_style, and tone. Do not use synonyms or variations.
        
        Respond in JSON format:
        {{
            "role": "...",
            "expertise_level": "novice|intermediate|expert",
            "communication_style": "concise|elaborative|standard|bullet-pointed",
            "tone": "formal|professional|technical|conversational|direct|persuasive|empathetic|accessible",
            "domain_knowledge": ["...", "..."],
            "project_involvement": ["...", "..."],
            "confidence_score": 0.85
        }}
        """

        # Build the request once; only the sampling/token-limit arguments
        # differ between the two model families.
        request_kwargs = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": "You are an expert at analyzing communication patterns to infer professional profiles. Respond only with valid JSON."},
                {"role": "user", "content": profile_prompt},
            ],
        }
        if self.model_name in ("gpt-4.1", "gpt-5-chat", "gpt-4o"):
            request_kwargs["temperature"] = 0.1
            request_kwargs["max_tokens"] = 10000
        else:
            request_kwargs["max_completion_tokens"] = 10000

        def pick_candidate(value, candidates, default):
            """Return `value` if it is an allowed candidate, else `default`.

            Strings are trimmed and lower-cased first so that "Expert" or
            " expert " is not discarded; strict_categorical_match compares
            labels case-insensitively as well.
            """
            if isinstance(value, str):
                value = value.strip().lower()
            return value if value in candidates else default

        # Tracked outside the try block so the handlers can log what was
        # received without touching `response` again (re-indexing it inside a
        # handler could raise a second exception and escape this method).
        raw_response = 'No response received'
        response_content = 'No content'

        try:
            response = self.client.chat.completions.create(**request_kwargs)

            raw_response = response.choices[0].message.content
            response_content = raw_response.strip()
            logger.debug(f"Raw profile response: {response_content}")

            # Try to extract JSON from response if it's wrapped in markdown or other text
            fence = "```json" if "```json" in response_content else ("```" if "```" in response_content else None)
            if fence is not None:
                start = response_content.find(fence) + len(fence)
                end = response_content.find("```", start)
                if end != -1:
                    response_content = response_content[start:end].strip()

            profile_data = json.loads(response_content)

            # Validate and constrain values to match user_query_generation.py candidates
            valid_expertise_levels = ["novice", "intermediate", "expert"]
            valid_communication_styles = ["concise", "elaborative", "standard", "bullet-pointed"]
            valid_tones = ["formal", "professional", "technical", "conversational", "direct", "persuasive", "empathetic", "accessible"]

            # Ensure values are from valid candidates, with fallbacks
            expertise_level = pick_candidate(
                profile_data.get("expertise_level", "intermediate"), valid_expertise_levels, "intermediate"
            )
            communication_style = pick_candidate(
                profile_data.get("communication_style", "standard"), valid_communication_styles, "standard"
            )
            tone = pick_candidate(
                profile_data.get("tone", "professional"), valid_tones, "professional"
            )

            return UserProfile(
                user_id=user_id,
                role=profile_data.get("role", "Unknown"),
                expertise_level=expertise_level,
                communication_style=communication_style,
                tone=tone,
                domain_knowledge=profile_data.get("domain_knowledge", []),
                project_involvement=profile_data.get("project_involvement", []),
                confidence_score=profile_data.get("confidence_score", 0.0)
            )

        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error inferring user profile for {user_id}: {e}")
            logger.error(f"Raw response content: {response_content}")
            return UserProfile(
                user_id=user_id,
                role="JSON_PARSING_FAILED",
                expertise_level="novice",
                communication_style="concise",
                tone="conversational",
                domain_knowledge=["ERROR: JSON parsing failed"],
                project_involvement=["ERROR: Could not parse JSON response"],
                confidence_score=0.0
            )
        except Exception as e:
            logger.error(f"Error inferring user profile for {user_id}: {e}")
            logger.error(f"Raw LLM response was: {raw_response}")
            return UserProfile(
                user_id=user_id,
                role="PARSING_FAILED",  # Make it obvious this failed
                expertise_level="novice",  # Use different defaults to avoid accidental matches
                communication_style="concise",
                tone="conversational",
                domain_knowledge=["ERROR: Profile inference failed"],
                project_involvement=["ERROR: Could not parse LLM response"],
                confidence_score=0.0
            )
    
    def capture_user_intent(self, query: str, context: Dict) -> IntentSchema:
        """
        Benchmark 2: Capture user intent from query into structured schema

        Asks the chat model to turn the query into a JSON intent description
        and constrains the categorical fields to the predefined candidates.

        Args:
            query: The user's document-generation request.
            context: Extra hints placed in the prompt; the keys read are
                "document_type" and "contextual_markers". None is treated as
                an empty dict.

        Returns:
            An IntentSchema. The response keys are mapped onto the older field
            names: "format_instruction" -> format_requirements,
            "tone" -> tone_preference, "document_structure" -> specific_topics,
            and "visual_elements" -> source_constraints.
            If the reply is not valid JSON, a sentinel schema with
            format_requirements "JSON_PARSING_FAILED" is returned; any other
            failure yields format_requirements "PARSING_FAILED". This method
            does not raise for model/parsing failures.
        """
        # Tolerate a missing context instead of failing before the request.
        context = context or {}

        intent_prompt = f"""
        Analyze the following user query and extract the structured intent for document generation.
        
        User Query: "{query}"
        
        Context Information:
        - Document Type: {context.get('document_type', 'Unknown')}
        - Contextual Markers: {context.get('contextual_markers', {})}
        
        Extract and structure the following intent components using ONLY the predefined options:
        
        1. Document type: Choose from [status_report, email, faq]
        
        2. Target audience: Choose from [executives, team_members, stakeholders, management, clients, board]
        
        3. Temporal scope: Choose from [last_week, past_month, quarter, project_start, ongoing, upcoming, last_two_weeks]
        
        4. Detail level: Choose from [summary, detailed, comprehensive, high_level]
        
        5. Tone: Choose from [formal, technical, conversational, executive, urgent, celebratory, accessible]
        
        6. Format instruction: Describe specific formatting requirements (bullet_points, paragraphs, tables_charts, mixed, etc.)
        
        7. Document structure: List the main sections or topics that should be covered
        
        8. Visual elements: List any visual elements needed (charts_and_graphs, progress_bars, status_tables, etc.)
        
        IMPORTANT: You must select exact values from the predefined options for document_type, target_audience, temporal_scope, detail_level, and tone. Do not use synonyms or variations.
        
        Respond in JSON format:
        {{
            "document_type": "status_report|email|faq",
            "target_audience": "executives|team_members|stakeholders|management|clients|board",
            "temporal_scope": "last_week|past_month|quarter|project_start|ongoing|upcoming|last_two_weeks",
            "detail_level": "summary|detailed|comprehensive|high_level",
            "tone": "formal|technical|conversational|executive|urgent|celebratory|accessible",
            "format_instruction": "...",
            "document_structure": ["...", "..."],
            "visual_elements": ["...", "..."]
        }}
        """

        # Build the request once; only the sampling/token-limit arguments
        # differ between the two model families.
        request_kwargs = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": "You are an expert at parsing document generation intents. Respond only with valid JSON."},
                {"role": "user", "content": intent_prompt},
            ],
        }
        if self.model_name in ("gpt-4.1", "gpt-5-chat", "gpt-4o"):
            request_kwargs["temperature"] = 0.1
            request_kwargs["max_tokens"] = 10000
        else:
            request_kwargs["max_completion_tokens"] = 10000

        def pick_candidate(value, candidates, default):
            """Return `value` if it is an allowed candidate, else `default`.

            Strings are trimmed and lower-cased first so that "Email" or
            " email " is not discarded; strict_categorical_match compares
            labels case-insensitively as well.
            """
            if isinstance(value, str):
                value = value.strip().lower()
            return value if value in candidates else default

        # Tracked outside the try block so the handlers can log what was
        # received without touching `response` again (re-indexing it inside a
        # handler could raise a second exception and escape this method).
        raw_response = 'No response received'
        response_content = 'No content'

        try:
            response = self.client.chat.completions.create(**request_kwargs)

            raw_response = response.choices[0].message.content
            response_content = raw_response.strip()
            logger.debug(f"Raw intent response: {response_content}")

            # Try to extract JSON from response if it's wrapped in markdown or other text
            fence = "```json" if "```json" in response_content else ("```" if "```" in response_content else None)
            if fence is not None:
                start = response_content.find(fence) + len(fence)
                end = response_content.find("```", start)
                if end != -1:
                    response_content = response_content[start:end].strip()

            intent_data = json.loads(response_content)

            # Validate and constrain values to match user_query_generation.py candidates
            valid_document_types = ["status_report", "email", "faq"]
            valid_target_audiences = ["executives", "team_members", "stakeholders", "management", "clients", "board"]
            valid_temporal_scopes = ["last_week", "past_month", "quarter", "project_start", "ongoing", "upcoming", "last_two_weeks"]
            valid_detail_levels = ["summary", "detailed", "comprehensive", "high_level"]
            valid_tones = ["formal", "technical", "conversational", "executive", "urgent", "celebratory", "accessible"]

            # Ensure values are from valid candidates, with fallbacks
            document_type = pick_candidate(
                intent_data.get("document_type", "status_report"), valid_document_types, "status_report"
            )
            target_audience = pick_candidate(
                intent_data.get("target_audience", "team_members"), valid_target_audiences, "team_members"
            )
            temporal_scope = pick_candidate(
                intent_data.get("temporal_scope", "ongoing"), valid_temporal_scopes, "ongoing"
            )
            detail_level = pick_candidate(
                intent_data.get("detail_level", "detailed"), valid_detail_levels, "detailed"
            )

            # Handle both old and new field names for backward compatibility
            tone_value = pick_candidate(
                intent_data.get("tone") or intent_data.get("tone_preference", "formal"), valid_tones, "formal"
            )

            format_instruction = intent_data.get("format_instruction") or intent_data.get("format_requirements", "mixed")
            document_structure = intent_data.get("document_structure") or intent_data.get("specific_topics", [])
            visual_elements = intent_data.get("visual_elements", [])

            return IntentSchema(
                document_type=document_type,
                target_audience=target_audience,
                temporal_scope=temporal_scope,
                detail_level=detail_level,
                format_requirements=format_instruction,  # Map to old field name for compatibility
                tone_preference=tone_value,               # Map to old field name for compatibility
                specific_topics=document_structure,       # Map to old field name for compatibility
                source_constraints=visual_elements        # Repurpose this field for visual elements
            )

        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error capturing user intent: {e}")
            logger.error(f"Raw response content: {response_content}")
            return IntentSchema(
                document_type="email",
                target_audience="executives",
                temporal_scope="last_week",
                detail_level="summary",
                format_requirements="JSON_PARSING_FAILED",
                tone_preference="urgent",
                specific_topics=["ERROR: JSON parsing failed"],
                source_constraints=["ERROR: Could not parse JSON response"]
            )
        except Exception as e:
            logger.error(f"Error capturing user intent: {e}")
            logger.error(f"Raw LLM response was: {raw_response}")
            return IntentSchema(
                document_type="email",  # Use different defaults to avoid accidental matches
                target_audience="executives",
                temporal_scope="last_week",
                detail_level="summary",
                format_requirements="PARSING_FAILED",
                tone_preference="urgent",
                specific_topics=["ERROR: Intent parsing failed"],
                source_constraints=["ERROR: Could not parse LLM response"]
            )
    
    def retrieve_relevant_context(self, messages: List[Dict], query: str, intent: IntentSchema, 
                                 ground_truth_markers: Dict = None) -> List[str]:
        """
        Benchmark: Context Retrieval - Select most relevant messages for the given query and intent
        
        Args:
            messages: Available messages (already temporally filtered)
            query: User's document generation query
            intent: Structured intent schema
            ground_truth_markers: Ground truth contextual markers (for determining target retrieval count)
            
        Returns:
            List of message IDs for the most relevant messages
        """
        
        # USE ALL TEMPORAL FILTERED MESSAGES AS CANDIDATES OR FOCUSED SET FOR OLDER MODELS
        candidate_messages = messages  # Default: use all temporally filtered messages
        num_target_messages = 10  # Default target retrieval count
        
        if ground_truth_markers:
            # Extract ground truth message IDs to determine target count
            gt_message_ids = set()
            if 'ground_truth_messages' in ground_truth_markers:
                gt_message_ids = set(ground_truth_markers['ground_truth_messages'])
            else:
                # Fallback: extract from contextual markers
                for category, items in ground_truth_markers.items():
                    if isinstance(items, list):
                        for item in items:
                            if isinstance(item, list) and len(item) >= 2:
                                gt_message_ids.add(item[1])
            
            # Set target number of messages to retrieve (same as ground truth)
            if gt_message_ids:
                num_target_messages = len(gt_message_ids)
            
            print(f"    Target retrieval count: {num_target_messages} (based on {len(gt_message_ids)} ground truth messages)")
            
            # For all models, create a focused candidate set to reduce token usage
            if gt_message_ids:
                # Find the latest timestamp among ground truth messages
                gt_messages_with_timestamps = []
                for msg in messages:
                    msg_id = msg.get('msg_node', msg.get('id', 'Unknown'))
                    if msg_id in gt_message_ids:
                        timestamp = msg.get('timestamp', msg.get('composeTime', ''))
                        if timestamp:
                            gt_messages_with_timestamps.append((msg, timestamp))
                
                if gt_messages_with_timestamps:
                    # Sort by timestamp and get the latest
                    gt_messages_with_timestamps.sort(key=lambda x: x[1])
                    latest_gt_timestamp = gt_messages_with_timestamps[-1][1]
                    
                    # Get 500 temporally nearby messages around the latest ground truth message
                    # This includes both before and after messages for better context
                    nearby_messages = []
                    for msg in messages:
                        msg_timestamp = msg.get('timestamp', msg.get('composeTime', ''))
                        if msg_timestamp:
                            nearby_messages.append((msg, msg_timestamp))
                    
                    # Sort all messages by timestamp
                    nearby_messages.sort(key=lambda x: x[1])
                    
                    # Find the index of the latest ground truth message
                    latest_gt_index = -1
                    for i, (msg, timestamp) in enumerate(nearby_messages):
                        if timestamp == latest_gt_timestamp:
                            latest_gt_index = i
                            break
                    
                    if latest_gt_index >= 0:
                        # Take 250 messages before and 250 messages after the latest ground truth message
                        start_index = max(0, latest_gt_index - 250)
                        end_index = min(len(nearby_messages), latest_gt_index + 251)  # +251 to include 250 after
                        focused_candidates = [msg for msg, _ in nearby_messages[start_index:end_index]]
                    else:
                        # Fallback: take last 500 messages before the latest ground truth timestamp
                        before_messages = [(msg, ts) for msg, ts in nearby_messages if ts <= latest_gt_timestamp]
                        focused_candidates = [msg for msg, _ in before_messages[-500:]] if len(before_messages) > 500 else [msg for msg, _ in before_messages]
                    
                    # Ensure all ground truth messages are included
                    gt_message_nodes = {msg.get('msg_node', msg.get('id', '')) for msg in focused_candidates}
                    for msg in messages:
                        msg_id = msg.get('msg_node', msg.get('id', 'Unknown'))
                        if msg_id in gt_message_ids and msg_id not in gt_message_nodes:
                            focused_candidates.append(msg)
                    
                    candidate_messages = focused_candidates
                    print(f"    Using focused candidate set: {len(candidate_messages)} messages (for model {self.model_name})")
                else:
                    print(f"    Could not determine ground truth timestamps, using all {len(candidate_messages)} messages")
        
        # Only print the generic message if we haven't already printed a specific one
        if not ground_truth_markers or self.model_name in ["gpt-4.1", "gpt-5-chat", "gpt-4o"]:
            print(f"    Using all {len(candidate_messages)} temporally filtered messages as candidates")
        
        context_prompt = f"""
        Given a user query and document intent, select the most relevant messages from the conversation history.
        
        User Query: "{query}"
        
        Document Intent:
        - Document Type: {intent.document_type}
        - Target Audience: {intent.target_audience}
        - Temporal Scope: {intent.temporal_scope}
        - Detail Level: {intent.detail_level}
        - Tone: {intent.tone_preference}
        - Specific Topics: {', '.join(intent.specific_topics) if intent.specific_topics else 'None'}
        
        Available Messages (all temporally filtered messages):
        {self._format_messages_for_retrieval(candidate_messages)}
        
        These are all {len(candidate_messages)} temporally filtered messages (messages that occurred before the query timestamp).
        
        Select the {num_target_messages} most relevant messages that would be needed to generate the requested document.
        Consider:
        1. Temporal relevance (matches the temporal scope)
        2. Content relevance (contains information needed for the document)
        3. Author relevance (messages from key stakeholders)
        4. Topic alignment (discusses relevant topics)
        5. No duplicated or near-duplicate messages
        
        Respond with a JSON list of message IDs in order of relevance:
        ["Msg_ID1", "Msg_ID2", "Msg_ID3", ...]
        """
        
        try:
            if self.model_name == "gpt-4.1" or self.model_name == "gpt-5-chat" or self.model_name == "gpt-4o":
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "You are an expert at selecting relevant context for document generation. Respond only with valid JSON."},
                        {"role": "user", "content": context_prompt}
                    ],
                    temperature=0.1,
                    max_tokens=10000
                )
            else:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "You are an expert at selecting relevant context for document generation. Respond only with valid JSON."},
                        {"role": "user", "content": context_prompt}
                    ],
                    max_completion_tokens=10000
                )
            
            response_content = response.choices[0].message.content.strip()
            
            # Check if response is empty
            if not response_content:
                logger.error("Empty response received from model in context retrieval")
                return [msg.get('msg_node', msg.get('id', f'msg_{i}')) for i, msg in enumerate(candidate_messages[:num_target_messages])]
            
            # Try to extract JSON if wrapped in markdown
            if "```json" in response_content:
                start = response_content.find("```json") + 7
                end = response_content.find("```", start)
                if end != -1:
                    response_content = response_content[start:end].strip()
            elif "```" in response_content:
                start = response_content.find("```") + 3
                end = response_content.find("```", start)
                if end != -1:
                    response_content = response_content[start:end].strip()
            
            retrieved_ids = json.loads(response_content)
            # Validate that all returned IDs exist in the candidate message set
            available_ids = {msg.get('msg_node', msg.get('id', 'Unknown')) for msg in candidate_messages}
            valid_ids = [id for id in retrieved_ids if id in available_ids]
            
            return valid_ids[:num_target_messages]
            
        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error in context retrieval: {e}")
            logger.error(f"Raw response content: {response_content if 'response_content' in locals() else 'No response content available'}")
            # Fallback: return first num_target_messages message IDs from candidate messages
            return [msg.get('msg_node', msg.get('id', f'msg_{i}')) for i, msg in enumerate(candidate_messages[:num_target_messages])]
        except Exception as e:
            logger.error(f"Error in context retrieval: {e}")
            # Fallback: return first num_target_messages message IDs from candidate messages
            return [msg.get('msg_node', msg.get('id', f'msg_{i}')) for i, msg in enumerate(candidate_messages[:num_target_messages])]
    
    def _format_messages_for_retrieval(self, messages: List[Dict]) -> str:
        """Format messages for context retrieval prompt"""
        formatted = []
        for msg in messages:
            msg_id = msg.get('msg_node', msg.get('id', 'Unknown'))
            author = msg.get('author', 'Unknown')
            timestamp = msg.get('timestamp', 'Unknown')
            content = msg.get('content', '')  # Truncate long messages
            formatted.append(f"[{msg_id}] {author} ({timestamp}): {content}")
        return "\n".join(formatted)
    
    def evaluate_context_retrieval(self, retrieved_ids: List[str], ground_truth_markers: Dict) -> ContextRetrievalResult:
        """
        Score retrieved message IDs against the ground-truth IDs for a query.

        Ground truth is taken from ``ground_truth_markers['ground_truth_messages']``
        when that key is present. Otherwise every list-valued entry of the markers
        is scanned and the second element of each ``[content, message_id]`` style
        sub-list is collected; non-list values (e.g. nested dicts) are ignored.

        Args:
            retrieved_ids: Message IDs returned by the retrieval step.
            ground_truth_markers: Contextual markers carrying the ground truth.

        Returns:
            ContextRetrievalResult holding precision, recall and F1. ``query_id``
            is left empty for the caller to fill in. With no ground truth at all,
            recall is 1.0 only if nothing was retrieved either.
        """
        if 'ground_truth_messages' in ground_truth_markers:
            expected = set(ground_truth_markers['ground_truth_messages'])
            print(f"    Using direct ground truth: {len(expected)} message IDs")
        else:
            # Older marker format: harvest IDs from [content, message_id] pairs
            print("    Fallback: Extracting from contextual markers")
            expected = set()
            for entries in ground_truth_markers.values():
                if not isinstance(entries, list):
                    continue
                expected.update(
                    entry[1]
                    for entry in entries
                    if isinstance(entry, list) and len(entry) >= 2
                )

        retrieved = set(retrieved_ids)
        hits = retrieved.intersection(expected)

        # Debug output
        print(f"    Retrieved: {retrieved}")
        print(f"    Ground truth: {expected}")
        print(f"    Intersection: {hits}")

        precision = len(hits) / len(retrieved) if retrieved else 0.0

        if expected:
            recall = len(hits) / len(expected)
        elif retrieved:
            recall = 0.0
        else:
            recall = 1.0

        denominator = precision + recall
        f1_score = 0.0 if denominator == 0 else 2 * (precision * recall) / denominator

        print(f"    Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1_score:.3f}")

        return ContextRetrievalResult(
            query_id="",  # Will be set by caller
            retrieved_message_ids=retrieved_ids,
            ground_truth_message_ids=list(expected),
            precision=precision,
            recall=recall,
            f1_score=f1_score
        )

    def generate_document_with_citations(self,
                                       messages: List[Dict],
                                       user_profile: UserProfile,
                                       intent: IntentSchema) -> GeneratedDocument:
        """
        Benchmark 3: Generate a document whose claims cite their source messages.

        Every message is rendered into the prompt as
        "[msg_node] author (timestamp): content" (all messages, untruncated), the
        model is called through ``_safe_llm_call_with_retry`` and the citations in
        the reply are collected with ``_extract_citations``.

        Args:
            messages: Source message dicts offered to the model as context.
            user_profile: Profile whose role/expertise/style/tone shape the prompt.
            intent: Intent schema describing the requested document.

        Returns:
            GeneratedDocument. If generation raises a content-filter error the
            result of ``_generate_fallback_document`` is returned; for any other
            error the document content is the error text, with no citations and
            empty metadata.
        """
        # One prompt line per source message, keyed by its msg_node ID
        context_lines = []
        for source_msg in messages:
            node = source_msg.get('msg_node', 'Unknown')
            author = source_msg.get('author', 'Unknown')
            stamp = source_msg.get('timestamp', 'Unknown')
            body = source_msg.get('content', '')
            context_lines.append(f"[{node}] {author} ({stamp}): {body}")
        message_context = "\n".join(context_lines)
        
        generation_prompt = f"""
        Create a professional {intent.document_type} based on the conversation context provided below.
        
        USER CONTEXT:
        - Role: {user_profile.role}
        - Expertise: {user_profile.expertise_level}
        - Communication Style: {user_profile.communication_style}
        - Preferred Tone: {user_profile.tone}
        
        DOCUMENT SPECIFICATIONS:
        - Document Type: {intent.document_type}
        - Target Audience: {intent.target_audience}
        - Time Scope: {intent.temporal_scope}
        - Detail Level: {intent.detail_level}
        - Format Requirements: {intent.format_requirements}
        - Tone Preference: {intent.tone_preference}
        - Key Topics: {', '.join(intent.specific_topics)}
        
        CONVERSATION MESSAGES (use these with proper citations):
        {message_context}
        
        IMPORTANT: Each message above has an ID in brackets [Msg_XXX]. When you reference information from any message, you MUST cite it using that exact ID.
        
        REQUIREMENTS:
        - Only create the requested document. 
        - MUST include citations when presenting facts, decisions, or information from the conversation
        - Every factual claim or decision referenced from the conversation MUST be cited
        - Citation format: [Msg_XXX] where XXX is the message identifier shown in brackets above
        - Include citations immediately after the relevant information or at the end of sentences
        - Do NOT make unsupported claims - only use information that can be traced back to the provided messages
        
        CITATION EXAMPLE:
        "The project deadline has been moved to next Friday [Msg_123]. The team completed the initial testing phase ahead of schedule [Msg_456]."
        
        Remember: If you reference ANY information from the conversation messages, you MUST cite it!
        """
        
        try:
            llm_response = self._safe_llm_call_with_retry(generation_prompt)
            content = llm_response.choices[0].message.content.strip()
            
            # Keep only citations that resolve to one of the supplied messages
            citations = self._extract_citations(content, messages)
            
            # Debug: Log citation information
            logger.info(f"Generated document length: {len(content)} chars, found {len(citations)} citations")
            if citations:
                logger.info(f"Citations found: {[c.message_id for c in citations]}")
            else:
                logger.warning("No citations found in generated document!")
                logger.debug(f"Document content preview: {content[:200]}...")
            
            document_metadata = {
                "user_profile": asdict(user_profile),
                "intent": asdict(intent),
                "source_message_count": len(messages)
            }
            return GeneratedDocument(
                content=content,
                citations=citations,
                metadata=document_metadata,
                generation_timestamp=datetime.now().isoformat()
            )
            
        except Exception as e:
            logger.error(f"Error generating document: {e}")
            
            error_text = str(e)
            filter_triggered = "content_filter" in error_text or "ResponsibleAIPolicyViolation" in error_text
            if not filter_triggered:
                # Any other failure is surfaced as the document body itself
                return GeneratedDocument(
                    content=f"Error generating document: {e}",
                    citations=[],
                    metadata={},
                    generation_timestamp=datetime.now().isoformat()
                )
            
            logger.warning("Content filter triggered, attempting simplified generation...")
            return self._generate_fallback_document(intent, user_profile, messages)

    def _safe_llm_call_with_retry(self, prompt: str, max_retries: int = 2):
        """Make LLM call with retry logic for content filter issues"""
        for attempt in range(max_retries + 1):
            try:
                if self.model_name == "gpt-4.1" or self.model_name == "gpt-5-chat" or self.model_name == "gpt-4o":
                    response = self.client.chat.completions.create(
                        model=self.model_name,
                        messages=[
                            {"role": "system", "content": "You are a professional document writer who creates high-quality business documents with proper source citations. IMPORTANT: You MUST include citations in [Msg_XXX] format when referencing any information from the provided conversation messages. Every fact, decision, or piece of information from the conversation must be cited immediately after mentioning it."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.2 + (attempt * 0.1),  # Slightly increase temperature on retries
                        max_tokens=10000
                    )
                else:
                    response = self.client.chat.completions.create(
                        model=self.model_name,
                        messages=[
                            {"role": "system", "content": "You are a professional document writer who creates high-quality business documents with proper source citations. IMPORTANT: You MUST include citations in [Msg_XXX] format when referencing any information from the provided conversation messages. Every fact, decision, or piece of information from the conversation must be cited immediately after mentioning it."},
                            {"role": "user", "content": prompt}
                        ],
                        max_completion_tokens=10000
                    )
                return response
                
            except Exception as e:
                if "content_filter" in str(e) or "ResponsibleAIPolicyViolation" in str(e):
                    if attempt < max_retries:
                        logger.warning(f"Content filter triggered on attempt {attempt + 1}, retrying with modified prompt...")
                        # Simplify the prompt on retry
                        prompt = self._simplify_prompt_for_retry(prompt)
                        continue
                raise e
        
        raise Exception("Max retries exceeded for content filter issue")

    def _simplify_prompt_for_retry(self, original_prompt: str) -> str:
        """Simplify prompt to avoid content filter triggers"""
        # Remove potentially problematic phrases and make the prompt more basic
        simplified = original_prompt.replace("IMPORTANT:", "Note:")
        simplified = simplified.replace("must", "should")
        simplified = simplified.replace("only use", "please use")
        simplified = simplified.replace("directly traced", "referenced")
        simplified = simplified.replace("backed by", "supported by")
        
        # Make it even more basic if needed
        lines = simplified.split('\n')
        simplified_lines = [line for line in lines if not any(word in line.lower() for word in ["requirement", "constraint", "rule", "restrict"])]
        
        return '\n'.join(simplified_lines)

    def _generate_fallback_document(self, intent: IntentSchema, user_profile: UserProfile, messages: List[Dict]) -> GeneratedDocument:
        """Produce a minimal document after the main prompt hit the content filter.

        The prompt is deliberately plain: it names the document type, audience and
        tone and lists the first 100 characters of the first five messages. The
        reply is returned without citations. ``user_profile`` is accepted but not
        used when building this prompt.

        Returns:
            GeneratedDocument flagged ``fallback_generation`` on success, or one
            flagged ``generation_failed`` (carrying the first 200 characters of
            the error) if this call fails as well.
        """
        try:
            newline = "\n"
            # Very simple, safe prompt
            fallback_prompt = f"""
            Please write a brief {intent.document_type} about the topics discussed in these messages.
            
            Target audience: {intent.target_audience}
            
            Key messages:
            {newline.join([f"- {source_msg.get('content', '')[:100]}..." for source_msg in messages[:5]])}
            
            Please write in a {intent.tone_preference} tone.
            """
            request = {
                "model": self.model_name,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant that writes business documents."},
                    {"role": "user", "content": fallback_prompt},
                ],
            }
            if self.model_name in ("gpt-4.1", "gpt-5-chat", "gpt-4o"):
                request["temperature"] = 0.3
                request["max_tokens"] = 10000
            else:
                request["max_completion_tokens"] = 10000
            llm_response = self.client.chat.completions.create(**request)

            return GeneratedDocument(
                content=llm_response.choices[0].message.content.strip(),
                citations=[],  # No citations in fallback mode
                metadata={"fallback_generation": True},
                generation_timestamp=datetime.now().isoformat()
            )

        except Exception as e:
            logger.error(f"Even fallback generation failed: {e}")
            failure_note = f"Unable to generate document due to content filtering. This may be due to sensitive content in the conversation context. Error: {str(e)[:200]}"
            return GeneratedDocument(
                content=failure_note,
                citations=[],
                metadata={"generation_failed": True},
                generation_timestamp=datetime.now().isoformat()
            )
    
    def _extract_citations(self, content: str, messages: List[Dict]) -> List[DocumentCitation]:
        """Collect the ``[Msg_...]`` citations in *content* that match a source message.

        Messages are indexed by their ``msg_node`` value. Each citation occurrence
        whose ID is in that index yields one DocumentCitation, in order of
        appearance (repeated citations yield repeated entries); IDs that match no
        message are skipped. ``cited_content`` is the first 200 characters of the
        message content, always followed by "...".
        """
        by_node = {}
        for source_msg in messages:
            by_node[source_msg.get('msg_node', '')] = source_msg

        found = []
        for hit in re.finditer(r'\[Msg_(\w+)\]', content):
            cited_id = "Msg_" + hit.group(1)
            if cited_id not in by_node:
                continue
            source_msg = by_node[cited_id]
            excerpt = source_msg.get('content', '')[:200] + "..."  # Truncate for brevity
            found.append(DocumentCitation(
                message_id=cited_id,
                author=source_msg.get('author', 'Unknown'),
                timestamp=source_msg.get('timestamp', 'Unknown'),
                cited_content=excerpt,
                context_relevance=1.0  # To be calculated by evaluation
            ))

        return found
    
    def evaluate_document_quality(self, document: GeneratedDocument, intent_schema: Optional[IntentSchema] = None, 
                                 user_profile: Optional[UserProfile] = None, query: str = "", ground_truth: Optional[Dict] = None) -> Dict[str, float]:
        """
        Benchmark 4: Evaluate document quality using LLM-as-a-judge based on research paper metrics
        
        Research Paper Metrics:
        1. Personalization Fidelity: Document reflects inferred intent schema
        2. Factuality and Citation Quality: Claims supported by evidence 
        3. Fluency and Structure: Document coherence and appropriateness
        4. Temporal and Task Accuracy: Content alignment with timeframe/phase

        Args:
            document: Generated document (content and citations) to be judged.
            intent_schema: Expected document specification; when given it adds the
                specification and temporal sections to the judge prompt.
            user_profile: Target user profile; when given it adds a profile section.
            query: Original user query, quoted in the prompt.
            ground_truth: Accepted for interface compatibility; not used here.

        Returns:
            The JSON object parsed from the judge's reply. If the reply is empty,
            cannot be parsed, or the call fails, a dict with every score set to 0
            and the reason in ``detailed_feedback`` is returned instead. All
            failure paths share one schema, matching the keys the prompt requests.
        """

        def failed_evaluation(feedback: str) -> Dict[str, Any]:
            # Single fallback shape so callers can read the same metric keys
            # whether the judge succeeded or not.
            return {
                "personalization_fidelity": 0,
                "factuality": 0,
                "citation_quality": 0,
                "fluency": 0,
                "structure": 0,
                "temporal_task_accuracy": 0,
                # Keys emitted by the earlier empty-response/JSON-error fallbacks;
                # retained so any consumer reading them keeps working.
                "temporal_accuracy": 0,
                "task_accuracy": 0,
                "overall_score": 0.0,
                "detailed_feedback": feedback,
            }
        
        # Prepare intent context for evaluation
        intent_context = ""
        if intent_schema:
            intent_context = f"""
        EXPECTED DOCUMENT SPECIFICATIONS:
        - Document Type: {intent_schema.document_type}
        - Target Audience: {intent_schema.target_audience}
        - Temporal Scope: {intent_schema.temporal_scope}
        - Detail Level: {intent_schema.detail_level}
        - Required Tone: {intent_schema.tone_preference}
        - Format Requirements: {intent_schema.format_requirements}
        - Specific Topics: {', '.join(intent_schema.specific_topics) if intent_schema.specific_topics else 'None specified'}
        """
        
        # Prepare user profile context
        profile_context = ""
        if user_profile:
            profile_context = f"""
        TARGET USER PROFILE:
        - Role: {user_profile.role}
        - Expertise Level: {user_profile.expertise_level}
        - Communication Style: {user_profile.communication_style}
        - Preferred Tone: {user_profile.tone}
        """
        
        # Prepare temporal and task context for better evaluation
        temporal_context = ""
        if intent_schema:
            # Citation timestamps give the judge a view of the conversation timeline
            citation_timestamps = [
                citation.timestamp
                for citation in (document.citations or [])
                if citation.timestamp and citation.timestamp != 'Unknown'
            ]
            
            temporal_context = f"""
        TEMPORAL AND TASK CONTEXT:
        - Query Temporal Scope: {intent_schema.temporal_scope}
        - Citation Timestamps: {', '.join(citation_timestamps[:5]) if citation_timestamps else 'No timestamped citations'}
        - Expected Time Alignment: Content should reflect {intent_schema.temporal_scope} timeframe
        - Document Type Context: {intent_schema.document_type} typically covers specific time periods
        - Task Relevance: Should address current project phase or specified period
        """
        
        # NOTE(review): the bullet right after "OVERALL SCORE" in the prompt below
        # looks like a leftover; the prompt text is left untouched so judge scores
        # stay comparable with earlier runs.
        quality_prompt = f"""
        Evaluate the quality of the following generated document using a systematic evaluation process.
        
        ORIGINAL USER QUERY: "{query}"
        {intent_context}{profile_context}{temporal_context}
        
        DOCUMENT TO EVALUATE:
        {document.content}
        
        CITATIONS USED:
        {json.dumps([asdict(citation) for citation in document.citations], indent=2)}
        
        EVALUATION PROCESS:
        Evaluate each metric systematically using the specific guidelines below:
        
        FOR EACH METRIC, FOLLOW THESE DETAILED STEPS:
        
        === 1. PERSONALIZATION FIDELITY EVALUATION ===
        Step 1a: Identify document type from structure and content
        Step 1b: Compare identified type with expected type specification
        Step 1c: Analyze tone and style used throughout document
        Step 1d: Verify tone matches target audience and requirements
        Step 1e: Check temporal scope references in content
        Step 1f: Assess if detail level matches specified requirements
        Step 1g: Review format compliance with specified requirements
        Score 1-5: How well does document reflect intended specifications?
        
        === 2. FACTUALITY EVALUATION ===
        Step 2a: Identify all factual claims and assertions in document
        Step 2b: For each claim, locate corresponding citation and source
        Step 2c: Verify facts against actual cited source content
        Step 2d: Check for any unsupported or speculative statements
        Step 2e: Look for contradictions between claims and sources
        Step 2f: Assess overall factual accuracy and evidence backing
        Score 1-5: How well are claims supported by evidence?
        
        === 3. CITATION QUALITY EVALUATION ===
        Step 3a: Check all citation formats for proper [Msg_XXX] structure
        Step 3b: Verify each cited message ID exists and is accessible
        Step 3c: For each citation, confirm it supports the accompanying claim
        Step 3d: Assess appropriateness of citation placement in text
        Step 3e: Evaluate sufficiency of citation coverage for factual content
        Step 3f: Check for any missing citations for factual statements
        Score 1-5: How accurate and appropriate are the citations?
        
        === 4. FLUENCY EVALUATION ===
        Step 4a: Read through document checking for clarity and comprehension
        Step 4b: Identify any grammatical errors or awkward phrasing
        Step 4c: Assess logical flow and transitions between ideas
        Step 4d: Evaluate language appropriateness for target audience
        Step 4e: Check for engaging and professional writing style
        Step 4f: Review overall readability and coherence
        Score 1-5: How clear and well-written is the document?
        
        === 5. STRUCTURE EVALUATION ===
        Step 5a: Analyze overall document organization and logical flow
        Step 5b: Check if structure is appropriate for document type
        Step 5c: Evaluate headings, formatting, and visual layout
        Step 5d: Assess completeness of necessary sections
        Step 5e: Review adherence to professional document standards
        Step 5f: Check for logical progression from introduction to conclusion
        Score 1-5: How well-organized and structured is the document?
        
        === 6. TEMPORAL AND TASK ACCURACY EVALUATION ===
        Step 6a: Identify temporal scope specified in requirements
        Step 6b: Check all time references in document for accuracy
        Step 6c: Cross-reference content timeframe with citation timestamps
        Step 6d: Verify temporal expressions (dates, deadlines) are appropriate
        Step 6e: Assess if content reflects correct project phase/period
        Step 6f: Look for any temporal inconsistencies or anachronisms
        Score 1-5: How accurately does content align with specified timeframe?
        
        FINAL SCORING:
        For each metric, provide a score (1-5) based on your systematic evaluation:
        
        1. PERSONALIZATION FIDELITY: How well does document reflect intended specifications?
        2. FACTUALITY: How well are claims supported by evidence from sources?
        3. CITATION QUALITY: How accurate and appropriate are the citations?
        4. FLUENCY: How clear and well-written is the document?
        5. STRUCTURE: How well-organized and professionally formatted is the document?
        6. TEMPORAL AND TASK ACCURACY: How accurately does content align with specified timeframe?
        
        OVERALL SCORE: Calculate as average of the 6 individual metric scores.
           - Are any temporal inconsistencies or anachronisms present?
        
        Respond in JSON format:
        {{
            "personalization_fidelity": 4,
            "factuality": 3,
            "citation_quality": 4,
            "fluency": 5,
            "structure": 4,
            "temporal_task_accuracy": 4,
            "overall_score": 4.0,
            "detailed_feedback": "METRIC-BY-METRIC EVALUATION: [PERSONALIZATION FIDELITY] Steps 1a-1g assessment: [specific findings on type/tone/scope/detail/format alignment]. [FACTUALITY] Steps 2a-2f assessment: [specific findings on claim verification and evidence support]. [CITATION QUALITY] Steps 3a-3f assessment: [specific findings on format/accuracy/relevance/placement]. [FLUENCY] Steps 4a-4f assessment: [specific findings on clarity/grammar/flow/audience-appropriateness]. [STRUCTURE] Steps 5a-5f assessment: [specific findings on organization/formatting/professional standards]. [TEMPORAL ACCURACY] Steps 6a-6f assessment: [specific findings on timeframe alignment and consistency]. [OVERALL SUMMARY] Key strengths and improvement areas across all metrics."
        }}
        """
        
        # Initialised up front so the JSON error handler can always log it
        response_content = None
        try:
            request = {
                "model": self.model_name,
                "messages": [
                    {"role": "system", "content": "You are an expert document quality evaluator. Provide objective, detailed assessments. Respond only with valid JSON."},
                    {"role": "user", "content": quality_prompt},
                ],
            }
            if self.model_name in ("gpt-4.1", "gpt-5-chat", "gpt-4o"):
                request["temperature"] = 0.1
                request["max_tokens"] = 10000
            else:
                request["max_completion_tokens"] = 10000
            response = self.client.chat.completions.create(**request)
            
            # A missing (None) message body is treated like an empty reply
            response_content = (response.choices[0].message.content or "").strip()
            
            # Check if response is empty or whitespace only
            if not response_content:
                logger.error("Empty response received from model in document quality evaluation")
                return failed_evaluation("ERROR: Empty response from model")
            
            # Try to extract JSON if wrapped in markdown
            fence = "```json" if "```json" in response_content else "```"
            if fence in response_content:
                start = response_content.find(fence) + len(fence)
                end = response_content.find("```", start)
                if end != -1:
                    response_content = response_content[start:end].strip()
            
            return json.loads(response_content)
            
        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error in document quality evaluation: {e}")
            logger.error(f"Raw response content: {response_content if response_content is not None else 'No response content available'}")
            return failed_evaluation("ERROR: JSON parsing failed")
        except Exception as e:
            logger.error(f"Error evaluating document quality: {e}")
            return failed_evaluation(f"Error during evaluation: {e}")
    
    def run_comprehensive_benchmark(self, 
                                  conversation_file: str, 
                                  queries_file: str, 
                                  output_dir: str = "benchmark_results",
                                  max_queries: Optional[int] = None,
                                  use_ground_truth: bool = False,
                                  use_fuzzy_intent_matching: bool = False) -> List[BenchmarkResult]:
        """
        Run the complete benchmark pipeline over every query in ``queries_file``.

        For each query the conversation is first restricted to messages at or
        before the query timestamp, then the profile, intent, context-retrieval,
        document-generation and quality-evaluation steps are run in order. One
        ``<query_id>_result.json`` file is written per successful query, and a
        ``benchmark_summary.json`` is written at the end.

        Args:
            conversation_file: Path to conversation data JSON file
            queries_file: Path to user queries JSON file
            output_dir: Directory to save results (created if missing)
            max_queries: Maximum number of queries to process; a falsy value
                         (None or 0) means no limit
            use_ground_truth: If True, use ground truth persona/intent/context for document generation
                            If False, use predicted values (end-to-end evaluation)
            use_fuzzy_intent_matching: If True, uses fuzzy semantic similarity for intent evaluation.
                                     If False (default), uses strict exact matching for categorical fields.

        Returns:
            The list of BenchmarkResult objects for queries that completed.
            A query that raises is logged with its traceback and skipped.
        """
        logger.info("Starting comprehensive benchmark...")

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Load data
        messages = self.load_conversation_data(conversation_file)
        queries = self.load_user_queries(queries_file)

        if max_queries:
            queries = queries[:max_queries]

        results = []

        for i, query_data in enumerate(queries):
            logger.info(f"Processing query {i+1}/{len(queries)}")

            try:
                # Extract query information
                query = query_data.get("query", "")
                user_id = query_data.get("user_id", "Unknown")
                query_id = f"query_{i+1}"
                query_timestamp = query_data.get("query_timestamp")
                contextual_markers = query_data.get("contextual_markers", {})

                # TEMPORAL FILTERING: every benchmark step below only sees
                # messages that existed at query time.
                filtered_messages = self._messages_up_to(messages, query_timestamp)

                # Benchmark 1: User profile inference
                user_profile = self.infer_user_profile(filtered_messages, user_id)

                # Benchmark 2: Intent capture
                intent = self.capture_user_intent(query, query_data)

                # Benchmark 3: Context retrieval using all temporally filtered messages
                retrieved_message_ids = self.retrieve_relevant_context(
                    filtered_messages, query, intent,
                    ground_truth_markers=contextual_markers
                )
                context_retrieval_result = self.evaluate_context_retrieval(
                    retrieved_message_ids,
                    contextual_markers
                )
                context_retrieval_result.query_id = query_id

                # For document generation: choose between predicted vs ground truth inputs
                if use_ground_truth:
                    # Use ground truth for document generation (isolates document generation performance)
                    gt_persona = query_data.get("persona", {})
                    gt_intent_data = query_data.get("intent", {})

                    # Create ground truth user profile
                    doc_gen_profile = UserProfile(
                        user_id=user_id,
                        role=gt_persona.get("role", "Unknown"),
                        expertise_level=gt_persona.get("expertise", "intermediate"),
                        communication_style=gt_persona.get("style", "standard"),
                        tone=gt_persona.get("tone", "professional"),
                        domain_knowledge=[],
                        project_involvement=[],
                        confidence_score=1.0
                    )

                    # Create ground truth intent schema
                    doc_gen_intent = IntentSchema(
                        document_type=gt_intent_data.get("document_type", "status_report"),
                        target_audience=gt_intent_data.get("target_audience", "team_members"),
                        temporal_scope=gt_intent_data.get("temporal_scope", "ongoing"),
                        detail_level=gt_intent_data.get("detail_level", "detailed"),
                        format_requirements=gt_intent_data.get("format_instruction", "mixed"),
                        tone_preference=gt_intent_data.get("tone", "formal"),
                        specific_topics=gt_intent_data.get("document_structure", []),
                        source_constraints=[]
                    )

                    # Collect the message id (second element) of every
                    # list-shaped marker entry, across all marker categories.
                    gt_message_ids = set()
                    for category, items in contextual_markers.items():
                        if isinstance(items, list):
                            for item in items:
                                if isinstance(item, list) and len(item) >= 2:
                                    gt_message_ids.add(item[1])

                    # Filter messages to only include ground truth relevant ones
                    doc_gen_messages = [msg for msg in filtered_messages
                                      if msg.get('msg_node', msg.get('id', '')) in gt_message_ids]
                    if not doc_gen_messages:
                        doc_gen_messages = filtered_messages  # Fallback to filtered messages if none found

                    logger.info(f"Using ground truth: {len(doc_gen_messages)} context messages")
                else:
                    # Use predicted values for end-to-end evaluation
                    doc_gen_profile = user_profile
                    doc_gen_intent = intent

                    # Use the messages retrieved by the context retrieval step (not all messages!)
                    retrieved_messages = [msg for msg in filtered_messages
                                        if msg.get('msg_node', msg.get('id', '')) in retrieved_message_ids]
                    doc_gen_messages = retrieved_messages if retrieved_messages else filtered_messages[:20]  # Fallback to first 20 if none found

                    logger.info(f"Using predicted values: {len(doc_gen_messages)} retrieved context messages")

                # Benchmark 4: Document generation with citations
                document = self.generate_document_with_citations(doc_gen_messages, doc_gen_profile, doc_gen_intent)

                # Benchmark 5: Document quality evaluation
                quality_scores = self.evaluate_document_quality(
                    document,
                    intent_schema=doc_gen_intent,
                    user_profile=doc_gen_profile,
                    query=query
                )
                quality_overall = quality_scores.get("overall_score", 0)

                # Accuracy scores are always computed on the *predicted*
                # profile/intent, regardless of what fed document generation.
                profile_accuracy = self._calculate_profile_accuracy(user_profile, query_data.get("persona", {}))
                intent_evaluation = self._calculate_intent_accuracy(intent, query_data, use_fuzzy_intent_matching)
                context_retrieval_accuracy = context_retrieval_result.f1_score
                citation_accuracy = self._calculate_citation_accuracy(document.citations, contextual_markers)

                # NOTE(review): this is an unweighted mean of four accuracy
                # metrics and the judge's overall_score as returned; confirm
                # all five are on the same scale.
                overall_score = (
                    profile_accuracy
                    + intent_evaluation.overall_accuracy
                    + context_retrieval_accuracy
                    + citation_accuracy
                    + quality_overall
                ) / 5

                input_source = "ground_truth" if use_ground_truth else "predicted"

                # Create benchmark result
                result = BenchmarkResult(
                    query_id=query_id,
                    user_profile_accuracy=profile_accuracy,
                    intent_capture_accuracy=intent_evaluation.overall_accuracy,  # For backwards compatibility
                    intent_evaluation=intent_evaluation,  # Enhanced intent metrics
                    context_retrieval_accuracy=context_retrieval_accuracy,
                    citation_accuracy=citation_accuracy,
                    document_quality_score=quality_overall,
                    overall_score=overall_score,
                    detailed_evaluation={
                        "user_profile": asdict(user_profile),
                        "intent": asdict(intent),
                        "context_retrieval": asdict(context_retrieval_result),
                        "document": asdict(document),
                        "quality_scores": quality_scores,
                        "ground_truth": query_data,
                        "evaluation_mode": "ground_truth" if use_ground_truth else "end_to_end",
                        "document_generation_inputs": {
                            "profile_source": input_source,
                            "intent_source": input_source,
                            "context_source": input_source
                        }
                    }
                )

                results.append(result)

                # Save intermediate results
                result_file = os.path.join(output_dir, f"{query_id}_result.json")
                with open(result_file, 'w', encoding='utf-8') as f:
                    json.dump(asdict(result), f, indent=2, ensure_ascii=False)

            except Exception as e:
                # Best-effort: one failing query must not abort the run, but
                # keep the traceback so the failure can be diagnosed.
                logger.exception(f"Error processing query {i+1}: {e}")
                continue

        # Save comprehensive results
        summary_file = os.path.join(output_dir, "benchmark_summary.json")
        self._save_benchmark_summary(results, summary_file)

        logger.info(f"Benchmark completed. Results saved to {output_dir}")
        return results

    @staticmethod
    def _coerce_utc_datetime(value):
        """Return ``value`` as a timezone-aware datetime.

        Accepts an ISO-8601 string (a trailing ``Z`` is read as ``+00:00``) or
        a datetime. Naive values are assumed to be UTC so that naive and aware
        timestamps can be compared with each other instead of raising.

        Raises:
            ValueError: if a string is not valid ISO-8601.
            TypeError: if ``value`` is neither a string nor a datetime.
        """
        from datetime import timezone

        if isinstance(value, str):
            text = value.strip()
            if text.endswith(("Z", "z")):
                text = text[:-1] + "+00:00"
            value = datetime.fromisoformat(text)
        elif not isinstance(value, datetime):
            raise TypeError(f"unsupported timestamp type: {type(value).__name__}")

        if value.tzinfo is None or value.utcoffset() is None:
            value = value.replace(tzinfo=timezone.utc)
        return value

    def _messages_up_to(self, messages, query_timestamp):
        """Return the messages whose timestamp is at or before ``query_timestamp``.

        Messages without a timestamp, or with one that cannot be parsed, are
        kept (best-effort). If there is no query timestamp, or the filter
        itself fails, the full ``messages`` list is returned unchanged.
        """
        if not query_timestamp:
            return messages

        try:
            query_time = self._coerce_utc_datetime(query_timestamp)

            kept = []
            for msg in messages:
                raw_timestamp = msg.get('composeTime') or msg.get('timestamp')
                if not raw_timestamp:
                    # Include messages without timestamps
                    kept.append(msg)
                    continue

                try:
                    msg_time = self._coerce_utc_datetime(raw_timestamp)
                except (TypeError, ValueError):
                    # Include message if timestamp parsing fails
                    kept.append(msg)
                    continue

                # Both sides are timezone-aware here, so this comparison
                # cannot fail on a naive/aware mismatch.
                if msg_time <= query_time:
                    kept.append(msg)
        except Exception as e:
            logger.warning(f"Error filtering by timestamp: {e}, using all messages")
            return messages

        logger.info(f"Filtered {len(messages)} messages to {len(kept)} messages before query time")
        return kept
    
    def _calculate_profile_accuracy(self, inferred: UserProfile, ground_truth: Dict) -> float:
        """Calculate accuracy of user profile inference using semantic similarity"""
        if not ground_truth:
            return 0.5  # Default score when no ground truth available
        
        scores = []
        
        # Compare role with semantic similarity
        if ground_truth.get("role"):
            role_score = semantic_similarity_score(inferred.role, ground_truth["role"])
            scores.append(role_score)
        
        # Compare tone with semantic similarity
        if ground_truth.get("tone"):
            tone_score = semantic_similarity_score(inferred.tone, ground_truth["tone"])
            scores.append(tone_score)
        
        # Compare communication style with semantic similarity
        if ground_truth.get("style"):
            style_score = semantic_similarity_score(inferred.communication_style, ground_truth["style"])
            scores.append(style_score)
        
        # Compare expertise level with semantic similarity
        if ground_truth.get("expertise"):
            expertise_score = semantic_similarity_score(inferred.expertise_level, ground_truth["expertise"])
            scores.append(expertise_score)
        
        return sum(scores) / len(scores) if scores else 0.5
    
    def _calculate_intent_accuracy(self, inferred: IntentSchema, ground_truth: Dict, use_fuzzy_matching: bool = False) -> IntentEvaluationResult:
        """
        Calculate enhanced intent evaluation metrics with detailed per-field analysis.

        Categorical fields are scored with either strict matching or semantic
        similarity; free-text fields are always scored with
        ``calculate_fuzzy_match_score``. A field is only evaluated when the
        ground truth holds a non-empty value for it. Each field yields a single
        score that is reported as its precision, recall and F1 alike.

        Args:
            inferred: The inferred intent schema
            ground_truth: Ground truth query data (intent will be extracted from ground_truth["intent"])
            use_fuzzy_matching: If True, uses fuzzy semantic similarity. If False (default), uses strict exact matching.

        Returns:
            IntentEvaluationResult with overall accuracy, macro-F1, and per-field precision/recall/F1.
            When no ground truth (or no intent in it) is available, a neutral
            result with 0.5 scores and no fields is returned.
        """
        def _neutral_result() -> IntentEvaluationResult:
            # Fresh dicts on every call so callers never share mutable state.
            return IntentEvaluationResult(
                overall_accuracy=0.5,
                macro_f1_score=0.5,
                per_field_precision={},
                per_field_recall={},
                per_field_f1={},
                field_count=0
            )

        def _as_list(value) -> list:
            # The fuzzy matcher is called with lists; wrap scalars, drop None.
            if value is None:
                return []
            if isinstance(value, (list, tuple)):
                return list(value)
            return [value]

        if not ground_truth:
            return _neutral_result()

        # Extract intent portion from ground truth data
        ground_truth_intent = ground_truth.get("intent", {})
        if not ground_truth_intent:
            logger.warning("No intent found in ground truth data")
            return _neutral_result()

        logger.debug(f"Ground truth intent fields: {list(ground_truth_intent.keys())}")
        logger.debug(f"Predicted intent: {inferred}")

        # Define categorical fields for strict evaluation - map predicted fields to ground truth fields
        categorical_fields = [
            ("document_type", "document_type"),
            ("target_audience", "target_audience"),
            ("detail_level", "detail_level"),
            ("temporal_scope", "temporal_scope"),
            ("tone_preference", "tone")  # Ground truth uses "tone", not "tone_preference"
        ]

        # Choose scoring function based on parameter
        scoring_func = semantic_similarity_score if use_fuzzy_matching else strict_categorical_match

        field_results = {}
        scores = []

        def _record(field_name: str, score) -> None:
            # One score per field: reported as accuracy, precision, recall and F1.
            field_results[field_name] = {
                "accuracy": score,
                "precision": score,
                "recall": score,
                "f1": score
            }
            scores.append(score)

        # Evaluate categorical fields
        for field_name, gt_key in categorical_fields:
            logger.debug(f"Checking field {field_name} -> {gt_key}: ground_truth_intent has {gt_key}={ground_truth_intent.get(gt_key)}")
            if not ground_truth_intent.get(gt_key):
                logger.debug(f"Skipping {field_name} because ground_truth_intent[{gt_key}] is missing or empty")
                continue

            predicted_value = getattr(inferred, field_name)
            expected_value = ground_truth_intent[gt_key]
            logger.debug(f"Evaluating {field_name}: predicted='{predicted_value}' vs expected='{expected_value}'")

            score = scoring_func(predicted_value, expected_value)
            logger.debug(f"Score for {field_name}: {score}")
            _record(field_name, score)

        # Evaluate free-text fields with fuzzy matching (always fuzzy for these)
        free_text_fields = [
            ("format_requirements", "format_instruction"),  # Ground truth uses "format_instruction"
            ("specific_topics", "document_structure"),       # Ground truth uses "document_structure"
            # NOTE(review): the mapping to "visual_elements" is kept as declared;
            # confirm it is the intended counterpart of source_constraints.
            ("source_constraints", "visual_elements")
        ]

        for field_name, gt_key in free_text_fields:
            # These keys belong to the intent section; the top-level lookup is
            # kept only as a fallback for data that stores them there.
            expected_raw = ground_truth_intent.get(gt_key) or ground_truth.get(gt_key)
            if not expected_raw:
                continue

            # Every free-text field is handled uniformly, so no field can
            # reuse the previous iteration's values or hit an unbound name.
            predicted_value = _as_list(getattr(inferred, field_name))
            expected_value = _as_list(expected_raw)

            score = calculate_fuzzy_match_score(predicted_value, expected_value, threshold=0.5)
            _record(field_name, score)

        # Calculate aggregate metrics
        overall_accuracy = sum(scores) / len(scores) if scores else 0.5

        # Extract per-field metrics
        per_field_precision = {field: metrics["precision"] for field, metrics in field_results.items()}
        per_field_recall = {field: metrics["recall"] for field, metrics in field_results.items()}
        per_field_f1 = {field: metrics["f1"] for field, metrics in field_results.items()}

        # Calculate macro-F1 (average of per-field F1 scores)
        macro_f1_score = sum(per_field_f1.values()) / len(per_field_f1) if per_field_f1 else 0.5

        return IntentEvaluationResult(
            overall_accuracy=overall_accuracy,
            macro_f1_score=macro_f1_score,
            per_field_precision=per_field_precision,
            per_field_recall=per_field_recall,
            per_field_f1=per_field_f1,
            field_count=len(field_results)
        )
    
    def _calculate_citation_accuracy(self, citations: List[DocumentCitation], ground_truth: Dict) -> float:
        """Calculate accuracy of citations with fuzzy matching and semantic relevance"""
        if not ground_truth or not ground_truth.get("entities"):
            return 0.5
        
        cited_msg_ids = {citation.message_id for citation in citations}
        expected_msg_ids = {entity[1] for entity in ground_truth["entities"] if len(entity) > 1}
        
        if not expected_msg_ids:
            return 0.5
        
        if not cited_msg_ids:
            return 0.0
        
        # Calculate exact overlap (precision for exact matches)
        exact_overlap = len(cited_msg_ids.intersection(expected_msg_ids))
        
        # Calculate recall (how many expected messages were cited)
        recall = exact_overlap / len(expected_msg_ids) if expected_msg_ids else 0.0
        
        # Calculate precision (how many cited messages were relevant)
        precision = exact_overlap / len(cited_msg_ids) if cited_msg_ids else 0.0
        
        # Add partial credit for messages from same conversation threads or time windows
        # This accounts for related but not exactly matching citations
        partial_credit = 0.0
        if exact_overlap < len(expected_msg_ids):  # Only if not all expectations met
            # Check for related messages (same author, similar timestamp, etc.)
            for expected_id in expected_msg_ids:
                if expected_id not in cited_msg_ids:
                    # Look for related citations (this is a simplified heuristic)
                    for cited_id in cited_msg_ids:
                        if expected_id not in cited_msg_ids:
                            # Give partial credit for citing messages from same conversation
                            # This is a simplified approach - in practice you'd use conversation threading
                            try:
                                exp_num = int(expected_id.split('_')[-1]) if '_' in expected_id else 0
                                cited_num = int(cited_id.split('_')[-1]) if '_' in cited_id else 0
                                if abs(exp_num - cited_num) <= 5:  # Within 5 messages
                                    partial_credit += 0.3  # 30% credit for nearby messages
                                    break
                            except (ValueError, IndexError):
                                continue
        
        # Normalize partial credit
        partial_credit = min(partial_credit / len(expected_msg_ids), 0.3) if expected_msg_ids else 0.0
        
        # Final score: weighted combination of recall, precision, and partial credit
        if recall == 0.0 and precision == 0.0:
            return partial_credit
        
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        return min(f1_score + partial_credit, 1.0)
    
    def _save_benchmark_summary(self, results: List[BenchmarkResult], output_file: str):
        """Save benchmark summary statistics with enhanced intent metrics"""
        if not results:
            return
        
        # Calculate per-field intent metrics averages
        all_fields = set()
        for result in results:
            all_fields.update(result.intent_evaluation.per_field_precision.keys())
        
        per_field_precision_avg = {}
        per_field_recall_avg = {}
        per_field_f1_avg = {}
        
        for field in all_fields:
            field_precisions = [r.intent_evaluation.per_field_precision.get(field, 0) for r in results]
            field_recalls = [r.intent_evaluation.per_field_recall.get(field, 0) for r in results]
            field_f1s = [r.intent_evaluation.per_field_f1.get(field, 0) for r in results]
            
            per_field_precision_avg[field] = sum(field_precisions) / len(field_precisions)
            per_field_recall_avg[field] = sum(field_recalls) / len(field_recalls)
            per_field_f1_avg[field] = sum(field_f1s) / len(field_f1s)
        
        summary = {
            "total_queries": len(results),
            "average_scores": {
                "user_profile_accuracy": sum(r.user_profile_accuracy for r in results) / len(results),
                "intent_capture_accuracy": sum(r.intent_capture_accuracy for r in results) / len(results),
                "intent_macro_f1": sum(r.intent_evaluation.macro_f1_score for r in results) / len(results),
                "context_retrieval_accuracy": sum(r.context_retrieval_accuracy for r in results) / len(results),
                "citation_accuracy": sum(r.citation_accuracy for r in results) / len(results),
                "document_quality_score": sum(r.document_quality_score for r in results) / len(results),
                "overall_score": sum(r.overall_score for r in results) / len(results)
            },
            "intent_detailed_metrics": {
                "per_field_precision": per_field_precision_avg,
                "per_field_recall": per_field_recall_avg,
                "per_field_f1": per_field_f1_avg,
                "average_macro_f1": sum(r.intent_evaluation.macro_f1_score for r in results) / len(results),
                "evaluated_fields": list(all_fields)
            },
            "score_distribution": {
                "excellent": len([r for r in results if r.overall_score >= 4.0]),
                "good": len([r for r in results if 3.0 <= r.overall_score < 4.0]),
                "fair": len([r for r in results if 2.0 <= r.overall_score < 3.0]),
                "poor": len([r for r in results if r.overall_score < 2.0])
            },
            "timestamp": datetime.now().isoformat(),
            "detailed_results": [asdict(result) for result in results]
        }
        
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)


def main():
    """Run the benchmark on the Finance dataset and print a short summary.

    Processes at most 10 queries, writes results under ``./benchmark_results``
    and prints the query count and mean overall score when any query
    produced a result.
    """
    # Keyword arguments for the benchmark run; max_queries is a testing limit.
    run_settings = {
        "conversation_file": "../data/Finance/synthetic_domain_channels_Finance.json",
        "queries_file": "./synthetic_queries/generated_user_queries_Finance.json",
        "output_dir": "./benchmark_results",
        "max_queries": 10,
    }

    runner = DocumentGenerationBenchmark()
    results = runner.run_comprehensive_benchmark(**run_settings)

    # Nothing to report when no query completed.
    if not results:
        return

    mean_overall = sum(entry.overall_score for entry in results) / len(results)
    print("\nBenchmark Summary:")
    print(f"Total Queries Processed: {len(results)}")
    print(f"Average Overall Score: {mean_overall:.2f}")
    print(f"Results saved to: {run_settings['output_dir']}")


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
