import networkx as nx
import random
import json
import os
from datetime import datetime, timedelta
from collections import defaultdict, Counter
import re
import ctypes
from typing import Dict, List, Tuple, Set, Optional
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI


def call_llm(prompt: str, client) -> str:
    """Send ``prompt`` to the chat-completions API and return the reply text.

    A fixed system message frames the model as a generator of realistic user
    queries. The function always returns a string: any exception raised while
    talking to ``client`` (or while reading the reply) is rendered as
    ``[Error generating query: ...]``, and an empty or missing reply becomes
    ``[Error: LLM returned no content]``.

    Args:
        prompt: Text sent as the user message.
        client: Object exposing ``chat.completions.create(**kwargs)``.

    Returns:
        The stripped reply text, or one of the bracketed error strings above.
    """
    system_message = (
        "You are an expert at generating realistic user queries for document generation. "
        "Generate natural, context-aware queries that reflect real user behavior and needs. "
        "Make the queries specific, actionable, and appropriate for the given context."
    )
    request = dict(
        model="gpt-4.1",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=1000,
        presence_penalty=0.3,
        frequency_penalty=0.3,
    )

    try:
        completion = client.chat.completions.create(**request)
        reply = completion.choices[0].message.content
        if not reply:
            return "[Error: LLM returned no content]"
        return reply.strip()
    except Exception as exc:
        # Best effort by design: callers check for the "[Error" prefix.
        return f"[Error generating query: {exc}]"


class UserQueryGenerator:
    """
    Generates synthetic user queries for document generation (status reports and emails)
    based on knowledge graph structure and contextual markers.
    """
    
    def __init__(self, graph_path: str, use_llm_for_entities: bool = False):
        """Load the knowledge graph and build the Azure OpenAI client.

        Args:
            graph_path: Path to a GML file, loaded with ``networkx.read_gml``.
            use_llm_for_entities: Stored on the instance as
                ``self.use_llm_for_entities``.
        """
        self.graph = nx.read_gml(graph_path)
        self.contextual_markers = dict()
        self.use_llm_for_entities = use_llm_for_entities

        # The client authenticates with Azure AD bearer tokens obtained through
        # DefaultAzureCredential; the endpoint can be overridden via ENDPOINT_URL.
        self.client = AzureOpenAI(
            azure_endpoint=os.getenv(
                "ENDPOINT_URL", "https://winsightsboteus2.openai.azure.com/"
            ),
            azure_ad_token_provider=get_bearer_token_provider(
                DefaultAzureCredential(),
                "https://cognitiveservices.azure.com/.default",
            ),
            api_version="2024-05-01-preview",
        )
        
    def generate_query_prompt(self, intent_schema: Dict, contextual_markers: Dict, user_persona: Dict) -> str:
        """Build the LLM prompt that asks for one natural-sounding user request.

        Args:
            intent_schema: Read for ``document_structure`` (list of section
                keys; only the first three are considered), ``tone``,
                ``temporal_scope`` and ``target_audience``. Unknown or missing
                values simply contribute no hint.
            contextual_markers: Read for ``project_context``, a dict whose
                ``project`` and ``topic`` entries are interpolated into the
                prompt.
            user_persona: Read for ``role``.

        Returns:
            The prompt text with surrounding whitespace stripped.

        Missing *or empty* project/topic/role values fall back to
        ``'Unknown'`` / ``'the project'`` / ``'Team Member'``. Empty strings
        need the same treatment as missing keys, otherwise the prompt reads
        "request about ." and "- Project: " with nothing after it.
        """
        # `or` instead of a .get() default so that '' and None also fall back.
        context = contextual_markers.get('project_context') or {}
        project = context.get('project') or 'Unknown'
        topic = context.get('topic')
        topic_subject = topic or 'the project'
        topic_label = topic or 'Unknown'
        role = (user_persona or {}).get('role') or 'Team Member'

        # Map intent to business contexts
        urgency_map = {
            'urgent': 'quickly',
            'formal': 'for leadership',
            'conversational': 'for the team',
            'executive': 'for senior management',
        }

        temporal_map = {
            'last_week': 'recent progress',
            'past_month': 'monthly update',
            'quarter': 'quarterly review',
            'ongoing': 'current status',
            'upcoming': 'future plans',
        }

        audience_map = {
            'executives': 'leadership',
            'team_members': 'the team',
            'stakeholders': 'stakeholders',
            'management': 'management',
            'clients': 'external partners',
        }

        # Map technical document sections to natural language hints
        section_to_hint = {
            'executive_summary': 'high-level overview',
            'project_overview': 'project background',
            'timeline_and_milestones': 'timeline and key dates',
            'progress_highlights': 'key progress',
            'key_achievements': 'wins and accomplishments',
            'completed_deliverables': "what we've delivered",
            'current_phase_status': 'where we stand now',
            'upcoming_deadlines': "what's coming up",
            'next_steps': 'next actions',
            'action_items': 'action items',
            'challenges_and_blockers': 'issues and blockers',
            'risks_and_mitigation': 'risks and concerns',
            'budget_status': 'budget and costs',
            'resource_allocation': 'team and resources',
            'team_performance': 'team performance',
            'quality_metrics': 'quality and metrics',
            'stakeholder_feedback': 'stakeholder input',
            'compliance_status': 'compliance status',
            'lessons_learned': 'lessons learned',
            'dependencies': 'dependencies',
            'technical_architecture': 'technical details',
            'testing_results': 'testing outcomes',
            'deployment_status': 'deployment progress',
            # Email sections
            'summary_update': 'summary',
            'key_decisions_made': 'decisions made',
            'timeline_updates': 'timeline changes',
            'milestone_achievements': 'milestones reached',
            'urgent_matters': 'urgent issues',
            'blockers_requiring_attention': 'blockers',
            'stakeholder_updates': 'stakeholder news',
            'budget_implications': 'budget impact',
            'resource_needs': 'resource requirements',
            'meeting_outcomes': 'meeting results',
            'deliverable_status': 'deliverable updates',
            'risk_alerts': 'risk updates',
            'approvals_needed': 'pending approvals',
            'schedule_changes': 'schedule updates',
            'team_announcements': 'team news',
            'technical_updates': 'technical changes',
            # FAQ sections
            'getting_started': 'getting started guide',
            'common_questions': 'frequently asked questions',
            'troubleshooting_guide': 'troubleshooting help',
            'best_practices': 'best practices',
            'process_workflows': 'process steps',
            'system_requirements': 'system requirements',
            'access_and_permissions': 'access instructions',
            'contact_information': 'contact details',
            'escalation_procedures': 'escalation process',
            'technical_specifications': 'technical specs',
            'integration_guidelines': 'integration guide',
            'security_policies': 'security requirements',
            'compliance_requirements': 'compliance rules',
            'training_resources': 'training materials',
            'known_issues': 'known problems',
            'feature_explanations': 'feature descriptions',
            'configuration_steps': 'setup instructions',
        }

        # Only the first three listed sections are considered; those without a
        # mapping are dropped, so fewer than three hints may result.
        document_structure = intent_schema.get('document_structure') or []
        content_hints = [
            section_to_hint[section]
            for section in document_structure[:3]
            if section in section_to_hint
        ]

        # Build natural context hints: tone, then time scope, then audience.
        context_hints = []

        tone = intent_schema.get('tone', '')
        if tone in urgency_map:
            context_hints.append(urgency_map[tone])

        temporal = intent_schema.get('temporal_scope', '')
        if temporal in temporal_map:
            context_hints.append(temporal_map[temporal])

        audience = intent_schema.get('target_audience', '')
        if audience in audience_map:
            context_hints.append(f"for {audience_map[audience]}")

        hint_text = ", ".join(context_hints) if context_hints else "about the project"
        content_hint_text = ", ".join(content_hints)
        including_clause = f", including {content_hint_text}" if content_hint_text else ""

        prompt = f"""
                Generate a concise but natural workplace request about {topic_subject}.
                
                Context:
                - Project: {project}
                - Topic: {topic_label}
                - User Role: {role}
                
                The user needs information {hint_text}{including_clause}.
                
                Requirements:
                1. DO NOT mention document types like "status report", "email", or "FAQ"
                2. Use natural business language
                3. Make the request sound conversational and realistic
                4. Reference the project/topic naturally
                5. Subtly hint at the content areas needed without being too explicit
                
                Examples:
                - "Can you help me get up to speed on where we are with fraud detection - I need the key progress and what's coming up next?"
                - "I need to brief the executives on our compliance progress - what's the latest with timeline changes and any blockers?"
                - "New team members keep asking about the treasury system - we should document the getting started guide and common questions"
                
                Generate ONE natural request:
                """

        return prompt.strip()
    
    def extract_contextual_markers(self, target_node_id: int, target_type: str, query_timestamp: Optional[str] = None) -> Dict:
        """
        Extract contextual markers from the message nodes connected to a target node.

        Messages are discovered by scanning graph edges that end at the target:
        for a phase, ``message -> phase`` edges with relation
        ``discussed_in_phase``; for a topic, ``phase -> topic`` edges with
        relation ``has_phase`` followed by the messages of each such phase.
        If nothing is found (including for an unrecognised ``target_type``),
        up to 10 arbitrary message nodes are used as a fallback.

        Args:
            target_node_id: ID of the target node (phase or topic)
            target_type: Type of target ('phase' or 'topic')
            query_timestamp: When the query is being made. Messages whose
                timestamp is not strictly before this are dropped. Messages
                with no timestamp, or one that cannot be parsed/compared, are
                kept. An unparseable ``query_timestamp`` disables filtering.

        Returns:
            Dictionary containing the extracted markers, each ``*_sources``
            list pairing a marker with the ID of the message it came from,
            plus ``project_context`` (taken from the target node) and
            ``ground_truth_messages``.

        Notes:
            - At most 15 messages (after temporal filtering) are processed.
            - ``metadata`` is overwritten per message, so it ends up holding
              the values of the last processed message.
            - When LLM extraction is enabled but fails for a message, the
              full regex extraction is used for that message, so it still
              contributes every marker type rather than entities only.
        """
        markers = {
            'entities': Counter(),
            'temporal_expressions': [],
            'user_actions': [],
            'metadata': {},
            'key_decisions': [],
            'unresolved_questions': [],
            'mentioned_tools': [],
            'deliverable_sources': [],
            'project_context': {},
            # Track message IDs for each extracted marker type
            'entity_sources': [],  # [(entity, message_id)]
            'temporal_sources': [],  # [(expression, message_id)]
            'action_sources': [],  # [(action, message_id)]
            'decision_sources': [],  # [(decision, message_id)]
            'question_sources': [],  # [(question, message_id)]
            'tool_sources': [],  # [(tool, message_id)]
            'deliverable_sources_with_ids': []  # [(deliverable/link, message_id)]
        }
        # (extracted key == markers list key, matching *_sources key)
        list_marker_keys = (
            ('temporal_expressions', 'temporal_sources'),
            ('user_actions', 'action_sources'),
            ('key_decisions', 'decision_sources'),
            ('unresolved_questions', 'question_sources'),
            ('mentioned_tools', 'tool_sources'),
            ('deliverable_sources', 'deliverable_sources_with_ids'),
        )
        max_fallback_messages = 10
        max_processed_messages = 15

        # Get target node information
        target_node = self.graph.nodes[target_node_id]

        print(f"  Target node info: type={target_node.get('type')}, label={target_node.get('label', 'N/A')}")

        def incoming_sources(node_id, source_type, relation):
            """Return, in edge order, sources of edges ending at node_id that match type and relation."""
            found = []
            for source, target in self.graph.edges():
                if target != node_id:
                    continue
                if self.graph.nodes[source].get('type') != source_type:
                    continue
                if self.graph[source][target].get('relation') == relation:
                    found.append(source)
            return found

        def parse_timestamp(value):
            """Parse ISO-8601 strings (accepting a trailing 'Z'); pass other values through."""
            if isinstance(value, str):
                if value.endswith('Z'):
                    # fromisoformat() rejects the 'Z' suffix before Python 3.11.
                    value = value[:-1] + '+00:00'
                return datetime.fromisoformat(value)
            return value

        # Use direct edge traversal to find connected messages
        connected_messages = []

        if target_type == 'phase':
            print(f"    Looking for messages directly connected to phase: {target_node_id}")
            connected_messages = [
                (msg_node, self.graph.nodes[msg_node])
                for msg_node in incoming_sources(target_node_id, 'message', 'discussed_in_phase')
            ]
            print(f"    Found {len(connected_messages)} messages directly connected to phase")

        elif target_type == 'topic':
            target_topic_label = target_node.get('label', target_node_id)
            print(f"    Looking for phases connected to topic: {target_topic_label}")

            connected_phases = incoming_sources(target_node_id, 'phase', 'has_phase')
            print(f"    Found {len(connected_phases)} phases connected to topic")

            # A message may belong to several phases; keep its first occurrence only.
            seen_message_nodes = set()
            for phase_id in connected_phases:
                for msg_node in incoming_sources(phase_id, 'message', 'discussed_in_phase'):
                    if msg_node not in seen_message_nodes:
                        seen_message_nodes.add(msg_node)
                        connected_messages.append((msg_node, self.graph.nodes[msg_node]))

            print(f"    Found {len(connected_messages)} total messages across topic phases")

        # Fallback: If no messages found through direct edges, try broader search
        if not connected_messages:
            print("  No messages found through direct edges, using fallback search...")

            all_messages = [
                (node_id, node_data)
                for node_id, node_data in self.graph.nodes(data=True)
                if node_data.get('type') == 'message'
            ]
            connected_messages = all_messages[:max_fallback_messages]

            print(f"    Found {len(all_messages)} total messages in graph, using {len(connected_messages)} as fallback")

        print(f"  Found {len(connected_messages)} connected messages for {target_type} {target_node_id}")

        # Extract markers from messages with temporal filtering
        query_time = None
        if query_timestamp:
            try:
                query_time = parse_timestamp(query_timestamp)
            except (ValueError, TypeError):
                query_time = None

        def is_before_query(message):
            """True when the message predates the query, or when that cannot be determined."""
            msg_timestamp = message.get('timestamp')
            if not msg_timestamp:
                return True  # Include if no timestamp
            try:
                return parse_timestamp(msg_timestamp) < query_time
            except (ValueError, TypeError):
                # Unparseable value or naive/aware mismatch: keep the message.
                return True

        if query_time is not None:
            filtered_messages = [
                (message_id, message)
                for message_id, message in connected_messages
                if is_before_query(message)
            ]
        else:
            filtered_messages = connected_messages

        # Limit to most relevant messages for processing efficiency
        filtered_messages = filtered_messages[:max_processed_messages]
        print(f"  After temporal filtering: {len(filtered_messages)} messages")

        for message_id, message in filtered_messages:
            content = message.get('content', '')
            content_length = len(content)

            extracted = None
            log_suffix = ""
            if self.use_llm_for_entities:
                # Use unified LLM extraction for all markers
                extracted = self._extract_all_markers_with_llm(content)
                if not extracted:
                    print(f"    Message {message_id}: LLM extraction failed, falling back to regex")

            if not extracted:
                # Regex path: each extractor runs exactly once per message.
                # NOTE(review): assumes the _extract_* helpers are side-effect free.
                extracted = {
                    'entities': self._extract_entities_regex(content),
                    'temporal_expressions': self._extract_temporal_expressions(content),
                    'user_actions': self._extract_actions(content),
                    'key_decisions': self._extract_decisions(content),
                    'unresolved_questions': self._extract_questions(content),
                    'mentioned_tools': self._extract_tools(content),
                    'deliverable_sources': self._extract_deliverable_sources(content),
                }
                log_suffix = " (regex)"

            print(
                f"    Message {message_id}: {content_length} chars -> "
                f"{len(extracted['entities'])} entities, "
                f"{len(extracted['key_decisions'])} decisions, "
                f"{len(extracted['unresolved_questions'])} questions{log_suffix}"
            )

            for entity in extracted['entities']:
                markers['entities'][entity] += 1
                markers['entity_sources'].append((entity, message_id))

            for marker_key, source_key in list_marker_keys:
                for item in extracted[marker_key]:
                    markers[marker_key].append(item)
                    markers[source_key].append((item, message_id))

            # Overwritten on every iteration: the last message's values win.
            markers['metadata'].update({
                'author': message.get('author', ''),
                'timestamp': message.get('timestamp', ''),
                'message_type': message.get('message_type', '')
            })

        # Log extraction summary
        print("  Extraction Summary:")
        print(f"    Total entities: {len(markers['entity_sources'])}")
        print(f"    Total decisions: {len(markers['decision_sources'])}")
        print(f"    Total questions: {len(markers['question_sources'])}")
        print(f"    Total temporal expressions: {len(markers['temporal_sources'])}")
        print(f"    Total user actions: {len(markers['action_sources'])}")
        print(f"    Total tools mentioned: {len(markers['tool_sources'])}")
        print(f"    Total deliverable sources: {len(markers['deliverable_sources_with_ids'])}")

        # Add project context
        is_phase = target_type == 'phase'
        markers['project_context'] = {
            'project': target_node.get('project', ''),
            'topic': target_node.get('topic', '') if is_phase else target_node.get('label', ''),
            'phase_name': target_node.get('phase_name', '') if is_phase else '',
            'status': target_node.get('status', ''),
            'owner': target_node.get('owner', ''),
            'start_date': target_node.get('start_date', ''),
            'end_date': target_node.get('end_date', ''),
            'target_date': target_node.get('target_date', '')
        }

        # Collect ground truth messages - all messages from the target phase/topic
        markers['ground_truth_messages'] = self._collect_ground_truth_messages(target_node_id, target_type, query_timestamp)

        return markers
    
    def _collect_ground_truth_messages(self, target_node_id: int, target_type: str, query_timestamp: Optional[str] = None) -> List[str]:
        """
        Collect the message IDs considered ground truth for the given target.

        For a phase, every message node with an edge ending at the phase is
        taken. For a topic, phases with an edge ending at the topic are found
        first (falling back to phase nodes whose ``topic`` attribute equals
        the topic's label), then the messages of each phase are collected
        without duplicates. Edge relations are not inspected here.

        Args:
            target_node_id: ID of the target node (phase or topic)
            target_type: Type of target ('phase' or 'topic'); any other value
                yields an empty list.
            query_timestamp: When the query is being made. Messages with a
                timestamp later than this are excluded. Messages with no
                timestamp, or one that cannot be parsed/compared, are kept.
                An unparseable ``query_timestamp`` disables filtering.

        Returns:
            List of message IDs. Each ID is the node's ``msg_node`` attribute,
            else its ``id`` attribute, else the graph node ID as a string.
        """
        target_node = self.graph.nodes[target_node_id]

        print(f"  Collecting ground truth messages for {target_type} {target_node_id}")

        def parse_timestamp(value):
            """Parse ISO-8601 strings (accepting a trailing 'Z'); pass other values through."""
            if isinstance(value, str):
                if value.endswith('Z'):
                    # fromisoformat() rejects the 'Z' suffix before Python 3.11.
                    value = value[:-1] + '+00:00'
                return datetime.fromisoformat(value)
            return value

        def sources_of_type(node_id, node_type):
            """Return, in edge order, sources of edges ending at node_id whose node type matches."""
            return [
                source
                for source, target in self.graph.edges()
                if target == node_id and self.graph.nodes[source].get('type') == node_type
            ]

        # Parse query timestamp for temporal filtering
        query_time = None
        if query_timestamp:
            try:
                query_time = parse_timestamp(query_timestamp)
            except (ValueError, TypeError):
                query_time = None

        # Initialised up front so an unrecognised target_type yields an empty
        # result instead of an UnboundLocalError further down.
        connected_messages = []

        if target_type == 'phase':
            # For phases: Use direct message -> phase edges
            print(f"    Looking for messages directly connected to phase: {target_node_id}")

            connected_messages = [
                (msg_node, self.graph.nodes[msg_node])
                for msg_node in sources_of_type(target_node_id, 'message')
            ]

            print(f"    Found {len(connected_messages)} messages directly connected to phase")

        elif target_type == 'topic':
            # For topics: First get all phases connected to this topic, then get their messages
            target_topic_label = target_node.get('label', target_node_id)
            print(f"    Looking for phases connected to topic: {target_topic_label}")

            connected_phases = sources_of_type(target_node_id, 'phase')

            # Alternative: find phases by topic attribute matching
            if not connected_phases:
                connected_phases = [
                    node_id
                    for node_id, node_data in self.graph.nodes(data=True)
                    if node_data.get('type') == 'phase'
                    and node_data.get('topic') == target_topic_label
                ]

            print(f"    Found {len(connected_phases)} phases connected to topic")

            # A message may belong to several phases; keep its first occurrence only.
            seen_message_nodes = set()
            for phase_id in connected_phases:
                for msg_node in sources_of_type(phase_id, 'message'):
                    if msg_node not in seen_message_nodes:
                        seen_message_nodes.add(msg_node)
                        connected_messages.append((msg_node, self.graph.nodes[msg_node]))

            print(f"    Found {len(connected_messages)} total messages across topic phases")

        def is_within_query_time(message_data):
            """True when the message is not later than the query, or when that cannot be determined."""
            msg_timestamp = message_data.get('timestamp')
            if not msg_timestamp:
                return True  # Include messages without timestamps
            try:
                return parse_timestamp(msg_timestamp) <= query_time
            except (ValueError, TypeError):
                # Unparseable value or naive/aware mismatch: keep the message.
                return True

        # Apply temporal filtering
        ground_truth_ids = []
        for message_id, message_data in connected_messages:
            if query_time is not None and not is_within_query_time(message_data):
                continue
            ground_truth_ids.append(
                message_data.get('msg_node', message_data.get('id', str(message_id)))
            )

        print(f"    After temporal filtering: {len(ground_truth_ids)} messages")
        print(f"  Ground truth collection complete: {len(ground_truth_ids)} message IDs")

        return ground_truth_ids
    
    def _extract_all_markers_with_llm(self, content: str) -> Optional[Dict]:
        """Use the LLM to extract all contextual markers from content in one call.

        Args:
            content: Message text embedded verbatim in the extraction prompt.

        Returns:
            A dict with the keys ``entities``, ``temporal_expressions``,
            ``user_actions``, ``key_decisions``, ``unresolved_questions``,
            ``mentioned_tools`` and ``deliverable_sources``, each a list of
            stripped, non-empty strings (entities additionally need at least
            three characters). Returns ``None`` when the LLM call reports an
            error or the reply is not a JSON object, so callers can fall back
            to another extraction strategy.

        The reply may be wrapped in a Markdown code fence; the fence is
        removed before parsing. Non-string list items are skipped rather than
        failing the whole extraction.
        """
        prompt = f"""
                Analyze the following message content and extract key information. Return a JSON object with the following structure:

                {{
                    "entities": ["list of key entities like projects, tools, concepts, people"],
                    "temporal_expressions": ["dates, deadlines, timelines mentioned"],
                    "user_actions": ["requests, suggestions, decisions, clarifications"],
                    "key_decisions": ["important decisions, owner assignment, target date, or conclusions made"],
                    "unresolved_questions": ["questions, concerns, blockers, issues raised"],
                    "mentioned_tools": ["technical tools, techniques, workflows mentioned"],
                    "deliverable_sources": ["URLs, links, file paths, document references, attachments mentioned"]
                }}

                Focus on:
                - Entities: Project names, technical concepts, important topics, people/roles
                - Temporal: Specific dates, deadlines, milestones, timeline references
                - Actions: What people are asking for, proposing, deciding
                - Decisions: Conclusions reached, choices made, approvals given, owner assigned, decided dates
                - Questions: Open issues, concerns, blockers, uncertainties
                - Tools: Technical tools, software, platforms, systems
                - Deliverable Sources: URLs (http/https), file paths, document names, shared drive links, attachments, references to reports or deliverables

                Message content:
                {content}

                Return only valid JSON:
            """

        # NOTE: no function-local `import json` here. A local import would make
        # `json` a local name, and `except json.JSONDecodeError` would then raise
        # UnboundLocalError whenever something failed before the import ran.
        try:
            response = call_llm(prompt, self.client)
        except Exception as e:
            print(f"Error in LLM marker extraction: {e}")
            return None

        if not isinstance(response, str) or response.startswith('[Error'):
            return None

        # Chat models frequently wrap JSON in ```json ... ``` despite instructions.
        payload = response.strip()
        fenced = re.match(r'^```[A-Za-z]*\s*(.*?)\s*```$', payload, re.DOTALL)
        if fenced:
            payload = fenced.group(1)

        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError as e:
            print(f"JSON parsing error in LLM marker extraction: {e}")
            return None

        if not isinstance(parsed, dict):
            print(f"Error in LLM marker extraction: expected a JSON object, got {type(parsed).__name__}")
            return None

        def clean(key, min_length=1):
            """Return the stripped string items under key that have at least min_length characters."""
            values = parsed.get(key) or []
            if isinstance(values, str):
                values = [values]  # a lone string, not a list of characters
            elif not isinstance(values, list):
                values = []
            stripped = (value.strip() for value in values if isinstance(value, str))
            return [value for value in stripped if len(value) >= min_length]

        return {
            'entities': clean('entities', min_length=3),
            'temporal_expressions': clean('temporal_expressions'),
            'user_actions': clean('user_actions'),
            'key_decisions': clean('key_decisions'),
            'unresolved_questions': clean('unresolved_questions'),
            'mentioned_tools': clean('mentioned_tools'),
            'deliverable_sources': clean('deliverable_sources'),
        }
    
    def _extract_entities_regex(self, content: str) -> List[str]:
        """Extract entities using regex patterns (fallback when LLM is not used)."""
        entities = []
        
        # Look for project names, technical terms, proper nouns
        patterns = [
            r'\b[A-Z][a-zA-Z]*[A-Z][a-zA-Z]*\b',  # CamelCase
            r'\b[A-Z]{2,}\b',  # UPPERCASE acronyms
            r'Project\s+[A-Z]\w+',  # Project names
            r'Team\s+[A-Z]\w+',  # Team names
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content)
            entities.extend(matches)
        
        return list(set(entities))
    
    def _extract_temporal_expressions(self, content: str) -> List[str]:
        """Extract temporal expressions using regex patterns."""
        temporal = []
        
        patterns = [
            r'\b\d{1,2}/\d{1,2}/\d{4}\b',  # MM/DD/YYYY
            r'\b\d{4}-\d{2}-\d{2}\b',  # YYYY-MM-DD
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{1,2},?\s+\d{4}\b',  # Month DD, YYYY
            r'\b(?:today|tomorrow|yesterday|next week|last week|this week)\b',  # Relative dates
            r'\bdeadline|due date|target date\b',  # Deadline keywords
            r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)\b'  # Times
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            temporal.extend(matches)
        
        return list(set(temporal))
    
    def _extract_actions(self, content: str) -> List[str]:
        """Extract user actions using regex patterns."""
        actions = []
        
        patterns = [
            r'\b(?:I|we|they)\s+(?:need|want|should|will|plan to|going to)\s+[^.!?]*',
            r'\b(?:please|can you|could you|would you)\s+[^.!?]*',
            r'\b(?:request|asking|suggesting|proposing)\s+[^.!?]*',
            r'\blet\'s\s+[^.!?]*'
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            actions.extend([match.strip() for match in matches])
        
        return list(set(actions))
    
    def _extract_decisions(self, content: str) -> List[str]:
        """Extract key decisions using regex patterns.""" 
        decisions = []
        
        patterns = [
            r'\b(?:decided|concluded|determined|agreed|approved)\s+[^.!?]*',
            r'\b(?:we will|we\'ll|final decision|outcome)\s+[^.!?]*',
            r'\b(?:resolution|verdict|choice made)\s+[^.!?]*'
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            decisions.extend([match.strip() for match in matches])
        
        return list(set(decisions))
    
    def _extract_questions(self, content: str) -> List[str]:
        """Extract unresolved questions using regex patterns."""
        questions = []
        
        # Direct questions
        question_sentences = re.findall(r'[^.!?]*\?', content)
        questions.extend(question_sentences)
        
        # Uncertainty expressions
        uncertainty_patterns = [
            r'\b(?:not sure|unclear|uncertain|confused)\s+[^.!?]*',
            r'\b(?:issue|problem|concern|blocker)\s+[^.!?]*',
            r'\b(?:how|what|why|when|where|which)\s+[^.!?]*'
        ]
        
        for pattern in uncertainty_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            questions.extend([match.strip() for match in matches])
        
        return list(set(questions))
    
    def _extract_tools(self, content: str) -> List[str]:
        """Extract mentioned tools and technologies using regex patterns."""
        tools = []
        
        # Common tool/technology patterns
        patterns = [
            r'\b(?:Python|JavaScript|Java|C\+\+|TypeScript|Go|Rust)\b',
            r'\b(?:Docker|Kubernetes|Jenkins|GitHub|GitLab|AWS|Azure|GCP)\b',
            r'\b(?:React|Angular|Vue|Node\.js|Express|Flask|Django)\b',
            r'\b(?:PostgreSQL|MySQL|MongoDB|Redis|Elasticsearch)\b',
            r'\b(?:VS Code|Visual Studio|IntelliJ|PyCharm|Sublime)\b'
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            tools.extend(matches)
        
        return list(set(tools))
    
    def _extract_deliverable_sources(self, content: str) -> List[str]:
        """Extract deliverable sources like URLs, file paths, document references using regex patterns."""
        deliverables = []
        
        patterns = [
            r'https?://[^\s\]]+',  # URLs
            r'www\.[^\s\]]+',  # www URLs
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',  # Email addresses
            r'[A-Za-z]:\\[^\\/:*?"<>|\s]+(?:\\[^\\/:*?"<>|\s]+)*',  # Windows file paths
            r'/[^\s\]]+(?:/[^\s\]]+)*',  # Unix file paths
            r'[\w\-\.]+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|txt|md|json|xml|csv)',  # File extensions
            r'(?:attachment|document|file|link|reference):\s*[^\s\]]+',  # Explicit references
            r'see\s+(?:attachment|document|file|link)\s*[^\s\]]*',  # "see attachment" patterns
            r'(?:github|gitlab)\.com/[^\s\]]+',  # Repository links
            r'(?:sharepoint|onedrive|dropbox|drive\.google)\.com/[^\s\]]+',  # Cloud storage
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            deliverables.extend([match.strip() for match in matches])
        
        return list(set(deliverables))
    
    
    def get_user_persona(self, user_id: str) -> Dict:
        """Get user persona information."""
        # First try to find by node ID (user_id is the node ID)
        if user_id in self.graph.nodes:
            node_data = self.graph.nodes[user_id]
            if node_data.get('type') == 'user':
                return node_data.get('persona', {})
        
        # Fallback: search by label attribute
        for node_id, node_data in self.graph.nodes(data=True):
            if node_data.get('label') == user_id and node_data.get('type') == 'user':
                return node_data.get('persona', {})
        return {}
    
    def _sample_query_timestamp(self, target_node_id: int, target_type: str, user_involvement: Dict) -> str:
        """
        Sample a realistic timestamp for when a user would make a query.
        
        Args:
            target_node_id: ID of target node
            target_type: 'phase' or 'topic'
            user_involvement: User's involvement in domains/topics/phases
            
        Returns:
            ISO timestamp string for when the query is made
        """
        target_node = self.graph.nodes[target_node_id]
        
        if target_type == 'phase':
            # For phases, sample timestamp during or after the phase
            start_date = target_node.get('start_date')
            end_date = target_node.get('end_date')
            
            if start_date and end_date:
                start = datetime.fromisoformat(start_date)
                end = datetime.fromisoformat(end_date)
                
                # 60% chance during phase, 30% chance shortly after, 10% chance well after
                rand = random.random()
                if rand < 0.85:
                    # During the phase (20%-80% through)
                    phase_duration = end - start
                    progress = random.uniform(0.2, 0.7)
                    query_time = start + phase_duration * progress
                elif rand < 0.99:
                    # Shortly after phase ends (0-7 days)
                    days_after = random.uniform(0, 7)
                    query_time = end + timedelta(days=days_after)
                else:
                    # Well after phase ends (1-30 days)
                    days_after = random.uniform(7, 30)
                    query_time = end + timedelta(days=days_after)
                
                return query_time.isoformat()
        
        # For topics or if no phase dates, use current time range
        base_date = datetime(2025, 6, 19)  # Base project start date from main.py
        days_offset = random.randint(0, 180)  # 6 months range
        query_time = base_date + timedelta(days=days_offset)
        
        return query_time.isoformat()
    
    def generate_intent_schema(self, contextual_markers: Dict, document_type: str, user_profile: Dict, target_type: str) -> Dict:
        """
        Generate a structured intent schema that defines what the user wants in the document.
        
        Args:
            contextual_markers: Extracted contextual information from graph traversal
            document_type: Type of document ('status_report', 'email', 'faq')
            user_profile: User profile information
            target_type: Type of target ('phase' or 'topic')
            
        Returns:
            Structured intent schema dictionary
        """
        # Define document structure sections based on document type
        if document_type == 'status_report':
            section_pool = [
                'executive_summary', 'project_overview', 'timeline_and_milestones',
                'progress_highlights', 'key_achievements', 'completed_deliverables',
                'current_phase_status', 'upcoming_deadlines', 'next_steps',
                'action_items', 'challenges_and_blockers', 'risks_and_mitigation',
                'budget_status', 'resource_allocation', 'team_performance',
                'quality_metrics', 'stakeholder_feedback', 'compliance_status',
                'lessons_learned', 'dependencies', 'change_requests',
                'technical_architecture', 'testing_results', 'deployment_status'
            ]
        elif document_type == 'email':
            section_pool = [
                'summary_update', 'key_decisions_made', 'action_items',
                'next_steps', 'timeline_updates', 'milestone_achievements',
                'urgent_matters', 'blockers_requiring_attention', 'stakeholder_updates',
                'budget_implications', 'resource_needs', 'meeting_outcomes',
                'deliverable_status', 'risk_alerts', 'approvals_needed',
                'schedule_changes', 'team_announcements', 'technical_updates',
                'compliance_notes', 'feedback_requests'
            ]
        else:  # faq
            section_pool = [
                'getting_started', 'common_questions', 'troubleshooting_guide',
                'best_practices', 'process_workflows', 'system_requirements',
                'access_and_permissions', 'contact_information', 'escalation_procedures',
                'technical_specifications', 'integration_guidelines', 'security_policies',
                'compliance_requirements', 'training_resources', 'known_issues',
                'feature_explanations', 'configuration_steps', 'performance_tips',
                'maintenance_procedures', 'reporting_guidelines'
            ]
        
        # Sample 3-6 sections for document structure
        import random
        num_sections = random.randint(3, 6)
        document_structure = random.sample(section_pool, min(num_sections, len(section_pool)))
        
        project_context = contextual_markers.get('project_context', {})
        
        # Generate using LLM for realistic intent
        prompt = f"""
        Based on the user profile and project context, generate a realistic intent schema for a {document_type} request.
        
        User Profile:
        - Role: {user_profile.get('role', 'Team Member')}
        - Expertise: {user_profile.get('expertise_level', 'intermediate')}
        - Communication Style: {user_profile.get('communication_style', 'standard')}
        - Tone: {user_profile.get('tone', 'professional')}
        
        Project Context:
        - Project: {project_context.get('project', 'Unknown')}
        - Topic: {project_context.get('topic', 'Unknown')}
        - Phase: {project_context.get('phase_name', 'Unknown')}
        - Target Type: {target_type}
        
        Required Document Structure: {document_structure}
        Create a detailed intent schema with the following fields:
            1. document_type: The type of document to be generated (status_report, email, faq).
            2. target_audience: One of [executives, team_members, stakeholders, management, clients, board].
            3. temporal_scope: One of [last_week, past_month, quarter, project_start, ongoing, upcoming, last_two_weeks].
            4. detail_level: One of [summary, detailed, comprehensive, high_level].
            5. tone: One of [formal, technical, conversational, executive, urgent, celebratory, accessible].
            6. visual_elements: A list of visual elements to include in the document. Choose from [charts_and_graphs, timeline_visuals, progress_bars, status_tables, dashboard_format, traffic_light_indicators].
            7. format_instruction: A one-sentence instruction on how to format the document. E.g., "Use bullet points for action items and bold headings for sections."
            8. document_structure: The sections to include in the document, based on the required structure provided above.
            9. special_instruction: Any specific requirements, constraints, or preferences for the document content and style.
        
        Generate a JSON intent schema with this exact structure:
        {{
            "document_type": "{document_type}",
            "target_audience": "executives|team_members|stakeholders|management|clients|board",
            "temporal_scope": "last_week|past_month|quarter|project_start|ongoing|upcoming|last_two_weeks",
            "detail_level": "summary|detailed|comprehensive|high_level",
            "tone": "formal|technical|conversational|executive|urgent|celebratory|accessible",
            "visual_elements": ["charts_and_graphs", "timeline_visuals", "progress_bars", "status_tables", "dashboard_format", "traffic_light_indicators"],
            "format_instruction": "One sentence describing how to organize the document structure and presentation",
            "document_structure": {document_structure},
            "special_instruction": "Any specific requirements, constraints, or preferences for the document content and style"
        }}

        Make the intent realistic for the user's role and document type. Return only valid JSON:
        """
        
        try:
            response = call_llm(prompt, self.client)
            if not response.startswith('[Error'):
                intent = json.loads(response)
                
                # Ensure document_structure is included
                if 'document_structure' not in intent:
                    intent['document_structure'] = document_structure
                
                # Validate and add any missing required fields with defaults
                required_defaults = {
                    'document_type': document_type,
                    'target_audience': 'team_members',
                    'temporal_scope': 'ongoing',
                    'detail_level': 'detailed',
                    'tone': 'professional',
                    'visual_elements': [],
                    'format_instruction': f"Organize the {document_type} in a clear, professional format with logical section flow.",
                    'document_structure': document_structure,
                    'special_instruction': "Follow standard workplace communication practices and ensure all content is factual and well-cited."
                }
                
                for field, default_value in required_defaults.items():
                    if field not in intent:
                        intent[field] = default_value
                
                return intent
                
        except json.JSONDecodeError as e:
            print(f"JSON parsing error in intent generation: {e}")
        except Exception as e:
            print(f"Error in intent generation: {e}")
        
        # Fallback intent with required fields
        return {
            "document_type": document_type,
            "target_audience": "team_members",
            "temporal_scope": "ongoing",
            "detail_level": "detailed", 
            "tone": "professional",
            "visual_elements": [],
            "format_instruction": f"Organize the {document_type} in a clear, professional format with logical section flow.",
            "document_structure": document_structure,
            "special_instruction": "Follow standard workplace communication practices and ensure all content is factual and well-cited."
        }
    
    def generate_query(self, target_node_id: int, target_type: str, document_type: str, user_id: str = None, query_timestamp: str = None) -> Dict:
        """
        Generate a synthetic user query for document generation using LLM.
        
        Args:
            target_node_id: ID of target node (phase or topic)
            target_type: 'phase' or 'topic'
            document_type: 'status_report', 'email', or 'faq'
            user_id: Optional user ID for persona-based generation
            query_timestamp: When the user is making this query request
            
        Returns:
            Dictionary containing the generated query and metadata
        """
        # Extract contextual markers with temporal filtering
        markers = self.extract_contextual_markers(target_node_id, target_type, query_timestamp)
        
        # Check if we have ground truth messages - skip if none found
        if not markers.get('ground_truth_messages'):
            print(f"    Skipping query generation - no ground truth messages found for {target_type} {target_node_id}")
            return None
        
        # Get user persona if provided
        persona = self.get_user_persona(user_id) if user_id else {
            'role': 'Team Member',
            'tone': 'professional',
            'style': 'standard',
            'expertise': 'intermediate'
        }
        
        # Convert persona to user profile format for intent generation
        user_profile = {
            'role': persona.get('role', 'Team Member'),
            'expertise_level': persona.get('expertise', 'intermediate'),
            'communication_style': persona.get('style', 'standard'),
            'tone': persona.get('tone', 'professional')
        }
        
        # Generate intent schema
        intent_schema = self.generate_intent_schema(markers, document_type, user_profile, target_type)
        
        # Generate prompt for LLM using intent schema
        prompt = self.generate_query_prompt(intent_schema, markers, persona)
        
        # Get query from LLM
        query = call_llm(prompt, self.client)
        
        # Clean up the query
        if query.startswith('[Error'):
            # Fallback to simple template if LLM fails
            context = markers['project_context']
            if document_type == 'status_report':
                query = f"Can you generate a status report for the {context.get('phase_name', 'current phase')}?"
            elif document_type == 'email':
                query = f"I need an email summarizing our recent progress on {context.get('topic', 'the current topic')}"
            else:  # faq
                query = f"Can we create an FAQ document about {context.get('topic', 'the current topic')}?"
        
        # Prepare contextual markers with proper source tracking for evaluation
        contextual_markers_for_output = {
            'entities': markers['entity_sources'],  # Use source arrays instead of counts
            'temporal_expressions': markers['temporal_sources'],
            'user_actions': markers['action_sources'],
            'metadata': markers['metadata'],
            'key_decisions': markers['decision_sources'],
            'unresolved_questions': markers['question_sources'],
            'mentioned_tools': markers['tool_sources'],
            'deliverable_sources': markers['deliverable_sources_with_ids'],
            'project_context': markers['project_context'],
            'ground_truth_messages': markers['ground_truth_messages']  # Add ground truth message IDs
        }
        
        return {
            'query': query,
            'document_type': document_type,
            'target_type': target_type,
            'target_node_id': target_node_id,
            'user_id': user_id,
            'query_timestamp': query_timestamp,
            'persona': persona,
            'intent': intent_schema,  # Add the intent schema
            'contextual_markers': contextual_markers_for_output,
            'generated_at': datetime.now().isoformat()
        }
    
    def generate_queries_for_graph(self, num_queries_per_type: int = 5) -> List[Dict]:
        """
        Generate multiple queries following a user-centric approach:
        1) Sample target user
        2) Identify domains, topics, phases that user is involved in
        3) Sample from user's domains/topics/phases
        4) Pull contextual information for the target user
        5) Use contextual information and user profile to synthesize query
        
        Args:
            num_queries_per_type: Number of queries to generate per document type
            
        Returns:
            List of generated query dictionaries
        """
        queries = []
        
        # Get all user nodes
        user_nodes = [(nid, data) for nid, data in self.graph.nodes(data=True) 
                     if data.get('type') == 'user']
        
        if not user_nodes:
            print("No users found in graph!")
            return queries
        
        document_types = ['status_report', 'email', 'faq']
        total_queries_needed = num_queries_per_type * 2  # phases + topics
        
        query_idx = 0
        attempts = 0
        max_attempts = total_queries_needed * 3  # Allow more attempts to handle skipped users
        
        while len(queries) < total_queries_needed and attempts < max_attempts:
            attempts += 1
            try:
                # Step 1: Sample target user
                user_node_id, user_data = random.choice(user_nodes)
                # In this graph, user_node_id IS the user label (like 'User_1')
                user_label = user_node_id  # The node ID is the user identifier
                user_persona = user_data.get('persona', {})
                
                print(f"Generating query {len(queries) + 1}/{total_queries_needed} for user {user_label} (attempt {attempts})")
                
                # Step 2: Identify domains, topics, phases that user is involved in
                user_involvement = self._get_user_involvement(user_node_id)
                
                print(f"User involvement: {len(user_involvement['domains'])} domains, {len(user_involvement['topics'])} topics, {len(user_involvement['phases'])} phases")
                
                if not user_involvement['phases'] and not user_involvement['topics']:
                    print(f"User {user_label} has no involvement in phases or topics, trying different user...")
                    continue
                
                # Step 3: Sample from user's domains/topics/phases
                # Decide whether to generate for phase or topic
                available_phases = user_involvement['phases']
                available_topics = user_involvement['topics']
                
                if available_phases and available_topics:
                    # Both available, choose randomly with preference for phases for status reports
                    if len(queries) < num_queries_per_type:  # First half - prefer phases
                        target_type = 'phase' if available_phases else 'topic'
                    else:  # Second half - prefer topics
                        target_type = 'topic' if available_topics else 'phase'
                elif available_phases:
                    target_type = 'phase'
                elif available_topics:
                    target_type = 'topic'
                else:
                    print(f"User {user_label} has no available phases or topics, trying different user...")
                    continue
                
                # Sample target node
                if target_type == 'phase':
                    target_node_id = random.choice(available_phases)
                    # Status reports are most common for phases
                    doc_type = random.choices(['status_report', 'email'], weights=[0.8, 0.2])[0]
                else:  # topic
                    target_node_id = random.choice(available_topics)
                    # Emails and FAQs are more common for topics
                    doc_type = random.choices(['email', 'faq', 'status_report'], weights=[0.5, 0.4, 0.1])[0]
                
                # Step 4: Sample when the user makes this query
                query_timestamp = self._sample_query_timestamp(target_node_id, target_type, user_involvement)
                
                # Step 5: Pull contextual information and generate query with temporal context
                query_result = self.generate_query(target_node_id, target_type, doc_type, user_label, query_timestamp)
                
                # Skip if no ground truth messages were found
                if query_result is None:
                    print(f"    Skipping - no ground truth messages for target {target_node_id}")
                    continue
                
                # Override the persona with the actual user data we already have
                query_result['persona'] = user_persona
                query_result['user_id'] = user_label
                
                # Add user involvement context to the result
                query_result['user_involvement'] = user_involvement
                queries.append(query_result)
                
                print(f"Successfully generated query {len(queries)}/{total_queries_needed}")
                
            except Exception as e:
                print(f"Error generating query (attempt {attempts}): {e}")
                continue
        
        if len(queries) < total_queries_needed:
            print(f"Warning: Only generated {len(queries)} out of {total_queries_needed} requested queries after {attempts} attempts")
        else:
            print(f"Successfully generated all {len(queries)} queries in {attempts} attempts")
        
        return queries
    
    def _get_user_involvement(self, user_node_id: str) -> Dict[str, List[int]]:
        """
        Get all domains, topics, and phases that a user is involved in.
        Uses direct graph edges for efficient and accurate traversal.
        
        Args:
            user_node_id: The node ID of the user
            
        Returns:
            Dictionary with lists of node IDs for domains, topics, and phases
        """
        involvement = {
            'domains': [],
            'topics': [],
            'phases': []
        }
        
        print(f"  Getting user involvement for: {user_node_id}")
        
        # Step 1: Get all phases where user is active using 'active_in_phase' edges
        for target_node in self.graph.successors(user_node_id):
            edge_data = self.graph[user_node_id][target_node]
            if edge_data.get('relation') == 'active_in_phase':
                target_data = self.graph.nodes[target_node]
                if target_data.get('type') == 'phase':
                    involvement['phases'].append(target_node)
        
        print(f"    Found {len(involvement['phases'])} phases via active_in_phase edges")
        
        # Step 2: Get all topics from phases using 'has_phase' edges (phase -> topic)
        topic_set = set()
        for phase_id in involvement['phases']:
            for target_node in self.graph.predecessors(phase_id):
                edge_data = self.graph[target_node][phase_id]
                if edge_data.get('relation') == 'has_phase':
                    target_data = self.graph.nodes[target_node]
                    if target_data.get('type') == 'topic':
                        topic_set.add(target_node)
        
        involvement['topics'] = list(topic_set)
        print(f"    Found {len(involvement['topics'])} topics via has_phase edges")
        
        # Step 3: Get all domains from topics using 'has_topic' edges (domain -> topic)
        domain_set = set()
        for topic_id in involvement['topics']:
            for target_node in self.graph.predecessors(topic_id):
                edge_data = self.graph[target_node][topic_id]
                if edge_data.get('relation') == 'has_topic':
                    target_data = self.graph.nodes[target_node]
                    if target_data.get('type') == 'domain':
                        domain_set.add(target_node)
        
        involvement['domains'] = list(domain_set)
        print(f"    Found {len(involvement['domains'])} domains via has_topic edges")
        
        return involvement
    
    def save_queries(self, queries: List[Dict], output_path: str):
        """Write the query list to ``output_path`` as indented UTF-8 JSON.

        Non-ASCII characters are written as-is rather than escaped. An
        existing file at ``output_path`` is overwritten.
        """
        with open(output_path, 'w', encoding='utf-8') as out_file:
            json.dump(
                queries,
                out_file,
                indent=2,
                ensure_ascii=False,
            )


def _print_query_summary(index: int, query_data: Dict) -> None:
    """
    Print a human-readable summary of one generated query record.

    Args:
        index: 1-based position shown in front of the record.
        query_data: Query record. 'document_type', 'target_type',
            'target_node_id' and 'query' are read with direct indexing and
            must be present; all other keys are optional.
    """
    print(f"\n{index}. User: {query_data.get('user_id', 'Unknown')}")
    print(f"   Document Type: {query_data['document_type']}")
    print(f"   Target: {query_data['target_type']} (Node ID: {query_data['target_node_id']})")

    # Show the timestamp once: formatted when it parses as ISO-8601,
    # otherwise the raw value, or 'N/A' when it is missing/empty.
    query_timestamp = query_data.get('query_timestamp')
    if query_timestamp:
        try:
            timestamp_text = datetime.fromisoformat(query_timestamp).strftime('%Y-%m-%d %H:%M')
        except (TypeError, ValueError):
            # Not an ISO-8601 string (or not a string at all): show it verbatim.
            timestamp_text = query_timestamp
    else:
        timestamp_text = 'N/A'
    print(f"   Query Timestamp: {timestamp_text}")
    print(f"   Query: {query_data['query']}")

    persona = query_data.get('persona')
    if persona:
        print(f"   Persona: {persona.get('role', 'N/A')} ({persona.get('tone', 'N/A')} tone, {persona.get('style', 'N/A')} style)")

    # Show user involvement context
    involvement = query_data.get('user_involvement', {})
    if involvement:
        print("   User Involvement:")
        print(f"     - Domains: {len(involvement.get('domains', []))} domains")
        print(f"     - Topics: {len(involvement.get('topics', []))} topics")
        print(f"     - Phases: {len(involvement.get('phases', []))} phases")

    markers = query_data.get('contextual_markers', {})

    # Show extracted entities for context.
    # Each entry is indexed as a sequence whose first item is the entity name
    # (per the original note: a list of [entity, message_id] pairs).
    entities = markers.get('entities', [])
    if entities:
        entity_names = [entity[0] for entity in entities[:3]]
        print(f"   Key entities: {entity_names}")

    # Show project context
    project_context = markers.get('project_context', {})
    if project_context.get('project'):
        print(f"   Project Context: {project_context.get('project')} - {project_context.get('topic', 'N/A')}")
        if project_context.get('phase_name'):
            print(f"     Phase: {project_context.get('phase_name')} (Status: {project_context.get('status', 'N/A')})")


def main():
    """
    Demonstrate query generation end to end.

    Builds a UserQueryGenerator for the hard-coded domain graph, generates
    queries, prints a summary of the first five, saves all of them to
    ``./generated_user_queries_<domain>.json`` and finally prints the
    entities attached to the first query.
    """
    # Path to the domain graph
    domain = "Manufacturing"  # Change as needed: "Finance", "Healthcare", "Manufacturing", "Technology"
    graph_path = f"../data/{domain}/synthetic_domain_channels_graph_{domain}.gml"

    # For demo purposes, let's use LLM-based entity extraction too
    use_llm_entities = True  # Set to False to use graph-based entity extraction instead

    # Initialize the generator
    print(f"Initializing generator with {'LLM-based' if use_llm_entities else 'graph-based'} entity extraction...")
    generator = UserQueryGenerator(graph_path, use_llm_for_entities=use_llm_entities)

    # Generate queries
    print("Generating synthetic user queries...")
    queries = generator.generate_queries_for_graph(num_queries_per_type=20)

    # Display sample queries
    print(f"\nGenerated {len(queries)} queries using user-centric approach:")
    print("=" * 80)

    for position, query_data in enumerate(queries[:5], start=1):
        _print_query_summary(position, query_data)

    # Save queries to file
    output_path = f"./generated_user_queries_{domain}.json"
    generator.save_queries(queries, output_path)
    print(f"\nAll queries saved to: {output_path}")

    # Show entity extraction comparison for the first query if available
    if queries:
        print("\n" + "=" * 80)
        print("Entity Extraction Comparison (first query):")
        first_query = queries[0]
        markers = first_query.get('contextual_markers', {})
        print(f"Extraction method used: {'LLM-based' if use_llm_entities else 'Graph-based'}")
        # markers['entities'] is a list of [entity, message_id] pairs
        entity_sources = markers.get('entities', [])
        if entity_sources:
            entity_list = [f"{entity[0]} (from {entity[1]})" for entity in entity_sources[:5]]
            print(f"Entities found: {entity_list}")
        else:
            print("Entities found: []")


if __name__ == "__main__":

    import ctypes

    # Flags for the Win32 SetThreadExecutionState call: keep the system and
    # display awake for as long as the (potentially long) generation runs.
    ES_CONTINUOUS = 0x80000000
    ES_SYSTEM_REQUIRED = 0x00000001
    ES_DISPLAY_REQUIRED = 0x00000002

    # ctypes.windll only exists on Windows; on other platforms skip the
    # keep-awake request instead of crashing before main() ever runs.
    _windll = getattr(ctypes, "windll", None)

    if _windll is not None:
        _windll.kernel32.SetThreadExecutionState(
            ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED)

    try:
        main()
    finally:
        if _windll is not None:
            # Passing ES_CONTINUOUS alone clears the keep-awake request.
            _windll.kernel32.SetThreadExecutionState(ES_CONTINUOUS)