import argparse
import os
import time
import pandas as pd
import datetime
import re
from video_game_builder import (
    evaluate_content, 
    setup_model, 
    get_system_prompt, 
    extract_scores_from_evaluations,
    review_evaluations_with_coding_agent
)
from prompts import get_description_feedback
from relative_eval import evaluate_relative
from utils import process_csv_dataset, update_csv_with_results, update_csv_with_relative_results

def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace holding every option defined below.
    """
    parser = argparse.ArgumentParser(description="Evaluate content from multiple folders")

    # Folders to evaluate
    parser.add_argument("--folders", type=str, nargs="+", required=True,
                        help="List of folders to evaluate")
    parser.add_argument("--file", type=str, default="final_content", help="filename")
    parser.add_argument("--folders_paired", type=str, nargs="+", default=None,
                        help="List of folders to compare with the main folders (must have same length as --folders)")
    parser.add_argument("--file_paired", type=str, default="final_content", help="filename")

    # Content settings
    parser.add_argument("--content_type", type=str, choices=["video-game", "animation", "website"], default="video-game",
                        help="Type of content to evaluate (video-game, animation, or website)")
    parser.add_argument("--content_description", type=str, default="",
                        help="Description of the content to be evaluated")

    # Model settings
    parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-VL-32B-Instruct",
                        help="Path to the VLM model")
    parser.add_argument("--use_vllm_server", action="store_true",
                        help="Use VLLM server instead of loading model locally")
    parser.add_argument("--vllm_server_url", type=str, default="http://localhost:8000",
                        help="URL of the VLLM server (default: http://localhost:8000)")
    parser.add_argument("--api_key", type=str, default="token-abc123",
                        help="API key for the VLLM server (default: token-abc123)")

    parser.add_argument("--server_timeout", type=int, default=60000,  # 1000min timeout
                        help="Timeout time for queries on a vllm server")

    # Separate evaluator model settings
    parser.add_argument("--use_separate_evaluator", action="store_true",
                        help="Use separate models for coding and evaluation")
    parser.add_argument("--evaluator_model_path", type=str,
                        default="Qwen/Qwen2.5-VL-32B-Instruct",
                        help="Path to the evaluator model (must support video)")
    parser.add_argument("--evaluator_vllm_server_url", type=str, default=None,
                        help="URL of the VLLM server for the evaluator model (defaults to --vllm_server_url if not specified)")
    parser.add_argument("--evaluator_api_key", type=str, default=None,
                        help="API key for the VLLM server for the evaluator model (defaults to --api_key if not specified)")

    # Evaluation settings
    parser.add_argument("--enable_audio", action="store_true",
                        help="Enable audio processing")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility (default: 42)")
    parser.add_argument("--sampling_rate", type=int, default=44100,
                        help="Sampling rate for audio")

    # Generation settings
    parser.add_argument("--generation_max_tokens", type=int, default=None,
                        help="Maximum number of tokens for content generation and improvement")
    parser.add_argument("--feedback_max_tokens", type=int, default=3200,
                        help="Maximum number of tokens for content feedback")
    parser.add_argument("--evaluation_max_tokens", type=int, default=3200,
                        help="Maximum number of tokens for content evaluation")

    # Temperature / sampling settings (forwarded to the sampling parameters)
    parser.add_argument("--temp_coding", type=float, default=0.0,
                        help="Temperature for initial content and improvement steps")
    parser.add_argument("--top_p", type=float, default=0.95,
                        help="Nucleus sampling: cumulative probability of the top tokens to consider (default: 0.95)")
    parser.add_argument("--top_k", type=int, default=-1,
                        help="Number of top tokens to consider when sampling; -1 considers all tokens (default: -1)")
    parser.add_argument("--repetition_penalty", type=float, default=1.00,
                        help="Penalty applied to repeated tokens; 1.0 applies no penalty (default: 1.0)")

    # Relative evaluation settings
    # NOTE: --relative is a store_true flag whose default is already True, so
    # on its own it can never be switched off. --no_relative writes False to
    # the same destination and is the only way to reach the "separate single
    # evaluations, then compare" mode.
    parser.add_argument("--relative", action="store_true", default=True,
                        help="Use relative evaluation when folders_paired is provided (default: True). "
                             "Pass --no_relative to perform single evaluations on both folders and compare results instead.")
    parser.add_argument("--no_relative", dest="relative", action="store_false",
                        help="Disable relative evaluation: evaluate each paired folder on its own, then compare the results")
    parser.add_argument("--multiround", action="store_true",
                        help="If True, use multiround description, then comparison for the relative evaluation")

    # Coding evaluation settings
    parser.add_argument("--coding_evaluation", action="store_true",
                        help="If True, show all evaluations to the coding agent for review instead of averaging scores")

    # Claude API settings
    parser.add_argument("--claude_api_key", type=str, default=None,
                        help="Claude API key (if not set, will use CLAUDE_API_KEY environment variable)")
    parser.add_argument("--claude_model", type=str, default="claude-3-opus-20240229",
                        help="Claude model to use (default: claude-3-opus-20240229)")

    # Local VLM settings
    parser.add_argument("--vllm_gpu_memory_utilization", type=float, default=0.8,
                        help="GPU memory utilization for VLLM")
    parser.add_argument("--tensor_parallel_size", type=int, default=1,
                        help="Tensor parallel size for VLLM")

    # Debug settings
    parser.add_argument("--debug", action="store_true",
                        help="Enable debug mode to save and evaluate the content at every iteration")

    # Separate feedbacks
    parser.add_argument("--separate_code_feedback", action="store_true",
                        help="Enable separate code and omni feedback (Technical Implementation vs. other aspects)")
    parser.add_argument("--description_feedback", action="store_true",
                        help="Ask the feedback agent to describe the video and audio before providing feedback")

    # Output settings
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Directory to save evaluation outputs (default: save to input folders)")

    # CSV dataset settings
    parser.add_argument("--dataset", type=str, default=None,
                        help="Path to a CSV file to use for content description and result storage")
    parser.add_argument("--row_index", type=int, default=None,
                        help="Row index in the CSV file to use (between 1 and 50)")
    parser.add_argument("--name_is_output_dir", action="store_true",
                        help="If True, the score saved will use the name of the current folder")

    parser.add_argument("--without_AI", action="store_true",
                        help="If True, does not evaluate based on AI (necessary for comparing real game vs fake game)")

    return parser.parse_args(argv)

def _mean_score(scores):
    """Return the overall score of one evaluation's score dict.

    Uses the 'MeanScore' entry when it is present and not None. Otherwise
    averages every other entry, skipping values equal to -1 (presumably the
    "not scored" sentinel - confirm against the score extractor). Returns 0
    when nothing is left to average.
    """
    mean = scores.get('MeanScore')
    if mean is not None:
        return mean
    usable = [value for key, value in scores.items() if key != 'MeanScore' and value != -1]
    return sum(usable) / len(usable) if usable else 0


def _build_comparison_prompt(args, folder1_name, folder2_name, scores1, scores2, with_AI):
    """Build the user prompt asking the coding agent to pick the better content."""
    # Format the scores for presentation to the coding agent
    scores1_text = "\n".join(f"{key}: {value}" for key, value in scores1.items())
    scores2_text = "\n".join(f"{key}: {value}" for key, value in scores2.items())

    prompt = f"""You are comparing two different implementations of the same {args.content_type} based on their individual evaluation scores.

Original description of the {args.content_type}: {args.content_description}

CONTENT A SCORES (from {folder1_name}):
{scores1_text}

CONTENT B SCORES (from {folder2_name}):
{scores2_text}

Based on these evaluation scores, please determine which content is better overall and provide a detailed analysis.

The evaluation criteria for {args.content_type} are typically:
- Description Fidelity: How well does the content match the original description?
- Visual Design: Quality of visual elements, aesthetics, and presentation
"""
    # Content-type specific criteria
    if args.content_type == "video-game":
        prompt += """- Gameplay Quality: Fun factor, game mechanics, and user engagement
"""
        if with_AI:
            prompt += """- AI Player Quality: Intelligence and behavior of AI-controlled elements
"""
    elif args.content_type == "animation":
        prompt += """- Animation Smoothness: Quality of motion and transitions
- Creativity: Originality and creative elements
"""
    else:  # website
        prompt += """- User Experience: Ease of use and navigation
- Functionality: How well features work
"""

    prompt += """- Behavior Correctness: Technical correctness and bug-free operation
- Audio Quality: Sound effects and audio implementation (if applicable)

Please provide:
1. A detailed comparison of the scores for each criterion
2. An overall assessment of which content is better
3. The main reasons for your conclusion

Finally, provide your answer in <answer> </answer> tags with either "A" or "B" to indicate which content is better overall.
For example: <answer>A</answer> (meaning Content A is better overall)
"""
    return prompt


def compare_single_evaluations(args, scores1, scores2, folder1, folder2, coding_model, system_prompt, current_seed, df=None, with_AI=True):
    """
    Compare results from two single evaluations and determine which is better.

    Sign convention of the returned numbers: a negative 'OverallScore' means
    the first folder (Content A) is better, a positive one means the second
    folder (Content B) is better, and 0 means a tie or no usable answer. The
    per-criterion '<key>Difference' entries follow the same convention
    (score2 - score1).

    Args:
        args: Command line arguments
        scores1: Scores from first folder evaluation
        scores2: Scores from second folder evaluation
        folder1: First folder name
        folder2: Second folder name
        coding_model: The coding model for summarization (if coding_evaluation=True)
        system_prompt: System prompt for the coding model (may be None)
        current_seed: Current random seed
        df: DataFrame from CSV dataset (optional)
        with_AI: For video games, whether the AI-player criterion is listed
            in the prompt shown to the coding agent

    Returns:
        Tuple containing comparison results and updated seed
    """
    print(f"\n{'='*50}")
    print(f"Comparing single evaluations: {folder1} vs {folder2}")
    print(f"{'='*50}")

    # Short folder names are used as winner labels, in file names and in CSV columns
    folder1_name = os.path.basename(os.path.normpath(folder1))
    folder2_name = os.path.basename(os.path.normpath(folder2))

    comparison_results = {}

    if args.coding_evaluation:
        # Use coding agent to summarize the raw evaluations and make a conclusion
        print("Using coding agent to summarize single evaluations...")

        prompt = _build_comparison_prompt(args, folder1_name, folder2_name, scores1, scores2, with_AI)

        messages = []
        if system_prompt is not None:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        # Imported lazily so the score-averaging path works without vllm
        from vllm import SamplingParams
        sampling_params = SamplingParams(
            temperature=args.temp_coding,
            top_p=args.top_p,
            top_k=args.top_k,
            repetition_penalty=args.repetition_penalty,
            max_tokens=args.generation_max_tokens,
            seed=current_seed
        )

        # Increment seed for next generation
        current_seed += 1

        # Generate the review
        response = coding_model.generate(messages, sampling_params)
        review = response[0].outputs[0].text

        # Extract the winner from the review
        answer_match = re.search(r'<answer>(.*?)</answer>', review, re.DOTALL)
        if answer_match:
            answer_content = answer_match.group(1).strip().upper()
            if answer_content == "A":
                comparison_results['OverallWinner'] = folder1_name
                comparison_results['OverallScore'] = -1  # Negative means Content A is better
            elif answer_content == "B":
                comparison_results['OverallWinner'] = folder2_name
                comparison_results['OverallScore'] = 1  # Positive means Content B is better
            else:
                # Anything other than exactly "A" or "B" is recorded as a tie
                comparison_results['OverallWinner'] = "Tie"
                comparison_results['OverallScore'] = 0
        else:
            # No <answer> tag found: record an explicit "Unknown" result
            # (no average-based fallback is attempted here)
            comparison_results['OverallWinner'] = "Unknown"
            comparison_results['OverallScore'] = 0

        # Save the review
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = args.output_dir if args.output_dir else folder1
        if args.output_dir:
            os.makedirs(output_dir, exist_ok=True)

        review_path = os.path.join(output_dir, f"single_evaluations_comparison_review_{folder2_name}_{timestamp}.txt")
        with open(review_path, "w", encoding="utf-8") as f:
            f.write(review)
        print(f"Coding agent's comparison review saved to: {review_path}")

    else:
        # Simple comparison based on average scores
        print("Comparing based on average scores...")

        avg1 = _mean_score(scores1)
        avg2 = _mean_score(scores2)

        print(f"{folder1_name} average score: {avg1}")
        print(f"{folder2_name} average score: {avg2}")

        if avg1 > avg2:
            comparison_results['OverallWinner'] = folder1_name
            comparison_results['OverallScore'] = avg2 - avg1  # Negative: folder1 is better
        elif avg2 > avg1:
            comparison_results['OverallWinner'] = folder2_name
            comparison_results['OverallScore'] = avg2 - avg1  # Positive: folder2 is better
        else:
            comparison_results['OverallWinner'] = "Tie"
            comparison_results['OverallScore'] = 0

        # Add individual score differences for detailed comparison
        for key, value1 in scores1.items():
            if key == 'MeanScore' or key not in scores2:
                continue
            # A score of -1 is counted as 0 in the difference
            score1 = value1 if value1 != -1 else 0
            score2 = scores2[key] if scores2[key] != -1 else 0
            comparison_results[f"{key}Difference"] = score2 - score1  # Positive means folder2 is better

    # Print comparison results
    print("\n" + "="*50)
    print("COMPARISON RESULTS:")
    print("="*50)
    for key, value in comparison_results.items():
        print(f"{key}: {value}")

    # If dataset was provided, update the CSV file with comparison results
    if args.dataset and args.row_index and df is not None:
        # Best effort: a CSV failure is reported but must not discard the comparison
        try:
            if comparison_results:
                comparison_prefix = f"{folder1_name}_vs_{folder2_name}"
                update_csv_with_relative_results(args.dataset, args.row_index, comparison_results, comparison_prefix)
            else:
                print("Warning: Could not generate comparison results")
        except Exception as e:
            print(f"Error updating CSV file with comparison results: {e}")

    return comparison_results, current_seed

def evaluate_single_folder(args, folder, coding_model, evaluator_model, system_prompt, system_prompt_eval, current_seed, df=None, with_AI=True):
    """
    Evaluate content from a single folder.

    Looks for 'final_content.mp4' (required), 'final_content.wav' (only when
    audio is enabled) and 'final_content_console_logs.txt' (optional) inside
    the folder, runs the evaluator on the video, writes every evaluation plus
    a combined file to the output directory, and turns the evaluations into
    scores.

    Args:
        args: Command line arguments
        folder: Folder to evaluate
        coding_model: Model that reviews the evaluations (used when
            args.coding_evaluation is set)
        evaluator_model: Model that produces the evaluations
        system_prompt: System prompt for the coding model
        system_prompt_eval: System prompt for the evaluator model
        current_seed: Current random seed
        df: DataFrame from CSV dataset (optional, not used in this function)
        with_AI: Forwarded to the evaluation and review helpers

    Returns:
        Tuple containing scores and updated seed. Scores is None when the
        video file is missing.
    """
    print(f"\n{'='*50}")
    print(f"Evaluating folder: {folder}")
    print(f"{'='*50}")

    # Set up paths
    video_path = os.path.join(folder, "final_content.mp4")
    audio_path = os.path.join(folder, "final_content.wav") if args.enable_audio else None
    console_logs_path = os.path.join(folder, "final_content_console_logs.txt")

    # The video is mandatory: without it there is nothing to evaluate
    if not os.path.exists(video_path):
        print(f"Error: Video file not found at {video_path}")
        return None, current_seed

    # Check if the audio file exists if audio is enabled
    # NOTE(review): audio_path is validated here but is not passed on to
    # evaluate_content below - confirm whether it should be.
    if args.enable_audio and not os.path.exists(audio_path):
        print(f"Warning: Audio file not found at {audio_path}")
        audio_path = None

    # Console logs are optional
    if not os.path.exists(console_logs_path):
        print(f"Warning: Console logs file not found at {console_logs_path}")
        console_logs_path = None

    # Evaluate content
    print(f"Evaluating content: {video_path}")
    evaluations, current_seed = evaluate_content(
        system_prompt=system_prompt_eval,
        llm=evaluator_model,
        video_path=video_path,
        content_description=args.content_description,
        content_type=args.content_type,
        max_tokens=args.evaluation_max_tokens,
        console_logs_path=console_logs_path,
        seed=current_seed,
        top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty,
        with_AI=with_AI
    )

    # Generate timestamp for unique filenames
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Determine output directory (falls back to the evaluated folder itself)
    output_dir = args.output_dir if args.output_dir else folder
    if args.output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save all evaluations
    for i, evaluation in enumerate(evaluations):
        if i == 0:
            # First evaluation is with temperature=0
            eval_path = os.path.join(output_dir, f"evaluation_temp0_{timestamp}.txt")
        else:
            # Subsequent evaluations
            eval_path = os.path.join(output_dir, f"evaluation_temp_{i}_{timestamp}.txt")

        with open(eval_path, "w", encoding="utf-8") as f:
            f.write(evaluation)

    # Build the combined evaluation: every individual evaluation, in order,
    # each under its own header. (This name was previously used without ever
    # being assigned, which raised NameError on every run.)
    separator = "=" * 50
    combined_evaluation = "\n\n".join(
        f"{separator}\nEVALUATION {i}\n{separator}\n{evaluation}"
        for i, evaluation in enumerate(evaluations)
    )

    # Save the combined evaluation to a file
    evaluation_path = os.path.join(output_dir, f"evaluation_results_{timestamp}.txt")
    with open(evaluation_path, "w", encoding="utf-8") as f:
        f.write(combined_evaluation)
    print(f"\nEvaluation saved to: {evaluation_path}")

    # Process evaluations based on coding_evaluation flag
    if args.coding_evaluation:
        # Have coding agent review evaluations and determine final scores
        print("Using coding agent to review evaluations...")
        scores, review, current_seed = review_evaluations_with_coding_agent(
            system_prompt,
            coding_model,
            evaluations,
            args.content_description,
            content_type=args.content_type,
            max_tokens=args.generation_max_tokens,
            seed=current_seed,
            temp_coding=args.temp_coding,
            top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty,
            with_AI=with_AI
        )

        # Save the review
        review_path = os.path.join(output_dir, f"coding_evaluation_review_{timestamp}.txt")
        with open(review_path, "w", encoding="utf-8") as f:
            f.write(review)
        print(f"Coding agent's review saved to: {review_path}")
    else:
        # Use the existing method of extracting and averaging scores
        scores = extract_scores_from_evaluations(evaluations, args.content_type)

    # Print scores
    print("\n" + "="*50)
    print("SCORES:")
    print("="*50)
    for key, value in scores.items():
        print(f"{key}: {value}")

    return scores, current_seed

def _csv_label(args, folder, paired_folder=None):
    """Return the label under which results for this folder (pair) are stored in the CSV."""
    if args.name_is_output_dir:
        if args.output_dir:
            return os.path.basename(os.path.abspath(os.path.normpath(args.output_dir)))
        # Without an output directory there is no name to take; fall back to
        # folder names rather than failing and losing the result.
        print("Warning: --name_is_output_dir was set without --output_dir; using folder names for the CSV label")

    folder_name = os.path.basename(os.path.normpath(folder))
    if paired_folder is None:
        return folder_name
    paired_name = os.path.basename(os.path.normpath(paired_folder))
    return f"{folder_name}_vs_{paired_name}"


def _store_winner_in_csv(args, df, comparison_results, folder, paired_folder, mode):
    """Write the winner of a (pseudo-)relative comparison to the CSV; failures are only reported."""
    if not (args.dataset and args.row_index and df is not None):
        return
    try:
        if comparison_results:
            comparison_prefix = _csv_label(args, folder, paired_folder)
            # Only the winner string is stored for comparisons
            winner = comparison_results['OverallWinner'] if 'OverallWinner' in comparison_results else "NA"
            update_csv_with_relative_results(args.dataset, args.row_index, {'Winner': winner}, comparison_prefix)
        else:
            print("Warning: Could not extract comparison results for CSV update")
    except Exception as e:
        # Best effort: keep evaluating the remaining folders
        print(f"Error updating CSV file with {mode} results: {e}")


def _store_scores_in_csv(args, df, scores, folder):
    """Write the scores of a single-folder evaluation to the CSV; failures are only reported."""
    if not (args.dataset and args.row_index and df is not None):
        return
    try:
        if scores:
            update_csv_with_results(args.dataset, args.row_index, scores, _csv_label(args, folder))
        else:
            print("Warning: Could not extract scores from evaluations for CSV update")
    except Exception as e:
        # Best effort: keep evaluating the remaining folders
        print(f"Error updating CSV file with single evaluation results: {e}")


def _setup_models(args):
    """Create the coding and evaluator models; they are the same object unless a separate evaluator is requested."""
    if not args.use_separate_evaluator:
        # Use a single model for both coding and evaluation
        print(f"Setting up model: {args.model_path}")
        shared_model = setup_model(args)
        print(f"Using single model for both coding and evaluation: {args.model_path}")
        return shared_model, shared_model

    print("Using separate models for coding and evaluation")

    # Setup coding model
    print('<Coding-Model-setup>')
    coding_model = setup_model(
        args,
        model_path=args.model_path,
        server_url=args.vllm_server_url,
        api_key=args.api_key
    )

    # Setup evaluator model; its server settings default to the coding model's
    evaluator_server_url = args.evaluator_vllm_server_url or args.vllm_server_url
    evaluator_api_key = args.evaluator_api_key or args.api_key
    print('<Evaluator-Model-setup>')
    evaluator_model = setup_model(
        args,
        model_path=args.evaluator_model_path,
        server_url=evaluator_server_url,
        api_key=evaluator_api_key
    )
    print(f"Using visual evaluator model: {args.evaluator_model_path}")
    if args.evaluator_vllm_server_url:
        print(f"Using evaluator model server URL: {evaluator_server_url}")
    return coding_model, evaluator_model


def main():
    """Main function to evaluate content from multiple folders.

    For every folder in --folders one of three modes is run:
      * no --folders_paired: single evaluation of the folder;
      * --folders_paired with args.relative true: direct relative evaluation
        of the folder against its paired folder;
      * --folders_paired with args.relative false: both folders are evaluated
        separately and the two score sets are compared afterwards.
    When a dataset and row index are set, results are also written to the CSV.

    Raises:
        ValueError: if --folders_paired and --folders differ in length.
    """
    args = parse_arguments()

    # Process CSV dataset if provided
    df = None
    if args.dataset:
        args, df = process_csv_dataset(args)

    # Check if folders_paired is provided and has the same length as folders
    if args.folders_paired and len(args.folders_paired) != len(args.folders):
        raise ValueError("--folders_paired must have the same length as --folders")

    # Get system prompts based on model
    system_prompt = get_system_prompt(args.model_path)
    system_prompt_eval = get_system_prompt(args.evaluator_model_path if args.use_separate_evaluator else args.model_path)

    # Setup models based on configuration
    coding_model, evaluator_model = _setup_models(args)

    # The seed is threaded through every call so each generation gets a fresh one
    current_seed = args.seed
    with_AI = not args.without_AI

    for i, folder in enumerate(args.folders):

        if not args.folders_paired:
            # Evaluate the current folder (single evaluation mode)
            scores, current_seed = evaluate_single_folder(args, folder, coding_model, evaluator_model, system_prompt, system_prompt_eval, current_seed, df, with_AI=with_AI)
            _store_scores_in_csv(args, df, scores, folder)
            continue

        paired_folder = args.folders_paired[i]

        if args.relative:
            # Traditional relative evaluation (direct comparison)
            comparison_results, current_seed = evaluate_relative(args, folder, paired_folder, coding_model, evaluator_model, system_prompt, system_prompt_eval, current_seed, df, with_AI=with_AI, file1=args.file, file2=args.file_paired)
            _store_winner_in_csv(args, df, comparison_results, folder, paired_folder, "relative")
            continue

        # Separate single evaluations followed by comparison
        print(f"\n{'='*60}")
        print("PSEUDO-RELATIVE EVALUATION MODE")
        print("Performing separate evaluations then comparing results")
        print(f"{'='*60}")

        # Evaluate first folder
        scores1, current_seed = evaluate_single_folder(args, folder, coding_model, evaluator_model, system_prompt, system_prompt_eval, current_seed, df, with_AI=with_AI)

        # Evaluate second folder
        scores2, current_seed = evaluate_single_folder(args, paired_folder, coding_model, evaluator_model, system_prompt, system_prompt_eval, current_seed, df, with_AI=with_AI)

        # Compare the results (a None score set means that folder had no video)
        if scores1 is None or scores2 is None:
            print("Warning: Could not compare evaluations due to missing scores")
            continue

        comparison_results, current_seed = compare_single_evaluations(args, scores1, scores2, folder, paired_folder, coding_model, system_prompt, current_seed, df, with_AI=with_AI)
        _store_winner_in_csv(args, df, comparison_results, folder, paired_folder, "pseudo-relative")

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
