import os
import time
import pandas as pd
import datetime
from vllm import SamplingParams
import re
import prompts
from prompts import _get_evaluation_criteria_items_with_desc
from video_utils import create_multimodal_relative_prompt_for_qwen, create_multimodal_relative_multiround_prompt_for_qwen
from prompts import get_description_feedback
from vllm import SamplingParams

def review_relative_evaluations_with_coding_agent(system_prompt, llm, comparisons, content_description, content_type="video-game", max_tokens=1024, seed=42, name1=None, name2=None, description_feedbacks=None, has_audio=False, temp_coding=0.7, top_p=0.95, top_k=-1, repetition_penalty=1.0, with_AI=True):
    """
    Have the coding agent review all relative evaluations and pick an overall winner.

    The review prompt lists the evaluation criteria, every evaluation text and
    (optionally) the content descriptions, then asks for a verdict wrapped in
    <answer> </answer> tags.

    Args:
        system_prompt: System prompt for the LLM, or None to send no system message
        llm: The coding LLM; its ``generate(messages, sampling_params)`` is called once
        comparisons: List of comparison texts
        content_description: Description of the content
        content_type: Type of content ("video-game", "animation", or "website")
        max_tokens: Maximum number of tokens to generate
        seed: Random seed for generation
        name1: Label reported when Content A wins (falls back to "FolderA")
        name2: Label reported when Content B wins (falls back to "FolderB")
        description_feedbacks: List of description feedbacks (optional)
        has_audio: Forwarded to the criteria helper
        temp_coding: Sampling temperature
        top_p: Sampling top-p
        top_k: Sampling top-k
        repetition_penalty: Sampling repetition penalty
        with_AI: Forwarded to the criteria helper

    Returns:
        Tuple ``(comparison_results, review, seed, prompt)``:
        - comparison_results: dict with 'OverallWinner' and 'OverallScore'
          (-5 when Content A wins, 5 when Content B wins, 0 for a tie).
          Empty when the review contains no <answer> tag.
        - review: raw text generated by the coding agent
        - seed: the input seed incremented by one
        - prompt: the user prompt that was sent to the coding agent
    """
    print("Having coding agent review relative evaluations...")
    criteria = _get_evaluation_criteria_items_with_desc(content_type, has_audio, content_description, with_AI)

    # Collect the prompt in pieces and join once at the end.
    prompt_parts = [f"""You are reviewing multiple relative evaluations comparing two different implementations of a {content_type} with the following description:
"{content_description}".

EVALUATION CRITERIA:
{criteria}

The evaluations compare Content A vs Content B.

Here are all the evaluations to review:

"""]

    prompt_parts.extend(
        f"EVALUATION {index}:\n{comparison}\n\n"
        for index, comparison in enumerate(comparisons, start=1)
    )

    # Add description feedbacks if available
    if description_feedbacks:
        prompt_parts.append(f"""
Additionally, here are {len(description_feedbacks)} descriptions of the content:

""")
        prompt_parts.extend(
            f"DESCRIPTIONS {index}:\n{desc}\n\n"
            for index, desc in enumerate(description_feedbacks, start=1)
        )
        prompt_parts.append("Use these descriptions of the contents about questions regarding the contents in order to help your assessment and determine the validity of the evaluations (in case of hallucinations during the evaluations).\n\n")

    prompt_parts.append("""Based on all these evaluations, please provide a final summary of which content is better overall.

Look for consistent patterns across the evaluations and provide your final answer in <answer> </answer> tags with either "A" or "B" to indicate which content is better overall.
For example: <answer>A</answer> (meaning Content A is better overall)

Also provide a brief explanation of the main reasons why one content is better than the other.
""")
    prompt = "".join(prompt_parts)

    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    # Generate the response
    sampling_params = SamplingParams(
        temperature=temp_coding,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        max_tokens=max_tokens,
        seed=seed
    )

    # Increment seed for next generation
    seed += 1

    # Generate the review
    response = llm.generate(messages, sampling_params)
    review = response[0].outputs[0].text

    # Extract comparison results from the review
    comparison_results = {}

    answer_match = re.search(r'<answer>(.*?)</answer>', review, re.DOTALL)
    if answer_match:
        answer_content = answer_match.group(1).strip().upper()

        # The prompt shows the letters in double quotes, so accept the bare letter
        # with surrounding punctuation/quotes and an optional "Content " prefix
        # (e.g. `"A"`, `B.`, `Content A`). Anything else is treated as a tie.
        letter_match = re.fullmatch(r'\W*(?:CONTENT\s+)?([AB])\W*', answer_content)
        winner = letter_match.group(1) if letter_match else None

        if winner == "A":
            comparison_results['OverallWinner'] = str(name1 or 'FolderA')
            comparison_results['OverallScore'] = -5  # Negative means Content A is better
        elif winner == "B":
            comparison_results['OverallWinner'] = str(name2 or 'FolderB')
            comparison_results['OverallScore'] = 5  # Positive means Content B is better
        else:
            comparison_results['OverallWinner'] = "Tie"
            comparison_results['OverallScore'] = 0
    else:
        # Keep the empty-dict contract, but make the missing verdict visible.
        print("Warning: No <answer> tag found in the coding agent's review")

    return comparison_results, review, seed, prompt

def create_relative_evaluation_prompt(content_description, content_type, has_audio, with_AI=True):
    """
    Build the instruction text used to compare Content A against Content B.

    Args:
        content_description: Description of the content
        content_type: Type of content ("video-game", "animation", or "website")
        has_audio: Whether audio is enabled
        with_AI: Forwarded to the criteria helper

    Returns:
        Prompt string asking for an analysis followed by an <answer> verdict
    """
    rubric = _get_evaluation_criteria_items_with_desc(content_type, has_audio, content_description, with_AI)

    # One entry per output line; the trailing empty entry yields the final newline.
    prompt_lines = (
        "You are evaluating content A versus content B.",
        "",
        "Consider these evaluation criteria:",
        f"{rubric}",
        "",
        "Please provide a detailed analysis of both contents, comparing their strengths and weaknesses.",
        "Then, determine which content is better overall and explain why.",
        "",
        'Finally, provide your final answer in <answer> </answer> tags with either "A" or "B" to indicate which content is better.',
        "For example: <answer>A</answer> (meaning Content A is better overall)",
        "",
    )
    return "\n".join(prompt_lines)


def _relative_media_paths(folder, file_stem, enable_audio):
    """Return ``(video_path, audio_path, display_name)`` for one side of the comparison."""
    video_path = os.path.join(folder, file_stem + ".mp4")
    audio_path = os.path.join(folder, file_stem + ".wav") if enable_audio else None
    if file_stem == "final_content":
        # normpath drops a trailing separator, which would make basename return "".
        display_name = os.path.basename(os.path.normpath(folder))
    else:
        display_name = file_stem
    return video_path, audio_path, display_name


def _tally_relative_answers(comparisons, name1, name2):
    """Count <answer> verdicts; return ``(winner_counts, scores)`` for comparisons that have one."""
    winner_counts = {}
    scores = []
    for comparison in comparisons:
        answer_match = re.search(r'<answer>(.*?)</answer>', comparison, re.DOTALL)
        if not answer_match:
            continue

        verdict = answer_match.group(1).strip().upper()
        # Only standalone letters count, so words such as "EQUAL", "SAME" or
        # "BOTH" are not mistaken for a vote. Naming both letters is a tie.
        letters = set(re.findall(r'\b[AB]\b', verdict))
        if letters == {"A"}:
            winner_key, winner_score = f"{name1}", -5  # Negative means Content A is better
        elif letters == {"B"}:
            winner_key, winner_score = f"{name2}", 5  # Positive means Content B is better
        else:
            winner_key, winner_score = "Tie", 0

        winner_counts[winner_key] = winner_counts.get(winner_key, 0) + 1
        scores.append(winner_score)
    return winner_counts, scores


def evaluate_relative(args, folder1, folder2, coding_model, evaluator_model, system_prompt, system_prompt_eval, current_seed, df=None, file1="final_content", file2="final_content", text_before=False, with_AI=True):
    """
    Perform a relative evaluation between two folders.

    The evaluator model compares ``<folder1>/<file1>.mp4`` (Content A) with
    ``<folder2>/<file2>.mp4`` (Content B). Depending on ``args.coding_evaluation``
    the verdict is either extracted directly from the evaluator's <answer> tag or
    decided by the coding agent, whose review is saved to a text file.

    Args:
        args: Command line arguments. Attributes read here: enable_audio,
            output_dir, multiround, content_description, content_type,
            feedback_max_tokens, evaluation_max_tokens, generation_max_tokens,
            top_p, top_k, repetition_penalty, sampling_rate, coding_evaluation
        folder1: First folder to evaluate (Content A)
        folder2: Second folder to evaluate (Content B)
        coding_model: Model used for the coding-agent review
        evaluator_model: Model used to describe and compare the contents
        system_prompt: System prompt for the coding model
        system_prompt_eval: System prompt for the evaluator model (may be None)
        current_seed: Current random seed
        df: DataFrame from CSV dataset (optional; not used by this function)
        file1: File stem inside folder1; with the default "final_content" the
            folder name is used as the display name, otherwise the stem is
        file2: Same as file1, for folder2
        text_before: Forwarded to the single-round prompt builder; it has no
            effect when args.multiround is set
        with_AI: Forwarded to the criteria helper

    Returns:
        Tuple ``(comparison_results, current_seed)``. ``comparison_results`` is
        None when a video file is missing, and an empty dict when no <answer>
        tag could be parsed; otherwise it holds 'OverallWinner' and 'OverallScore'.
    """
    video_path1, audio_path1, name1 = _relative_media_paths(folder1, file1, args.enable_audio)
    video_path2, audio_path2, name2 = _relative_media_paths(folder2, file2, args.enable_audio)

    print(f"\n{'='*50}")
    print(f"Performing relative evaluation: {name1} vs {name2}")
    print(f"{'='*50}")

    # Check if the video files exist
    for video_path in (video_path1, video_path2):
        if not os.path.exists(video_path):
            print(f"Error: Video file not found at {video_path}")
            return None, current_seed

    # Check if the audio files exist if audio is enabled
    if args.enable_audio:
        if not os.path.exists(audio_path1):
            print(f"Warning: Audio file not found at {audio_path1}")
            audio_path1 = None

        if not os.path.exists(audio_path2):
            print(f"Warning: Audio file not found at {audio_path2}")
            audio_path2 = None

    # Audio criteria only apply when both sides actually have an audio track.
    # The same flag is used for the evaluator prompt and the coding-agent
    # review so both are judged against identical criteria.
    has_audio = bool(args.enable_audio) and audio_path1 is not None and audio_path2 is not None

    # Determine output directory and create it if it doesn't exist
    output_dir = args.output_dir if args.output_dir else folder1
    if args.output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Create a prompt for relative evaluation
    prompt = create_relative_evaluation_prompt(
        args.content_description,
        args.content_type,
        has_audio,
        with_AI=with_AI,
    )

    # Generate timestamp for unique filenames
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    comparisons = []
    description_feedbacks = []

    # Both modes start the evaluator conversation with the optional system prompt.
    messages = []
    if system_prompt_eval is not None:
        messages.append({"role": "system", "content": system_prompt_eval})

    if args.multiround:
        # Generate description feedbacks
        print("Generating description feedbacks for coding evaluation...")

        # Generate description feedback for folder1
        description_feedback1, current_seed = get_description_feedback(
            system_prompt_eval,
            evaluator_model,
            video_path1,
            audio_path=audio_path1,
            max_tokens=args.feedback_max_tokens,
            seed=current_seed,
            temp_feedback=0.0,
            direct=True,
            top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty
        )

        # Generate description feedback for folder2
        description_feedback2, current_seed = get_description_feedback(
            system_prompt_eval,
            evaluator_model,
            video_path2,
            audio_path=audio_path2,
            max_tokens=args.feedback_max_tokens,
            seed=current_seed,
            temp_feedback=0.0,
            direct=True,
            top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty
        )

        combined_description = f"CONTENT A DESCRIPTION:\n{description_feedback1}\n\nCONTENT B DESCRIPTION:\n{description_feedback2}"
        description_feedbacks.append(combined_description)

        content_a_prompt = "Please describe the video and the corresponding audio (Content A):"
        content_b_prompt = "Please describe the video and the corresponding audio (Content B):"
        messages = create_multimodal_relative_multiround_prompt_for_qwen(
            text_prompt=prompt,  # Relative evaluation instructions
            text1=content_a_prompt,  # Content A:
            video_path1=video_path1,  # Content A - video
            audio_path1=audio_path1,  # Content A - audio
            answer1=description_feedback1,  # Content A - description (Assistant)
            text2=content_b_prompt,  # Content B:
            video_path2=video_path2,  # Content B - video
            audio_path2=audio_path2,  # Content B - audio
            answer2=description_feedback2,  # Content B - description (Assistant)
            messages=messages,
            sampling_rate=args.sampling_rate,
        )
        print("Generating relative evaluation")
    else:
        content_a_prompt = "The following video with audio is Content A:"
        content_b_prompt = "The following video with audio is Content B:"
        messages = create_multimodal_relative_prompt_for_qwen(
            text_prompt=prompt,  # Relative evaluation instructions
            text1=content_a_prompt,  # Content A:
            video_path1=video_path1,  # Content A - video
            audio_path1=audio_path1,  # Content A - audio
            text2=content_b_prompt,  # Content B:
            video_path2=video_path2,  # Content B - video
            audio_path2=audio_path2,  # Content B - audio
            messages=messages,
            sampling_rate=args.sampling_rate,
            text_before=text_before
        )
        print("Generating relative evaluation with temperature=0.0")

    # Generate the evaluation (same sampling setup for both modes)
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=args.top_p,
        top_k=args.top_k,
        repetition_penalty=args.repetition_penalty,
        max_tokens=args.evaluation_max_tokens,
        seed=current_seed,
    )
    current_seed += 1
    response = evaluator_model.generate(messages, sampling_params)
    comparisons.append(response[0].outputs[0].text)

    # Process comparisons based on coding_evaluation flag
    if args.coding_evaluation:
        # Have coding agent review comparisons and determine final results
        print("Using coding agent to review relative evaluations...")
        comparison_results, review, current_seed, prompt_review = review_relative_evaluations_with_coding_agent(
            system_prompt,
            coding_model,
            comparisons,
            args.content_description,
            content_type=args.content_type,
            max_tokens=args.generation_max_tokens,
            seed=current_seed,
            name1=name1,
            name2=name2,
            description_feedbacks=description_feedbacks or None,
            has_audio=has_audio,
            temp_coding=0.0,
            top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty,
            with_AI=with_AI,
        )

        # Save the review
        review_path = os.path.join(output_dir, f"coding_relative_evaluation_review_A_{name1}_vs_B_{name2}_{timestamp}.txt")
        with open(review_path, "w", encoding="utf-8") as f:
            f.write(prompt_review + "\n\nREVIEW:\n\n" + review)
        print(f"Coding agent's relative evaluation review saved to: {review_path}")
    else:
        # Extract the verdict from every comparison and aggregate
        winner_counts, scores = _tally_relative_answers(comparisons, name1, name2)

        # Log how many evaluations had valid scores
        print(f"Found valid scores in {len(scores)} out of {len(comparisons)} evaluations")
        if not scores:
            print("Warning: No valid scores found in any evaluation")
            return {}, current_seed

        # Most common winner (first seen wins a draw) and the mean score
        comparison_results = {
            'OverallWinner': max(winner_counts, key=winner_counts.get),
            'OverallScore': sum(scores) / len(scores),
        }

    # Print comparison results
    print("\n" + "="*50)
    print("COMPARISON RESULTS:")
    print("="*50)
    for key, value in comparison_results.items():
        print(f"{key}: {value}")

    return comparison_results, current_seed

