import argparse
import os
import time
import cv2
import numpy as np
import torch
import random
from PIL import Image
import base64
import pathlib
import shutil
import tempfile
import subprocess
import fnmatch
import re
from io import StringIO
import threading
from pygltflib import GLTF2
import librosa
import soundfile as sf
import csv
import pandas as pd
import json
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
from relative_eval import evaluate_relative

# Import video utilities
from video_utils import create_multimodal_prompt_for_qwen, record_content
# Import VLLM client
from vllm_client import VLLMServerWrapper
# Import prompts
import prompts
from prompts import get_description_feedback, get_technical_feedback, get_evaluator_feedback
# Import asset utilities
from assets import scan_asset_directory, parse_gitignore, is_ignored_by_gitignore, process_asset_file, write_directory_tree, select_assets_with_model
# Import VLLM LLM for local model loading (optional)
from vllm import LLM, SamplingParams
import vllm.envs as envs
from vllm.assets.video import VideoAsset

def get_system_prompt(model_path):
    """
    Look up the system prompt associated with a model.

    This is a thin wrapper that delegates to ``prompts.get_system_prompt``.

    Args:
        model_path: Path (or identifier) of the model

    Returns:
        Whatever ``prompts.get_system_prompt`` returns for ``model_path``;
        the generation helpers in this file treat ``None`` as
        "no system prompt"
    """
    system_prompt = prompts.get_system_prompt(model_path)
    return system_prompt


def parse_arguments():
    """
    Build the command line parser and parse ``sys.argv``.

    Only ``--chromium_path`` is required; every other option has a default.
    ``--cdn_allowed`` defaults to True, so ``--no_cdn_allowed`` is provided
    as the way to switch CDN usage off (the positive flag is kept for
    backward compatibility).

    Returns:
        argparse.Namespace holding one attribute per option defined below
    """
    parser = argparse.ArgumentParser(description="VLM Agent for Iterative Video Game Building")
    
    # Model selection
    model_group = parser.add_mutually_exclusive_group()
    model_group.add_argument("--use_vllm_server", action="store_true",
                        help="Use VLLM server instead of loading model locally")
    
    # Asset directory
    parser.add_argument("--asset_dir", type=str, default=None,
                        help="Directory containing picture assets to be used in the game")

    # Local VLM settings
    parser.add_argument("--vllm_gpu_memory_utilization", type=float, default=0.8,
                        help="GPU memory utilization for VLLM")
    parser.add_argument("--tensor_parallel_size", type=int, default=1,
                        help="Tensor parallel size for VLLM")
    
    # Content settings
    parser.add_argument("--content_type", type=str, choices=["video-game", "animation", "website"], default="video-game",
                        help="Type of content to generate (video-game, animation, or website)")
    parser.add_argument("--content_description", type=str, default="",
                        help="Description of the content to be created")
    parser.add_argument("--output_dir", type=str, default="output",
                        help="Directory to save the final content and videos")

    # CSV dataset settings (if provided, it will extract content_description from the dataset)
    parser.add_argument("--dataset", type=str, default=None,
                        help="Path to a CSV file to use for content description and result storage")
    parser.add_argument("--row_index", type=int, default=None,
                        help="Row index in the CSV file to use (between 1 and 50)")
    

    parser.add_argument("--min_iterations", type=int, default=10,
                        help="Minimum number of improvement iterations (k); will stop when max_iter is reached or console logs is empty")
    # Fixed: the help text used to say "Minimum" for the upper bound
    parser.add_argument("--max_iterations", type=int, default=100,
                        help="Maximum number of improvement iterations")
    
    # Video recording settings
    parser.add_argument("--chromium_path", type=str, required=True,
                        help="Path to chromium")
    parser.add_argument("--video_duration", type=int, default=20,
                        help="Duration of gameplay/animation video in seconds")
    parser.add_argument("--video_fps", type=int, default=1,
                        help="Frames per second for gameplay/animation video")
    parser.add_argument("--server_root", type=str, default='',
                        help="Root of server host, should be above the assets and the html (if left as '' its not used)")
    
    # Browser settings are now automatically set to full screen
    
    # Generation settings
    parser.add_argument("--generation_max_tokens", type=int, default=None,
                        help="Maximum number of tokens for content generation and improvement")
    parser.add_argument("--feedback_max_tokens", type=int, default=3200,
                        help="Maximum number of tokens for feedback generation")
    parser.add_argument("--evaluation_max_tokens", type=int, default=3200,
                        help="Maximum number of tokens for content evaluation")
    
    # Debug settings
    parser.add_argument("--debug", action="store_true",
                        help="Enable debug mode to save and evaluate the content at every iteration")
    
    # CDN settings
    # store_true combined with default=True can never yield False, so a
    # negative flag writing to the same destination is offered alongside it.
    parser.add_argument("--cdn_allowed", action="store_true", default=True,
                        help="Allow external JavaScript libraries via CDN (default: True)")
    parser.add_argument("--no_cdn_allowed", dest="cdn_allowed", action="store_false",
                        help="Disallow external JavaScript libraries via CDN")
    
    # Audio settings
    parser.add_argument("--enable_audio", action="store_true",
                        help="Enable audio recording and processing")
    parser.add_argument("--sampling_rate", type=int, default=44100,
                        help="Sampling rate for audio")
    parser.add_argument("--monitor_source", type=str, default="default",
                        help="***IMPORTANT: FOR AUDIO, YOU MUST SET UP AUDIOPULSE IN ADVANCE AND PROVIDE THE SOURCE")

    # Asset settings
    parser.add_argument("--allowed_asset_types", type=str, nargs="+", 
                        default=["png","jpeg","jpg","ogg","mp3","wav","glb"],
                        help="List of allowed asset file extensions")
    parser.add_argument("--max_sample_packs", type=int, default=1,
                        help="Maximum number of sample packs to select from each modality")
    parser.add_argument("--select_assets", action="store_true",
                        help="Prompt the coding model to choose asset packs and assets")
    parser.add_argument("--select_sample_packs", action="store_true",
                        help="Prompt the coding model to choose only asset packs and use all assets in those packs")
    parser.add_argument("--assets_selection", type=str, choices=["individual", "combined"], default="individual",
                        help="How to select assets: 'individual' processes each pack separately, 'combined' selects from all packs at once")
    parser.add_argument("--max_assets", type=int, default=20,
                        help="Maximum number of assets to select when using --select_assets")
    parser.add_argument("--max_assets_per_pack", type=int, default=400, # choose a reasonable number
                        help="Maximum number of assets per pack to process in a single selection (if exceeded, pack will be split into multiple selections)")
    parser.add_argument("--max_tries", type=int, default=10,
                        help="Maximum number of retry attempts if asset selection fails")
    parser.add_argument("--assets_structure", type=str, choices=["compressed", "full"], default="compressed",
                        help="How to display asset directory structure ('full' shows complete paths, 'compressed' uses connectors)")
    
    
    # Coding model settings
    parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-VL-32B-Instruct",
                        help="Path to the VLM model")
    parser.add_argument("--vllm_server_url", type=str, default="http://localhost:8000",
                        help="URL of the VLLM server (default: http://localhost:8000)")
    parser.add_argument("--api_key", type=str, default="token-abc123",
                        help="API key for the VLLM server (default: token-abc123)")
    
    # Assets agent settings
    parser.add_argument("--assets_agent_model_path", type=str, default=None,
                        help="Path to the model for asset selection (if not set, uses coding agent)")
    parser.add_argument("--assets_agent_vllm_server_url", type=str, default=None,
                        help="URL of the VLLM server for assets agent (defaults to --vllm_server_url if not specified)")
    parser.add_argument("--assets_agent_api_key", type=str, default=None,
                        help="API key for the VLLM server for assets agent (defaults to --api_key if not specified)")

    parser.add_argument("--server_timeout", type=int, default=600, # 10min timeout
                        help="Timeout time for queries on a vllm server")


    # Separate evaluator model settings
    parser.add_argument("--use_separate_evaluator", action="store_true",
                        help="Use separate models for coding and evaluation")

    # Evaluator model settings
    parser.add_argument("--evaluator_model_path", type=str, 
                        default="Qwen/Qwen2.5-VL-32B-Instruct",
                        help="Path to the evaluator model (must support video)")
    parser.add_argument("--evaluator_vllm_server_url", type=str, default=None,
                        help="URL of the VLLM server for the evaluator model (defaults to --vllm_server_url if not specified)")
    parser.add_argument("--evaluator_api_key", type=str, default=None,
                        help="API key for the VLLM server for the evaluator model (defaults to --api_key if not specified)")
    
    # Separate feedbacks
    parser.add_argument("--description_feedback", action="store_true",
                        help="Ask the feedback agent to describe the video and audio before providing feedback")
    parser.add_argument("--no_feedback", action="store_true",
                        help="If true, dont provide the regular feedback (then you can give it description or meta-prompt instead)")

    # Video feedback settings
    parser.add_argument("--without_video_feedback", action="store_true",
                        help="Disable video feedback during development (code improvements without seeing results)")

    # Reproducibility settings
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility (default: 42)")
    
    # Best-of-k settings
    parser.add_argument("--best_of_k", type=int, default=1,
                        help="Number of parallel candidates to generate and select best from (default: 1)")
    parser.add_argument("--compare_to_current_code", action="store_true",
                        help="If True, when doing best_of_k, compare current code vs first candidate, then best vs remaining candidates. Even with best_of_k=1, comparison still happens.")
    parser.add_argument("--initial_best_of_k", type=int, default=1,
                        help="For the initial prompt only: Number of parallel candidates to generate and select best from (default: 1)")
    parser.add_argument("--initial_best_of_k_no_console_error", action="store_true",
                        help="If True, when doing best-of-k for initial, only consider candidates without 'SEVERE' console errors if any exist. If all candidates have SEVERE errors, consider all candidates.")
    parser.add_argument("--best_of_k_no_console_error", action="store_true",
                        help="If True, when doing best-of-k, only consider candidates without 'SEVERE' console errors if any exist. If all candidates have SEVERE errors, consider all candidates.")
    parser.add_argument("--multiround", action="store_true",
                        help="If True, use multiround description, then comparision for the relative evaluation")

    # Temperature settings
    parser.add_argument("--temp_coding", type=float, default=0.6,
                        help="Temperature for initial content and improvement steps")
    parser.add_argument("--temp_feedback", type=float, default=0.0,
                        help="Temperature for all feedback steps")
    parser.add_argument("--top_p", type=float, default=0.95,
                        help="Top-p (nucleus) sampling value passed to the sampling parameters (default: 0.95)")
    parser.add_argument("--top_k", type=int, default=-1,
                        help="Top-k sampling value passed to the sampling parameters (default: -1)")
    parser.add_argument("--repetition_penalty", type=float, default=1.00,
                        help="Repetition penalty passed to the sampling parameters (default: 1.0)")

    # Evaluation settings
    parser.add_argument("--coding_evaluation", action="store_true",
                        help="If True, show all evaluations to the coding agent for review instead of averaging scores")

    # Early exit settings
    parser.add_argument("--early_exit", action="store_true",
                        help="Allow model to exit early if satisfied with the current content")
    # Search/replace settings
    parser.add_argument("--search_replace", action="store_true",
                        help="Allow the model to use search/replace for targeted code changes during improvement steps")
    
    # Memory settings
    parser.add_argument("--use_memory", action="store_true",
                        help="Enable memory functionality to provide coding agent with past improvement notes")
    parser.add_argument("--memory_len", type=int, default=3,
                        help="Number of past memories to keep and provide to the coding agent (default: 3)")
    # Xvfb Display number, it must be unique for each parallel process (so if you run 4 processes of video_game_builder.py simultaneously, they must have a unique display_num)
    # Fixed: the help text used to be a copy of the --early_exit help
    parser.add_argument("--display_num", type=int, default=None,
                        help="Xvfb display number; must be unique for each process running in parallel")
    
    # Auto-resume functionality
    parser.add_argument("--auto_resume", action="store_true",
                        help="Automatically resume from previous progress if final_content files or temp_content files exist")
    
    return parser.parse_args()

def setup_model(args, model_path=None, server_url=None, api_key=None):
    """
    Create the model object (remote VLLM server wrapper or local VLLM engine)
    and tag it with the modality flags used by the rest of the pipeline.

    Args:
        args: Command line arguments
        model_path: Optional override for ``args.model_path``
        server_url: Optional override for ``args.vllm_server_url``
        api_key: Optional override for ``args.api_key``

    Returns:
        The model object, with ``is_text_only``, ``model_process_audio``,
        ``content_has_audio`` and ``sampling_rate`` attributes set on it

    Raises:
        ImportError: If a local model is requested but ``LLM`` is None
    """
    # Any falsy override falls back to the command line value
    if not model_path:
        model_path = args.model_path

    if not args.use_vllm_server:
        if LLM is None:
            raise ImportError("vllm module not found. Please install vllm or use --use_vllm_server option.")

        print(f"Loading model locally: {model_path}")

        # Local engine, configured from the CLI settings
        model = LLM(
            model=model_path,
            gpu_memory_utilization=args.vllm_gpu_memory_utilization,
            tensor_parallel_size=args.tensor_parallel_size,
            trust_remote_code=True,
            dtype="bfloat16",
        )
    else:
        # Remote server: overrides win, otherwise use the coding-model defaults
        if not server_url:
            server_url = args.vllm_server_url
        if not api_key:
            api_key = args.api_key
        print(f"Using VLLM server at: {server_url}")
        model = VLLMServerWrapper(server_url=server_url, api_key=api_key, model=model_path, timeout=args.server_timeout)

    # Modalities are inferred purely from substrings of the model path
    multimodal_tags = ("VL", "Omni", "Orsta", "EchoInk")
    audio_tags = ("Omni", "EchoInk")
    model.is_text_only = not any(tag in model_path for tag in multimodal_tags)
    # Whether the model itself can consume audio
    model.model_process_audio = args.enable_audio and not model.is_text_only and any(tag in model_path for tag in audio_tags)
    # Whether the generated content is expected to contain audio
    model.content_has_audio = args.enable_audio
    model.sampling_rate = args.sampling_rate

    if model.is_text_only:
        capability_message = f"Model {model_path} can only process text"
    elif model.model_process_audio:
        capability_message = f"Model {model_path} can process video and audio"
    else:
        capability_message = f"Model {model_path} can process video (no audio)"
    print(capability_message)

    return model

def generate_initial_content(system_prompt, llm, content_description, content_type="video-game", max_tokens=4096, asset_info=None, seed=42, temp_coding=0.7, top_p=0.95, top_k=-1, repetition_penalty=1.0, cdn_allowed=True):
    """
    Ask the model for the first version of the content.

    Args:
        system_prompt: System prompt to prepend, or None for no system message
        llm: The model; must expose ``content_has_audio`` and ``generate``
        content_description: Description of the content to create
        content_type: Type of content to generate
        max_tokens: Maximum number of tokens to generate
        asset_info: Asset information forwarded to the prompt builder (optional)
        seed: Random seed used for this generation
        temp_coding: Sampling temperature
        top_p: Top-p sampling value
        top_k: Top-k sampling value
        repetition_penalty: Repetition penalty
        cdn_allowed: Whether to allow CDN libraries

    Returns:
        Tuple ``(generated_text, messages, seed)``: the raw model output, the
        conversation including the assistant reply, and the seed incremented
        by one for the next generation
    """
    print(f"Generating initial {content_type}...")

    # Build the user prompt
    prompt = prompts.get_initial_content_prompt(content_description, content_type, asset_info, llm.content_has_audio, cdn_allowed)

    sampling_params = SamplingParams(
        temperature=temp_coding,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        max_tokens=max_tokens,
        seed=seed
    )

    # The caller gets a bumped seed so the next generation differs
    next_seed = seed + 1

    # Fresh conversation: optional system message followed by the prompt
    messages = [] if system_prompt is None else [{"role": "system", "content": system_prompt}]
    messages.append({"role": "user", "content": prompt})

    print(prompt)
    response = llm.generate(messages, sampling_params)

    generated_text = response[0].outputs[0].text
    messages.append({"role": "assistant", "content": generated_text})

    return generated_text, messages, next_seed


def generate_improvement(system_prompt, llm, current_code, video_path, content_description, content_type="video-game", conversation_history=None, max_tokens=4096, i=0, asset_info=None, evaluator_feedback=None, console_logs_path=None, seed=42, early_exit=False, temp_coding=0.7, search_replace=False, memories=None, top_p=0.95, top_k=-1, repetition_penalty=1.0, cdn_allowed=True):
    """
    Ask the model for an improved version of the content.

    Args:
        system_prompt: System prompt used when starting a fresh conversation
        llm: The model; must expose ``content_has_audio``, ``is_text_only``,
            ``model_process_audio``, ``sampling_rate`` and ``generate``
        current_code: Current HTML code of the content
        video_path: Path to the content video, or None to omit the video
        content_description: Description of the content
        content_type: Type of content
        conversation_history: Previous conversation for continual prompting;
            when provided it is reused as the message list (the text-only
            path appends to it in place)
        max_tokens: Maximum number of tokens to generate
        i: Iteration number
        asset_info: Asset information for prompts (optional)
        evaluator_feedback: Feedback from the evaluator model (optional)
        console_logs_path: Path to the console logs file (optional)
        seed: Random seed for generation
        early_exit: Whether to allow early exit
        temp_coding: Temperature for generation
        search_replace: Whether to allow search/replace format
        memories: List of past memory entries (optional)
        top_p: Top-p sampling value
        top_k: Top-k sampling value
        repetition_penalty: Repetition penalty
        cdn_allowed: Accepted for signature parity; not used in this function

    Returns:
        Tuple ``(generated_text, messages, seed)``: the raw model output, the
        conversation including the assistant reply, and the seed incremented
        by one
    """
    print(f"Generating {content_type} improvement...")

    # Result is currently not used below; the call is kept as-is
    critical_issues = prompts.get_critical_issues(content_type, llm.content_has_audio)

    # Optional prompt sections
    feedback_section = prompts.get_feedback_section(evaluator_feedback)
    console_logs_section = prompts.get_console_logs_section(console_logs_path)

    # With an ongoing conversation past iteration 0, the short prompt suffices;
    # otherwise the full prompt (description, code, assets, logs) is needed
    continuing = bool(conversation_history) and i > 0
    if continuing:
        text_prompt = prompts.get_simplified_improvement_prompt(content_type, feedback_section, early_exit, search_replace, memories)
    else:
        text_prompt = prompts.get_improvement_prompt(content_description, content_type, current_code, asset_info, feedback_section, console_logs_section, llm.content_has_audio, early_exit, search_replace, memories)

    # Continue the existing conversation or start a new one
    if conversation_history:
        messages = conversation_history
    elif system_prompt is not None:
        messages = [{"role": "system", "content": system_prompt}]
    else:
        messages = []

    # The video is attached only for multimodal models when a video exists
    # (video_path is None when running without video feedback)
    attach_video = not llm.is_text_only and video_path is not None
    if attach_video:
        audio_path = None
        if llm.model_process_audio:
            # The audio track sits next to the video, with a .wav extension
            audio_path = os.path.splitext(video_path)[0] + ".wav"
        messages = create_multimodal_prompt_for_qwen(
            text_prompt,
            video_path=video_path,
            audio_path=audio_path,
            messages=messages,
            sampling_rate=llm.sampling_rate
        )
    else:
        messages.append({"role": "user", "content": text_prompt})

    print(max_tokens)
    sampling_params = SamplingParams(
        temperature=temp_coding,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        max_tokens=max_tokens,
        seed=seed
    )

    # The caller gets a bumped seed so the next generation differs
    next_seed = seed + 1

    response = llm.generate(messages, sampling_params)

    generated_text = response[0].outputs[0].text
    messages += [{"role": "assistant", "content": generated_text}]

    return generated_text, messages, next_seed

def evaluate_content(system_prompt, llm, video_path, content_description, content_type="video-game", max_tokens=1024, console_logs_path=None, seed=42, top_p=0.95, top_k=-1, repetition_penalty=1.0, with_AI=True):
    """
    Have the model evaluate the recorded content.

    Args:
        system_prompt: System prompt to prepend, or None for no system message
        llm: The model; must expose ``model_process_audio``, ``sampling_rate``
            and ``generate``
        video_path: Path to the content video
        content_description: Description of the content
        content_type: Type of content ("video-game", "animation", or "website")
        max_tokens: Maximum number of tokens to generate
        console_logs_path: Path to the console logs file (optional)
        seed: Random seed for generation
        top_p: Top-p sampling value
        top_k: Top-k sampling value
        repetition_penalty: Repetition penalty
        with_AI: Forwarded to ``prompts.get_evaluation_prompt``

    Returns:
        Tuple ``(evaluations, seed)``: a list holding the single evaluation
        text produced at temperature 0, and the seed incremented by one
    """
    print(f"Evaluating {content_type}...")

    # Text part of the prompt, including any console logs
    console_logs_section = prompts.get_console_logs_section(console_logs_path)
    text_prompt = prompts.get_evaluation_prompt(content_description, content_type, console_logs_section, llm.model_process_audio, with_AI=with_AI)

    # Optional system message, then the multimodal user turn
    base_messages = [{"role": "system", "content": system_prompt}] if system_prompt is not None else []

    # Audio is taken from the .wav next to the video when the model handles audio
    audio_path = os.path.splitext(video_path)[0] + ".wav" if llm.model_process_audio else None
    messages = create_multimodal_prompt_for_qwen(
        text_prompt,
        video_path=video_path,
        audio_path=audio_path,
        messages=base_messages,
        sampling_rate=llm.sampling_rate
    )

    if llm.model_process_audio:
        print("Evaluating content with audio enabled")

    # Evaluation always runs at temperature 0
    sampling_params_temp0 = SamplingParams(
        temperature=0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        max_tokens=max_tokens,
        seed=seed
    )

    # The caller gets a bumped seed for the next generation
    next_seed = seed + 1

    response_temp0 = llm.generate(messages, sampling_params_temp0)

    # A list is returned even though only one evaluation is produced
    return [response_temp0[0].outputs[0].text], next_seed


def clean_search_replace_markers(code):
    """
    Strip leftover search/replace markers out of a piece of code.

    Lines consisting solely of a marker (ignoring surrounding whitespace) are
    dropped; markers embedded in a longer line are deleted from that line.
    Afterwards every run of consecutive blank lines is collapsed to its
    first line.

    Args:
        code: The code to clean (may be None)

    Returns:
        The cleaned code, or None when ``code`` is None
    """
    if code is None:
        return None

    markers = ("<<<<<<< SEARCH", "=======", ">>>>>>> REPLACE")

    def _scrub(line):
        # Markers are handled in order; returns None when the line is dropped
        for marker in markers:
            if line.strip() == marker:
                return None
            line = line.replace(marker, '')
        return line

    scrubbed = [kept for kept in map(_scrub, code.split('\n')) if kept is not None]

    # Collapse blank runs: a blank line is kept only if the last kept line
    # was not blank as well
    collapsed = []
    for line in scrubbed:
        if line.strip() == '' and collapsed and collapsed[-1].strip() == '':
            continue
        collapsed.append(line)

    return '\n'.join(collapsed)

def validate_search_replace_content(search_content, replace_content):
    """
    Check that neither side of a search/replace block contains a marker.

    A warning naming the first offending marker is printed when the check
    fails.

    Args:
        search_content: The search content to validate
        replace_content: The replace content to validate

    Returns:
        True if neither text contains a marker, False otherwise
    """
    offending = next(
        (
            marker
            for marker in ("<<<<<<< SEARCH", "=======", ">>>>>>> REPLACE")
            if marker in search_content or marker in replace_content
        ),
        None,
    )
    if offending is None:
        return True

    print(f"Warning: Search/replace content contains problematic marker: {offending}")
    return False

def process_search_replace_blocks(generated_text, current_code):
    """
    Process search/replace blocks in the generated text and apply them to the current code.

    Each block has the form::

        <<<<<<< SEARCH
        ...text to find...
        =======
        ...replacement...
        >>>>>>> REPLACE

    Search and replace texts are stripped of surrounding whitespace, and only
    the first occurrence of the search text is replaced. Stray markers are
    cleaned from the code before and after the replacements (this also
    collapses runs of blank lines, see ``clean_search_replace_markers``).

    Args:
        generated_text: The text generated by the model
        current_code: The current HTML code

    Returns:
        The updated HTML code when at least one block changed the code.
        None when the text contains no complete marker pair, when
        ``current_code`` is None, or when no block could be applied, so that
        the caller can fall back to other extraction strategies instead of
        treating unchanged code as a successful edit.
    """
    start_marker = "<<<<<<< SEARCH"
    separator = "======="
    end_marker = ">>>>>>> REPLACE"

    # Check if the response contains search/replace blocks
    if start_marker not in generated_text or end_marker not in generated_text:
        return None

    # Nothing to patch
    if current_code is None:
        return None

    # First, clean any existing markers from current_code to prevent accumulation
    print("Cleaning existing search/replace markers from current code...")
    updated_code = clean_search_replace_markers(current_code)

    # Find all search/replace blocks
    blocks = []
    search_markers = [m.start() for m in re.finditer(re.escape(start_marker), generated_text)]

    print(f"Found {len(search_markers)} search/replace blocks to process")

    for i, start_idx in enumerate(search_markers):
        # Find the end of the search block
        separator_idx = generated_text.find(separator, start_idx)
        if separator_idx == -1:
            print(f"Warning: Block {i+1} missing separator '======='")
            continue

        # Find the end of the replace block
        end_idx = generated_text.find(end_marker, separator_idx)
        if end_idx == -1:
            print(f"Warning: Block {i+1} missing end marker '>>>>>>> REPLACE'")
            continue

        # Extract the search and replace content
        search_content = generated_text[start_idx + len(start_marker):separator_idx].strip()
        replace_content = generated_text[separator_idx + len(separator):end_idx].strip()

        # A marker inside either side means the block is malformed, e.g. it
        # borrowed the separator or end marker of a later block
        if not validate_search_replace_content(search_content, replace_content):
            print(f"Warning: Block {i+1} contains problematic markers, skipping")
            continue

        # Additional validation: ensure search content is not empty
        if not search_content:
            print(f"Warning: Block {i+1} has empty search content, skipping")
            continue

        blocks.append((search_content, replace_content))
        print(f"Block {i+1}: Valid search/replace block parsed")

    print(f"Successfully parsed {len(blocks)} valid search/replace blocks")

    # Apply each search/replace block
    successful_replacements = 0
    for i, (search_content, replace_content) in enumerate(blocks):
        # Check if the search content exists in the current code
        if search_content not in updated_code:
            print(f"Warning: Block {i+1} search content not found in current code")
            continue

        # Replace the first occurrence of the search content
        old_code = updated_code
        updated_code = updated_code.replace(search_content, replace_content, 1)

        if old_code != updated_code:
            successful_replacements += 1
            print(f"Applied search/replace block {i+1}: {len(search_content)} chars -> {len(replace_content)} chars")
        else:
            print(f"Warning: Block {i+1} replacement had no effect")

    print(f"Successfully applied {successful_replacements} out of {len(blocks)} search/replace blocks")

    # Returning the untouched code here would be reported as a successful
    # edit by the caller; signal failure instead
    if successful_replacements == 0:
        print("Warning: No search/replace block could be applied")
        return None

    # Final cleanup: remove any markers that might have been introduced
    final_code = clean_search_replace_markers(updated_code)

    return final_code

def extract_html_code(generated_text, current_code=None, use_memory=False):
    """
    Extract HTML code from the generated text.

    Processing order:
      1. Drop everything up to and including the last thinking suffix
         (``</think>`` or ``◁/think▷``), if any.
      2. If ``use_memory`` is set, extract notes from the remaining text.
      3. If ``current_code`` is given, try to apply search/replace blocks.
      4. Otherwise (or if step 3 yields nothing) take the content of the
         first ```html fenced block.

    Args:
        generated_text: The text generated by the model
        current_code: The current HTML code (optional)
        use_memory: Whether to extract notes from the response

    Returns:
        Tuple ``(code, success, notes)``:
        the extracted or updated HTML code (``current_code``, or "" when that
        is None, if nothing could be extracted),
        a boolean flag telling whether new code was obtained,
        and the extracted notes (None when memory is disabled or extraction
        failed before notes were read)
    """
    # Bound up front so the error path below can always return it; assigning
    # it only inside the try left it undefined when an early step raised
    # (e.g. generated_text is None)
    notes = None
    fallback_code = current_code if current_code is not None else ""

    try:
        # First, remove thinking process if present (before anything else):
        # find whichever thinking suffix occurs last in the text
        latest_pos, latest_suffix = -1, None
        for suffix in ("</think>", "◁/think▷"):
            pos = generated_text.rfind(suffix)
            if pos > latest_pos:
                latest_pos, latest_suffix = pos, suffix

        if latest_suffix is not None:
            generated_text = generated_text[latest_pos + len(latest_suffix):]
            print(f"Removed thinking process content before latest '{latest_suffix}' suffix")

        # Extract and store notes (if memory is enabled)
        if use_memory:
            notes = extract_notes_from_response(generated_text)

        # Then, check if the response contains search/replace blocks
        if current_code is not None:
            updated_code = process_search_replace_blocks(generated_text, current_code)
            if updated_code is not None:
                return updated_code, True, notes

        # If no search/replace blocks or processing failed, extract full HTML code
        # from the first ```html ... ``` fenced block
        _, fence, content_after_prefix = generated_text.partition("```html")
        if not fence:
            print("Warning: No ```html prefix found in generated text")
        elif "```" not in content_after_prefix:
            print("Warning: Found ```html prefix but no closing ``` suffix")
        else:
            html_content = content_after_prefix.split("```")[0].strip()
            if html_content:
                return html_content, True, notes

        return fallback_code, False, notes
    except Exception as e:
        # Best effort: keep the current code rather than aborting the run
        print(f"Error extracting HTML code: {e}")
        return fallback_code, False, notes

def load_memory_state(output_dir):
    """
    Read previously persisted memory entries from disk.

    Looks for ``memory_state.json`` inside ``output_dir`` and returns its
    decoded JSON content. Any error raised while locating, reading or
    decoding the file is reported as a warning and treated as "no memory".

    Args:
        output_dir: Directory expected to contain the memory file

    Returns:
        The decoded content of the memory file, or an empty list when the
        file is missing or could not be loaded
    """
    try:
        memory_file_path = os.path.join(output_dir, "memory_state.json")

        if os.path.exists(memory_file_path):
            with open(memory_file_path, "r", encoding="utf-8") as handle:
                loaded = json.load(handle)

            print(f"Loaded {len(loaded)} memory entries from: {memory_file_path}")
            return loaded

        # Nothing persisted yet
        return []

    except Exception as e:
        print(f"Warning: Failed to load memory state: {e}")
        return []

def save_memory_state(output_dir, memories):
    """
    Save memory to file for persistence.

    The entries are fully serialized before anything touches the disk, and
    the result is written to a temporary sibling file that is then moved over
    ``memory_state.json`` with ``os.replace``. A failure (for example an entry
    that is not JSON-serializable) therefore leaves a previously saved memory
    file intact instead of truncated. Failures are reported as a warning and
    never raised.

    Args:
        output_dir: Directory to save the memory file
        memories: List of memory entries to save
    """
    try:
        memory_file_path = os.path.join(output_dir, "memory_state.json")

        # Serialize first: if this raises, the existing file is left untouched.
        serialized = json.dumps(memories, indent=2, ensure_ascii=False)

        # Write next to the target, then swap it in so readers never see a
        # half-written file.
        temp_file_path = memory_file_path + ".tmp"
        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(serialized)
        os.replace(temp_file_path, memory_file_path)

        print(f"Saved {len(memories)} memory entries to: {memory_file_path}")

    except Exception as e:
        print(f"Warning: Failed to save memory state: {e}")

def add_memory_entry(memories, iteration, notes, max_len):
    """
    Add new memory entry and maintain max length.

    The new entry is appended to ``memories`` in place. When the list then
    exceeds ``max_len``, a new list holding only the most recent ``max_len``
    entries is returned, so callers should always use the return value.

    Args:
        memories: List of existing memory entries
        iteration: Current iteration number
        notes: Notes about what was done in this iteration
        max_len: Maximum number of memories to keep; zero or a negative
            value keeps no entries

    Returns:
        Updated list of memories (the input list itself when notes are empty
        or no trimming was needed)
    """
    if not notes or not notes.strip():
        print(f"Warning: Empty notes for iteration {iteration}, not adding to memory")
        return memories

    # Create and add the new memory entry
    memories.append({
        "iteration": iteration,
        "notes": notes.strip(),
        "timestamp": time.time()
    })

    # Keep only the last max_len entries
    if len(memories) > max_len:
        # A plain memories[-max_len:] would return the whole list for
        # max_len == 0 (because -0 == 0), so compute the start index instead.
        keep = max(max_len, 0)
        memories = memories[len(memories) - keep:]
        print(f"Memory trimmed to last {max_len} entries")

    print(f"Added memory entry for iteration {iteration}: {notes[:100]}{'...' if len(notes) > 100 else ''}")
    return memories

def extract_notes_from_response(response_text):
    """
    Pull the content of the first ``<notes>...</notes>`` section out of a
    model response.

    The tag match is case-insensitive and may span several lines. Blank lines
    are dropped, every remaining line is stripped, and at most the first
    three lines are kept.

    Args:
        response_text: The full response text from the model

    Returns:
        The cleaned notes joined with newlines, or None when no non-empty
        notes section is found or an error occurs
    """
    pattern = r'<notes>(.*?)</notes>'

    try:
        found = re.search(pattern, response_text, re.DOTALL | re.IGNORECASE)

        kept_lines = []
        if found is not None:
            for raw_line in found.group(1).strip().split('\n'):
                stripped = raw_line.strip()
                if stripped:
                    kept_lines.append(stripped)

        if kept_lines:
            # Only the first three non-empty lines are retained
            print(f"Extracted notes using pattern: {pattern}")
            return '\n'.join(kept_lines[:3])

        print("No notes found in response using standard patterns")
        return None

    except Exception as e:
        print(f"Error extracting notes from response: {e}")
        return None

def set_seed(seed):
    """
    Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with the same value so runs can be reproduced.

    When CUDA is available, cuDNN is additionally switched to deterministic
    mode with benchmarking disabled.

    Args:
        seed: The seed value to use
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Set deterministic behavior for CUDA if available
    if torch.cuda.is_available():
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    print(f"Random seed set to {seed} for reproducibility")

def process_csv_dataset(args):
    """
    Fill ``args.content_description`` from one row of the CSV dataset.

    The description is built as "<first column> - <second column>" of the
    row selected by the 1-based ``args.row_index``.

    Args:
        args: Command line arguments (reads ``dataset`` and ``row_index``,
            writes ``content_description``)

    Returns:
        ``args`` alone when ``args.dataset`` is not set; otherwise the tuple
        ``(args, df)`` where ``df`` is the loaded DataFrame

    Raises:
        ValueError: If the row index is missing, outside 1..50, beyond the
            number of rows, or the CSV has fewer than two columns. Any other
            error raised while reading the CSV is logged and re-raised too.
    """
    if not args.dataset:
        return args

    try:
        # Validate row_index
        requested_row = args.row_index
        if requested_row is None or requested_row < 1 or requested_row > 50:
            raise ValueError("Row index must be between 1 and 50")

        df = pd.read_csv(args.dataset)

        # The requested row must exist in the file
        row_count = len(df)
        if row_count < requested_row:
            raise ValueError(f"CSV file has only {row_count} rows, but row_index is {requested_row}")

        # row_index is 1-based while DataFrame positions are 0-based
        selected = df.iloc[requested_row - 1]

        if len(df.columns) < 2:
            raise ValueError("CSV file must have at least 2 columns")

        first_col, second_col = df.columns[0], df.columns[1]

        args.content_description = f"{selected[first_col]} - {selected[second_col]}"
        print(f"Using content description from CSV: '{args.content_description}'")

        return args, df

    except Exception as e:
        print(f"Error processing CSV dataset: {e}")
        raise

def extract_scores_from_evaluation(evaluation_text, content_type="video-game"):
    """
    Parse the comma-separated scores found inside the first
    ``<answer>...</answer>`` section of an evaluation.

    Scores are assigned to criteria by position. Each value may be a plain
    number or a fraction such as "8/10", of which only the numerator is
    used. The last criterion (AudioScore) is optional; values beyond the
    known criteria are ignored.

    Args:
        evaluation_text: The evaluation text containing scores
        content_type: Type of content ("video-game", "animation", or
            anything else, which is treated as "website")

    Returns:
        A dictionary mapping criterion names to float scores plus a
        'MeanScore' entry (mean of the parsed scores, rounded to 2 decimals),
        or None if there is no answer section, too few values, or a value
        that cannot be converted to a number
    """
    answer_match = re.search(r'<answer>(.*?)</answer>', evaluation_text, re.DOTALL)
    if not answer_match:
        # Without answer tags there is nothing to parse
        return None

    raw_values = [piece.strip() for piece in answer_match.group(1).strip().split(',')]

    # Only the third and fourth criteria depend on the content type
    type_specific_keys = {
        "video-game": ('GameplayScore', 'AIScore'),          # Gameplay Quality, AI Player Quality
        "animation": ('AnimationScore', 'CreativityScore'),  # Animation Smoothness, Creativity and Originality
    }.get(content_type, ('UserExperienceScore', 'FunctionalityScore'))  # website

    score_keys = [
        'DescriptionScore',   # Description Fidelity
        'VisualScore',        # Visual Design
        *type_specific_keys,
        'BehaviorScore',      # Behavior Correctness
        'AudioScore',         # Audio Quality (if present)
    ]

    # AudioScore might be optional, so one fewer value is still acceptable
    minimum_required = len(score_keys) - 1
    if len(raw_values) < minimum_required:
        print(f"Missing criteria: expected at least {minimum_required} scores, got {len(raw_values)}")
        return None

    scores = {}
    for key, raw in zip(score_keys, raw_values):
        # For "8/10"-style values only the numerator counts
        numeric_part = raw.split('/')[0].strip() if '/' in raw else raw
        try:
            scores[key] = float(numeric_part)
        except ValueError:
            # A value that is not numeric invalidates the whole evaluation
            print(f"Failed to convert score: {raw}")
            return None

    if scores:
        scores['MeanScore'] = round(sum(scores.values()) / len(scores), 2)

    return scores

def review_evaluations_with_coding_agent(system_prompt, llm, evaluations, content_description, content_type="video-game", max_tokens=1024, seed=42, description_feedbacks=None, temp_feedback=0.7, temp_coding=0.7, top_p=0.95, top_k=-1, repetition_penalty=1.0, with_AI=True):
    """
    Ask the coding agent to review all evaluations and produce final scores.

    A review prompt is built with prompts.get_review_evaluations_prompt, sent
    to the LLM as a single user message (preceded by the system prompt when
    one is given), and the scores are parsed from the generated review.

    Args:
        system_prompt: System prompt for the LLM (skipped when None)
        llm: The coding LLM; its ``content_has_audio`` attribute is forwarded
            to the prompt builder
        evaluations: List of evaluation texts
        content_description: Description of the content
        content_type: Type of content ("video-game", "animation", or "website")
        max_tokens: Maximum number of tokens to generate
        seed: Random seed for generation
        description_feedbacks: List of description feedbacks (optional)
        temp_feedback: Forwarded to the prompt builder (not used for sampling)
        temp_coding: Sampling temperature used for the review generation
        top_p: Sampling top-p value
        top_k: Sampling top-k value
        repetition_penalty: Sampling repetition penalty
        with_AI: Forwarded to the prompt builder

    Returns:
        Tuple of (scores parsed from the review or None, the review text,
        the seed incremented by one)
    """
    print(f"Having coding agent review evaluations...")

    # Get the prompt from prompts.py
    review_prompt = prompts.get_review_evaluations_prompt(
        content_description,
        content_type,
        evaluations,
        llm.content_has_audio,
        description_feedbacks,
        temp_feedback,
        with_AI=with_AI
    )

    # Optional system message first, then the review request
    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": review_prompt})

    sampling_params = SamplingParams(
        temperature=temp_coding,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        max_tokens=max_tokens,
        seed=seed
    )

    # Generate the review and read back the first completion
    generation = llm.generate(messages, sampling_params)
    review = generation[0].outputs[0].text

    final_scores = extract_scores_from_evaluation(review, content_type)

    # The caller receives the seed to use for the next generation
    return final_scores, review, seed + 1

def extract_scores_from_evaluations(evaluations, content_type="video-game"):
    """
    Extract and average scores from multiple evaluation texts.

    Each evaluation is parsed with extract_scores_from_evaluation; evaluations
    that yield no valid scores are skipped. Every criterion is averaged over
    the evaluations that contain it, and 'MeanScore' is the mean of those
    per-criterion averages. Criteria appear in the result in the order they
    are first seen, so the key order is the same on every run.

    Args:
        evaluations: List of evaluation texts
        content_type: Type of content ("video-game", "animation", or anything
            else, which is treated as "website")

    Returns:
        A dictionary with score names as keys and averaged score values
        (rounded to 2 decimals) as values. If no evaluation yields valid
        scores, every criterion for the content type and 'MeanScore' are -1.
    """
    all_scores = []

    # Extract scores from each evaluation
    for i, evaluation in enumerate(evaluations):
        scores = extract_scores_from_evaluation(evaluation, content_type)
        if scores is not None:
            print(f"Evaluation {i+1}: Valid scores extracted")
            all_scores.append(scores)
        else:
            print(f"Evaluation {i+1}: No valid scores found or missing criteria")

    # If no valid scores were found, return -1 for each criterion of the content type
    if not all_scores:
        print("No valid scores found in any evaluation")
        if content_type == "video-game":
            type_specific_keys = ('GameplayScore', 'AIScore')
        elif content_type == "animation":
            type_specific_keys = ('AnimationScore', 'CreativityScore')
        else:  # website
            type_specific_keys = ('UserExperienceScore', 'FunctionalityScore')
        fallback_keys = [
            'DescriptionScore',
            'VisualScore',
            *type_specific_keys,
            'BehaviorScore',
            'AudioScore',
            'MeanScore',
        ]
        return dict.fromkeys(fallback_keys, -1)

    # Collect the criteria in first-seen order. A set would iterate in a
    # hash-dependent order that changes between runs, which would reorder the
    # result keys (and anything derived from them). MeanScore is left out
    # because it is recomputed below.
    ordered_keys = list(dict.fromkeys(
        key for scores in all_scores for key in scores if key != 'MeanScore'
    ))

    result = {}

    print("\nAveraged scores calculation details:")
    for key in ordered_keys:
        # Every key came from at least one score dict, so values is never empty
        values = [scores[key] for scores in all_scores if key in scores]
        result[key] = round(sum(values) / len(values), 2)

        # Print the detailed calculation for debugging
        values_str = "+".join(str(val) for val in values)
        print(f"{key}: ({values_str})/{len(values)}={result[key]}")

    # Calculate mean score if scores were extracted
    if result:
        # Capture the count before MeanScore is inserted so the printed
        # divisor matches the one actually used
        criteria_count = len(result)
        score_details = "+".join(f"{key}={value}" for key, value in result.items())
        result['MeanScore'] = round(sum(result.values()) / criteria_count, 2)
        print(f"MeanScore: ({score_details})/{criteria_count}={result['MeanScore']}")

    return result

def update_csv_with_results(csv_path, row_index, scores, final_code, dir_name):
    """
    Write one run's scores and final code into a row of the CSV file.

    For every score a column named "<dir_name>_<score name>" is used, and the
    code goes into "<dir_name>_Code". Missing columns are created, existing
    cell values are overwritten with a warning, and the file is rewritten in
    place.

    Args:
        csv_path: Path to the CSV file
        row_index: Row index to update (1-based)
        scores: Dictionary of scores
        final_code: Final HTML code
        dir_name: Current directory name, used as column prefix

    Raises:
        ValueError: If the CSV has fewer than ``row_index`` rows. Any other
            error is logged and re-raised as well.
    """
    try:
        df = pd.read_csv(csv_path)

        # Check if the row exists
        if len(df) < row_index:
            raise ValueError(f"CSV file has only {len(df)} rows, but row_index is {row_index}")

        # read_csv yields a 0-based index, row_index is 1-based
        row_label = row_index - 1

        def write_cell(column, value):
            """Create the column if needed, warn on overwrite, then set the cell."""
            if column not in df.columns:
                df[column] = None
            if pd.notna(df.at[row_label, column]):
                print(f"Warning: Column '{column}' at row {row_index} already has a value. Overwriting.")
            df.at[row_label, column] = value

        # Scores first, in dictionary order, then the code column
        for score_name, score_value in scores.items():
            write_cell(f"{dir_name}_{score_name}", score_value)
        write_cell(f"{dir_name}_Code", final_code)

        # Save the updated CSV
        df.to_csv(csv_path, index=False)
        print(f"Successfully updated CSV file with scores and code at row {row_index}")

    except Exception as e:
        print(f"Error updating CSV file: {e}")
        raise

def save_resume_state(args, iteration, current_code, conversation_history, current_seed,
                     asset_tree=None, selected_assets=None, asset_info=None):
    """
    Save the current state for resuming later.

    The state is written as JSON to ``resume_state.json`` inside
    ``args.output_dir``. It is fully serialized before anything touches the
    disk and then moved into place with ``os.replace``, so a failure (for
    example a value that is not JSON-serializable) leaves a previously saved
    resume file intact instead of truncated. Failures are reported as a
    warning and never raised.

    Args:
        args: Command line arguments (reads ``output_dir``,
            ``content_description``, ``content_type`` and ``enable_audio``)
        iteration: Current iteration number
        current_code: Current HTML code
        conversation_history: Current conversation history
        current_seed: Current seed value
        asset_tree: Asset tree structure
        selected_assets: List of selected assets (stored as an empty list
            when falsy)
        asset_info: Asset information
    """
    try:
        resume_state = {
            "iteration": iteration,
            "current_code": current_code,
            "conversation_history": conversation_history,
            "current_seed": current_seed,
            "asset_tree": asset_tree,
            "selected_assets": selected_assets or [],
            "asset_info": asset_info,
            # Stored so that a later load can check the state belongs to the
            # same task configuration
            "content_description": args.content_description,
            "content_type": args.content_type,
            "enable_audio": args.enable_audio
        }

        resume_file_path = os.path.join(args.output_dir, "resume_state.json")

        # Serialize first: if this raises, the existing file is left untouched.
        serialized = json.dumps(resume_state, indent=2, ensure_ascii=False)

        # Write next to the target, then swap it in so an interrupted write
        # cannot leave a half-written resume file behind.
        temp_file_path = resume_file_path + ".tmp"
        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(serialized)
        os.replace(temp_file_path, resume_file_path)

        print(f"Resume state saved to: {resume_file_path}")

    except Exception as e:
        print(f"Warning: Failed to save resume state: {e}")

def load_resume_state(args):
    """
    Read ``resume_state.json`` from ``args.output_dir`` if it is usable.

    A stored state is only accepted when its content description, content
    type and audio setting all equal the current arguments.

    Args:
        args: Command line arguments

    Returns:
        Resume state dictionary, or None if the file is missing, does not
        match the current arguments, or cannot be loaded
    """
    try:
        resume_file_path = os.path.join(args.output_dir, "resume_state.json")

        if os.path.exists(resume_file_path):
            with open(resume_file_path, "r", encoding="utf-8") as handle:
                stored = json.load(handle)

            # The saved run must have used the same task configuration
            guarded_fields = ("content_description", "content_type", "enable_audio")
            mismatch = any(
                stored.get(field) != getattr(args, field) for field in guarded_fields
            )
            if mismatch:
                print("Warning: Resume state doesn't match current arguments, ignoring")
                return None

            print(f"Resume state loaded from: {resume_file_path}")
            return stored

        return None

    except Exception as e:
        print(f"Warning: Failed to load resume state: {e}")
        return None

def check_resume_state(args):
    """
    Check what stage we can resume from.

    The checks run in this order:
      1. Final HTML and video (and audio, when enabled) exist: the task is
         reported as completed.
      2. Final HTML exists but the final video does not: the final content is
         recorded with record_content and the task is reported as completed.
      3. Otherwise the output directory is scanned for complete
         temp_content_{i} file sets and the saved resume state is loaded.

    Args:
        args: Command line arguments

    Returns:
        Dictionary with resume information, or None if auto-resume is
        disabled or nothing to resume from was found. The "stage" key is one
        of "completed", "iteration" (valid resume state available) or
        "partial" (temp files found but no valid resume state).
    """
    if not args.auto_resume:
        return None

    print("Checking for auto-resume possibilities...")

    # Check for final completion
    final_html_path = os.path.join(args.output_dir, "final_content.html")
    final_video_path = os.path.join(args.output_dir, "final_content.mp4")
    final_audio_path = os.path.join(args.output_dir, "final_content.wav") if args.enable_audio else None

    final_html_exists = os.path.exists(final_html_path)
    final_video_exists = os.path.exists(final_video_path)

    if final_html_exists and final_video_exists:
        if not args.enable_audio or (final_audio_path and os.path.exists(final_audio_path)):
            print("Found completed final content files - task already completed!")
            return {"stage": "completed", "final_html_path": final_html_path, "final_video_path": final_video_path}

    if final_html_exists and not final_video_exists:
        # Record the final content video; the console-log results returned by
        # record_content are not needed here
        final_video_path, _, _ = record_content(
            final_html_path,
            final_video_path,
            args.video_duration,
            args.video_fps,
            enable_audio=args.enable_audio,
            sampling_rate=args.sampling_rate,
            display_num=args.display_num,
            monitor_source=args.monitor_source,
            server_root=args.server_root,
            chromium_path=args.chromium_path,
        )
        return {"stage": "completed", "final_html_path": final_html_path, "final_video_path": final_video_path}

    # Check for iteration resume - collect every iteration that has its HTML
    # and video files (and audio file, when audio is enabled)
    temp_files = []
    for i in range(args.max_iterations):
        temp_html_exists = os.path.exists(os.path.join(args.output_dir, f"temp_content_{i}.html"))
        temp_video_exists = os.path.exists(os.path.join(args.output_dir, f"temp_content_{i}.mp4"))
        temp_audio_exists = not args.enable_audio or os.path.exists(os.path.join(args.output_dir, f"temp_content_{i}.wav"))

        if temp_html_exists and temp_video_exists and temp_audio_exists:
            temp_files.append(i)

    # Highest complete iteration; 0 when none was found (only reported
    # together with a valid resume state)
    max_iteration = temp_files[-1] if temp_files else 0

    # Load resume state
    resume_state = load_resume_state(args)

    if resume_state:
        print(f"Found resume point at iteration {max_iteration}")
        return {
            "stage": "iteration",
            "iteration": max_iteration,
            "resume_state": resume_state,
            "temp_files": temp_files
        }

    if temp_files:
        print(f"Found temp files up to iteration {max_iteration} but no valid resume state")
        return {
            "stage": "partial",
            "iteration": max_iteration,
            "temp_files": temp_files
        }

    # Neither a resume state nor any complete temp file set exists. This
    # used to be unreachable and reported a "partial" stage at iteration 0.
    print("No resume state found, starting fresh")
    return None

def _generate_single_initial_content(i, args, system_prompt, coding_model, content_description, content_type, asset_info, current_seed):
    """
    Produce one initial content candidate and extract its HTML code.

    Args:
        i: Candidate index (0-based; also added to the seed)
        args: Command line arguments (generation and sampling settings)
        system_prompt: System prompt for the model
        coding_model: The coding model
        content_description: Description of the content to create
        content_type: Type of content to generate
        asset_info: Asset information
        current_seed: Seed value to which the candidate index is added

    Returns:
        Tuple of (code, conversation_history)
    """
    print(f"Generating initial candidate {i+1}...")

    # Each candidate gets its own seed, derived from its index
    generation_options = dict(
        content_type=content_type,
        max_tokens=args.generation_max_tokens,
        asset_info=asset_info,
        seed=current_seed + i,
        temp_coding=args.temp_coding,
        top_p=args.top_p,
        top_k=args.top_k,
        repetition_penalty=args.repetition_penalty,
        cdn_allowed=args.cdn_allowed,
    )
    raw_output, conversation_history, _ = generate_initial_content(
        system_prompt,
        coding_model,
        content_description,
        **generation_options
    )

    # Only the extracted code is needed; the success flag and notes are dropped
    initial_code, _, _ = extract_html_code(raw_output, current_code=None, use_memory=False)
    return (initial_code, conversation_history)

def generate_k_initial_content(args, system_prompt, coding_model, content_description, content_type, asset_info, current_seed, k):
    """
    Generate k different initial content versions in parallel.

    Each candidate is produced by _generate_single_initial_content on its own
    worker thread. That helper adds the candidate index to the seed it
    receives, so the unmodified base seed is passed here: candidate i uses
    seed ``current_seed + i``, and the returned seed ``current_seed + k`` does
    not overlap with any seed already used.

    Args:
        args: Command line arguments
        system_prompt: System prompt for the model
        coding_model: The coding model
        content_description: Description of the content to create
        content_type: Type of content to generate
        asset_info: Asset information
        current_seed: Current seed value
        k: Number of candidates to generate

    Returns:
        List of (code, conversation_history) tuples, in candidate order, and
        updated seed. A candidate whose generation failed is represented by
        ("", []). For k <= 0 the list is empty and the seed is unchanged.
    """
    print(f"Generating {k} initial content candidates in parallel...")

    # ThreadPoolExecutor rejects max_workers <= 0, so handle "nothing to do" here
    if k <= 0:
        return [], current_seed

    with ThreadPoolExecutor(max_workers=k) as executor:
        # Submit all k tasks simultaneously. The base seed is passed as-is
        # because the helper applies the per-candidate offset itself.
        futures = [
            executor.submit(
                _generate_single_initial_content,
                i, args, system_prompt, coding_model, content_description,
                content_type, asset_info, current_seed
            )
            for i in range(k)
        ]

        # Collect results in submission order so that candidates[i] always
        # belongs to candidate i
        candidates = []
        for i, future in enumerate(futures):
            try:
                result = future.result()
                candidates.append(result)
                print(f"Initial candidate {i+1}/{k} completed")
            except Exception as e:
                print(f"Error generating initial candidate {i+1}: {e}")
                # Create a fallback candidate
                candidates.append(("", []))

    # Update seed after all generations
    current_seed += k

    return candidates, current_seed

def _generate_single_feedback(i, args, system_prompt_feedback, evaluator_model, video_paths, audio_paths, content_description, console_logs_paths, current_seed, current_codes=None, asset_info=None, system_prompt=None, coding_model=None):
    """
    Produce the feedback text for one candidate.

    Evaluator feedback is requested unless ``args.no_feedback`` is set (in
    which case it is the empty string). When ``args.description_feedback`` is
    set, description feedback is requested as well and appended to it.

    Args:
        i: Candidate index (0-based; also added to the seed)
        args: Command line arguments
        system_prompt_feedback: System prompt for feedback model
        evaluator_model: The evaluator model
        video_paths: List of video paths for each candidate
        audio_paths: List of audio paths for each candidate
        content_description: Description of the content
        console_logs_paths: List of console logs paths for each candidate
        current_seed: Seed value to which the candidate index is added
        current_codes: List of current codes for each candidate (optional)
        asset_info: Asset information for prompts (optional)
        system_prompt: System prompt for coding model (not used in this function)
        coding_model: The coding model (not used in this function)

    Returns:
        Feedback string and updated seed
    """
    print(f"Generating feedback {i+1}...")

    def entry_for_candidate(per_candidate_list):
        """Return this candidate's entry of an optional per-candidate list."""
        return per_candidate_list[i] if per_candidate_list else None

    def video_for_candidate():
        """Return this candidate's video unless video feedback is disabled."""
        return None if args.without_video_feedback else video_paths[i]

    # Each feedback gets its own seed, derived from the candidate index
    seed = current_seed + i
    feedback_text = ""

    # Get feedback from evaluator model
    if not args.no_feedback:
        feedback_text, seed = get_evaluator_feedback(
            system_prompt_feedback,
            evaluator_model,
            video_for_candidate(),
            content_description,
            audio_path=entry_for_candidate(audio_paths),
            content_type=args.content_type,
            max_tokens=args.feedback_max_tokens,
            console_logs_path=entry_for_candidate(console_logs_paths),
            seed=seed,
            exclude_technical=False,
            current_code=entry_for_candidate(current_codes),
            asset_info=asset_info,
            temp_feedback=args.temp_feedback,
            top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty
        )

    # Optionally append description feedback, continuing from the seed
    # returned by the evaluator call
    if args.description_feedback:
        print(f"Using description feedback for candidate {i+1}...")
        description_text, seed = get_description_feedback(
            system_prompt_feedback,
            evaluator_model,
            video_for_candidate(),
            audio_path=entry_for_candidate(audio_paths),
            max_tokens=args.feedback_max_tokens,
            seed=seed,
            temp_feedback=args.temp_feedback,
            top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty
        )
        feedback_text = f"{feedback_text}{description_text}"

    return feedback_text, seed

def generate_k_feedback(args, system_prompt_feedback, evaluator_model, video_paths, audio_paths, content_description, console_logs_paths, current_seed, k, current_codes=None, asset_info=None, system_prompt=None, coding_model=None):
    """
    Generate k feedback versions in parallel.

    Each feedback is produced by _generate_single_feedback on its own worker
    thread. That helper adds the candidate index to the seed it receives, so
    the unmodified base seed is passed here: feedback i starts from seed
    ``current_seed + i``.

    Args:
        args: Command line arguments
        system_prompt_feedback: System prompt for feedback model
        evaluator_model: The evaluator model
        video_paths: List of video paths for each candidate
        audio_paths: List of audio paths for each candidate
        content_description: Description of the content
        console_logs_paths: List of console logs paths for each candidate
        current_seed: Current seed value
        k: Number of feedback to generate
        current_codes: List of current codes for each candidate (optional)
        asset_info: Asset information for prompts (optional)
        system_prompt: System prompt for coding model (optional, forwarded to the helper)
        coding_model: The coding model (optional, forwarded to the helper)

    Returns:
        List of feedback strings, in candidate order, and updated seed
        (``current_seed + k``). A feedback whose generation failed is
        represented by "". For k <= 0 the list is empty and the seed is
        unchanged.
    """
    print(f"Generating {k} feedback versions in parallel...")

    # ThreadPoolExecutor rejects max_workers <= 0, so handle "nothing to do" here
    if k <= 0:
        return [], current_seed

    with ThreadPoolExecutor(max_workers=k) as executor:
        # Submit all k tasks simultaneously. The base seed is passed as-is
        # because the helper applies the per-candidate offset itself.
        futures = [
            executor.submit(
                _generate_single_feedback,
                i, args, system_prompt_feedback, evaluator_model, video_paths,
                audio_paths, content_description, console_logs_paths, current_seed,
                current_codes, asset_info, system_prompt, coding_model
            )
            for i in range(k)
        ]

        # Collect results in submission order so that feedbacks[i] always
        # belongs to candidate i
        feedbacks = []
        for i, future in enumerate(futures):
            try:
                result = future.result()
                feedbacks.append(result[0])  # Extract feedback from (feedback, seed) tuple
                print(f"Feedback {i+1}/{k} completed")
            except Exception as e:
                print(f"Error generating feedback {i+1}: {e}")
                # Create a fallback feedback
                feedbacks.append("")

    # Update seed after all generations
    current_seed += k

    return feedbacks, current_seed

def _generate_single_improvement(i, args, system_prompt, coding_model, current_codes, video_paths, content_description, conversation_histories, feedbacks, console_logs_paths, asset_info, current_seed, iteration, memories):
    """
    Produce one improved version of candidate ``i``.

    Args:
        i: Candidate index (selects the per-candidate entries of the list arguments)
        args: Command line arguments
        system_prompt: System prompt for the model
        coding_model: The coding model
        current_codes: List of current codes for each candidate
        video_paths: List of video paths for each candidate
        content_description: Description of the content
        conversation_histories: List of conversation histories for each candidate (not read here)
        feedbacks: List of feedback for each candidate (may be empty/None)
        console_logs_paths: List of console logs paths for each candidate (may be empty/None)
        asset_info: Asset information
        current_seed: Base seed value; the candidate index is added to it
        iteration: Current iteration number
        memories: Memory entries (only forwarded when args.use_memory is set)

    Returns:
        Tuple of (improved_code, conversation_history, notes, improved_code_raw)
    """
    print(f"Generating improvement candidate {i+1}...")

    # Evaluator feedback for this candidate; description feedback is produced
    # separately in _generate_single_feedback().
    feedback_text = feedbacks[i] if feedbacks else ""
    base_code = current_codes[i]

    # Optional per-candidate inputs
    video_for_prompt = None if args.without_video_feedback else video_paths[i]
    logs_for_prompt = console_logs_paths[i] if console_logs_paths else None
    memory_entries = memories if args.use_memory else None

    generation_kwargs = {
        "system_prompt": system_prompt,
        "llm": coding_model,
        "current_code": base_code,
        "video_path": video_for_prompt,
        "content_description": content_description,
        "content_type": args.content_type,
        "max_tokens": args.generation_max_tokens,
        "i": iteration,
        "asset_info": asset_info,
        "evaluator_feedback": feedback_text,
        "console_logs_path": logs_for_prompt,
        # Each candidate gets its own seed derived from the base seed
        "seed": current_seed + i,
        "early_exit": args.early_exit,
        "temp_coding": args.temp_coding,
        "search_replace": args.search_replace,
        "memories": memory_entries,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "repetition_penalty": args.repetition_penalty,
        "cdn_allowed": args.cdn_allowed,
    }
    raw_output, history, _ = generate_improvement(**generation_kwargs)

    # The extraction status flag is not needed by callers of this helper
    extracted_code, _, notes = extract_html_code(raw_output, base_code, args.use_memory)
    return (extracted_code, history, notes, raw_output)

def generate_k_improvements(args, system_prompt, coding_model, current_codes, video_paths, content_description, conversation_histories, feedbacks, console_logs_paths, asset_info, current_seed, iteration, memories, k):
    """
    Generate k improvement versions in parallel.

    One worker thread per candidate runs _generate_single_improvement. Results
    are returned in candidate order (index 0 first), regardless of the order in
    which the workers finish.

    Args:
        args: Command line arguments
        system_prompt: System prompt for the model
        coding_model: The coding model
        current_codes: List of current codes for each candidate
        video_paths: List of video paths for each candidate
        content_description: Description of the content
        conversation_histories: List of conversation histories for each candidate
        feedbacks: List of feedback for each candidate
        console_logs_paths: List of console logs paths for each candidate
        asset_info: Asset information
        current_seed: Current seed value
        iteration: Current iteration number
        memories: Memory entries
        k: Number of improvements to generate

    Returns:
        List of (improved_code, conversation_history, notes, improved_code_raw) tuples and updated seed.
        A candidate whose generation raised is replaced by ("", [], None, "").
        For k <= 0 an empty list and the unchanged seed are returned.
    """
    # ThreadPoolExecutor rejects max_workers <= 0 with ValueError; there is
    # nothing to generate in that case anyway.
    if k <= 0:
        return [], current_seed

    print(f"Generating {k} improvement candidates in parallel...")

    with ThreadPoolExecutor(max_workers=k) as executor:
        # Submit all k tasks simultaneously.
        # The base seed is passed unchanged: _generate_single_improvement adds
        # the candidate index itself, so candidate i uses current_seed + i.
        # Adding i here as well would yield current_seed + 2*i, which runs past
        # the "+= k" advance below and collides with seeds of later rounds.
        futures = [
            executor.submit(
                _generate_single_improvement,
                i, args, system_prompt, coding_model, current_codes, video_paths,
                content_description, conversation_histories, feedbacks, console_logs_paths,
                asset_info, current_seed, iteration, memories
            )
            for i in range(k)
        ]

        # Collect results in submission order so list positions match candidate indices
        candidates = []
        for i, future in enumerate(futures):
            try:
                candidates.append(future.result())
                print(f"Improvement candidate {i+1}/{k} completed")
            except Exception as e:
                print(f"Error generating improvement candidate {i+1}: {e}")
                # Keep the slot so indices stay aligned with the other per-candidate lists
                candidates.append(("", [], None, ""))

    # Update seed after all generations (seeds current_seed .. current_seed+k-1 were consumed)
    current_seed += k

    return candidates, current_seed

def filter_candidates_by_console_errors(candidates, console_logs_list, args, current_code=None, current_code_console_logs=None, best_of_k_no_console_error=False):
    """
    Keep only the candidates whose console logs contain no "SEVERE" entry.

    Args:
        candidates: List of candidate tuples
        console_logs_list: List of console logs content for each candidate
        args: Command line arguments (only args.compare_to_current_code is read,
            and only when filtering is enabled)
        current_code: Current code (optional, for compare_to_current_code mode)
        current_code_console_logs: Console logs for current code (optional)
        best_of_k_no_console_error: Enables the filtering; when False the inputs
            are returned untouched

    Returns:
        Tuple of (filtered_candidates, filtered_console_logs_list, filtered_indices)
        where filtered_indices maps back to original candidate indices.
        The current code takes part in the "is anything clean?" decision but is
        never part of the returned lists. If nothing is clean, all candidates
        are returned. If only the current code is clean, the result is three
        empty lists.
    """
    def everything():
        # Pass-through result: the input lists themselves plus identity indices
        return candidates, console_logs_list, list(range(len(candidates)))

    if not best_of_k_no_console_error:
        return everything()

    print("Filtering candidates based on console errors...")

    # Each entry is (original_index, candidate, logs); index -1 marks the current code
    entries = []
    if args.compare_to_current_code and current_code is not None and current_code_console_logs is not None:
        entries.append((-1, ("current_code", None, None), current_code_console_logs))
    entries.extend(
        (position, candidate, console_logs_list[position])
        for position, candidate in enumerate(candidates)
    )

    # Empty or missing logs count as clean
    survivors = [entry for entry in entries if not (entry[2] and "SEVERE" in entry[2])]

    print(f"Found {len(survivors)} candidates without SEVERE errors out of {len(entries)} total candidates")

    if not survivors:
        print("All candidates have SEVERE errors, proceeding with all candidates")
        return everything()

    print("Using only candidates without SEVERE errors")
    # A clean current code can only sit in front; drop its placeholder from the output
    if args.compare_to_current_code and survivors[0][0] == -1:
        survivors = survivors[1:]

    kept_candidates = [entry[1] for entry in survivors]
    kept_logs = [entry[2] for entry in survivors]
    kept_indices = [entry[0] for entry in survivors]
    return kept_candidates, kept_logs, kept_indices

def select_best_candidate(args, candidates, current_seed, iteration, system_prompt, system_prompt_eval, coding_model, evaluator_model, current_code=None, console_logs_list=None, current_code_console_logs=None, best_of_k_no_console_error=False):
    """
    Select the best candidate from k candidates using tournament-style evaluation.

    Every candidate (and the current code, when args.compare_to_current_code is
    set and current_code is given) is written to a temporary HTML file in
    args.output_dir and recorded with record_content. The entries are then
    compared pairwise with evaluate_relative: the winner of each comparison
    meets the next entry. The temporary files are left in args.output_dir.

    Args:
        args: Command line arguments
        candidates: List of candidate tuples; the first element of each tuple is the code
        current_seed: Current seed value
        iteration: Current iteration number (used in the temporary file names)
        system_prompt: System prompt for coding
        system_prompt_eval: System prompt for evaluation
        coding_model: The coding model
        evaluator_model: The evaluator model
        current_code: Current code to compare against (optional, for compare_to_current_code mode)
        console_logs_list: Not read by this function; console logs are taken from the recordings made here
        current_code_console_logs: Not read by this function; see console_logs_list
        best_of_k_no_console_error: If True, entries whose console logs contain SEVERE
            errors are removed from the tournament (see filter_candidates_by_console_errors)

    Returns:
        Tuple of (index of best candidate, updated seed). The index always refers
        to the candidates list passed in, even when console-error filtering
        removed entries. -1 means the current code won. If recording the current
        code fails, (0, current_seed) is returned without any comparison.
    """
    k = len(candidates)
    include_current = args.compare_to_current_code and current_code is not None

    # With a single candidate and nothing to compare it against there is no
    # tournament to run. With compare_to_current_code the comparison still
    # happens for k == 1 (current code vs candidate 0).
    if k == 1 and not include_current:
        return 0, current_seed

    print(f"Selecting best candidate from {k} options using tournament evaluation...")

    def record(html_path, video_target):
        # Single place for the recording settings shared by all entries
        return record_content(
            html_path,
            video_target,
            args.video_duration,
            args.video_fps,
            enable_audio=args.enable_audio,
            sampling_rate=args.sampling_rate,
            display_num=args.display_num,
            monitor_source=args.monitor_source,
            server_root=args.server_root,
            chromium_path=args.chromium_path,
        )

    def read_logs(logs_path):
        # Missing log file is treated as "no console output"
        if logs_path and os.path.exists(logs_path):
            with open(logs_path, "r", encoding="utf-8") as f:
                return f.read()
        return ""

    # Parallel lists describing the tournament entries. entry_indices holds the
    # position of each entry in the caller's candidates list (-1 = current code)
    # so the winner can be reported in the caller's terms.
    entry_files = []
    entry_videos = []
    entry_indices = []
    candidate_console_logs = []

    # If compare_to_current_code is enabled, the current code is the first entry
    current_logs = None
    if include_current:
        print("Adding current code to comparison...")
        current_code_file = os.path.join(args.output_dir, f"temp_current_code_{iteration}.html")
        with open(current_code_file, "w", encoding="utf-8") as f:
            f.write(current_code)
        current_code_video = os.path.join(args.output_dir, f"temp_current_code_{iteration}.mp4")

        try:
            print("Recording content-current")
            video_path, console_logs_path, _ = record(current_code_file, current_code_video)
            current_logs = read_logs(console_logs_path)
        except Exception as e:
            print(f"Error recording video for current code: {e}")
            print("Skipping comparison due to video recording failure")
            # If we can't record video for current code, skip comparison entirely
            return 0, current_seed

        entry_files.append(current_code_file)
        entry_videos.append(video_path)
        entry_indices.append(-1)

    for i, candidate in enumerate(candidates):
        code = candidate[0]  # First element is always the code
        temp_file = os.path.join(args.output_dir, f"temp_candidate_{iteration}_{i}.html")
        with open(temp_file, "w", encoding="utf-8") as f:
            f.write(code)
        temp_video = os.path.join(args.output_dir, f"temp_candidate_{iteration}_{i}.mp4")

        print(f"Recording content-{i}")
        video_path, console_logs_path, _ = record(temp_file, temp_video)

        entry_files.append(temp_file)
        entry_videos.append(video_path)
        entry_indices.append(i)
        candidate_console_logs.append(read_logs(console_logs_path))

    # Apply console error filtering if enabled
    if best_of_k_no_console_error:
        _, _, kept_indices = filter_candidates_by_console_errors(
            candidates, candidate_console_logs, args,
            current_code=current_code if args.compare_to_current_code else None,
            current_code_console_logs=current_logs,
            best_of_k_no_console_error=best_of_k_no_console_error
        )
        kept = [idx for idx in kept_indices if idx >= 0]

        if len(kept) < k:
            print(f"Console error filtering reduced candidates from {k} to {len(kept)}")

            # If no generated candidate remains, fall back to all of them
            if not kept:
                print("No candidates passed console error filtering, using all original candidates")
                kept = list(range(k))

            # The filter helper strips the current-code placeholder from its
            # result, so whether the current code passed has to be decided
            # here with the same "SEVERE" criterion.
            keep_current = include_current and (
                -1 in kept_indices or "SEVERE" not in (current_logs or "")
            )

            # Positions in the entry lists that stay in the tournament; generated
            # candidates sit one slot later when the current code was recorded.
            offset = 1 if include_current else 0
            slots = ([0] if keep_current else []) + [idx + offset for idx in kept]
            entry_files = [entry_files[s] for s in slots]
            entry_videos = [entry_videos[s] for s in slots]
            entry_indices = [entry_indices[s] for s in slots]

    # Set up args for relative evaluation
    eval_args = argparse.Namespace()
    for attr in ['content_description', 'content_type', 'enable_audio', 'sampling_rate',
                'evaluation_max_tokens', 'coding_evaluation', 'description_feedback',
                'feedback_max_tokens', 'temp_feedback', 'generation_max_tokens', 'temp_coding',
                'top_p', 'top_k', 'repetition_penalty', 'output_dir', 'multiround']:
        setattr(eval_args, attr, getattr(args, attr))

    # Tournament-style selection: compare pairs and select winner.
    # current_best / i are positions in the entry lists, not candidate indices.
    current_best = 0
    for i in range(1, len(entry_files)):
        # Skip candidates with failed video recordings
        if entry_videos[current_best] is None or entry_videos[i] is None:
            print(f"Skipping comparison between candidate {current_best} and {i} due to missing video")
            continue

        print(f"Comparing candidate {current_best} vs candidate {i}...")

        name1 = os.path.splitext(os.path.basename(entry_files[current_best]))[0]
        name2 = os.path.splitext(os.path.basename(entry_files[i]))[0]

        try:
            # Perform relative evaluation
            comparison_results, current_seed = evaluate_relative(
                eval_args, args.output_dir, args.output_dir,
                coding_model=coding_model,
                evaluator_model=evaluator_model,
                system_prompt=system_prompt,
                system_prompt_eval=system_prompt_eval,
                current_seed=current_seed,
                file1=name1, file2=name2,
            )

            if comparison_results and 'OverallWinner' in comparison_results:
                winner_name = comparison_results['OverallWinner']
                # NOTE(review): 'B' is presumably the label evaluate_relative uses
                # for the second file - confirm against relative_eval.
                if name2 in winner_name or 'B' in winner_name:
                    print(f"Candidate {i} wins against candidate {current_best}")
                    current_best = i
                else:
                    print(f"Candidate {current_best} wins against candidate {i}")
            else:
                print(f"No clear winner, keeping candidate {current_best}")

        except Exception as e:
            # A failed comparison keeps the reigning entry
            print(f"Error in relative evaluation: {e}")
            print(f"Keeping candidate {current_best}")

    # Translate the tournament position back to the caller's candidate index.
    # With no entries at all (empty candidates, no current code) 0 is returned.
    best_index = entry_indices[current_best] if entry_indices else 0
    if best_index == -1:
        # Current code won, return -1 to indicate no change needed
        print("Current code selected as best (no improvement needed)")
        return -1, current_seed

    print(f"Selected candidate {best_index} as the best (was index {current_best} in comparison)")
    return best_index, current_seed

def cleanup_candidates(candidate_files):
    """
    Delete temporary candidate files on a best-effort basis.

    Paths that do not exist are skipped. Any failure while checking or
    removing a path is reported as a warning and does not stop the cleanup
    of the remaining paths.

    Args:
        candidate_files: List of file paths to clean up
    """
    for target in candidate_files:
        try:
            if not os.path.exists(target):
                continue
            os.remove(target)
        except Exception as err:
            print(f"Warning: Could not remove {target}: {err}")

def main():
    """Main function to run the VLM agent for iterative game building."""
    args = parse_arguments()
    
    # Ensure either content_description or dataset is provided
    if not args.content_description and not args.dataset:
        raise ValueError("Either --content_description or --dataset must be provided")
    
    # Process CSV dataset if provided
    df = None
    if args.dataset:
        args, df = process_csv_dataset(args)
    
    # Set random seed for reproducibility
    set_seed(args.seed)
    
    # Initialize the current seed for LLM generation
    current_seed = args.seed
    
    # Get system prompts based on model
    system_prompt = get_system_prompt(args.model_path)
    system_prompt_feedback = get_system_prompt(args.evaluator_model_path if args.use_separate_evaluator else args.model_path)
    system_prompt_eval = get_system_prompt(args.evaluator_model_path if args.use_separate_evaluator else args.model_path)
    
    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)
    
    # Check for resume state
    resume_info = check_resume_state(args)
    
    # Handle completed case
    if resume_info and resume_info["stage"] == "completed":
        print("Task already completed! Final files found:")
        print(f"- HTML: {resume_info['final_html_path']}")
        print(f"- Video: {resume_info['final_video_path']}")
        if args.enable_audio:
            final_audio_path = os.path.join(args.output_dir, "final_content.wav")
            if os.path.exists(final_audio_path):
                print(f"- Audio: {final_audio_path}")
        
        # Load and display final evaluation if it exists
        final_evaluation_path = os.path.join(args.output_dir, "final_evaluation.txt")
        if os.path.exists(final_evaluation_path):
            print(f"- Evaluation: {final_evaluation_path}")
            with open(final_evaluation_path, "r", encoding="utf-8") as f:
                evaluation_content = f.read()
            print("\nFinal Evaluation Summary:")
            print(evaluation_content)
        
        return
    
    # Define temporary file paths
    temp_html_path = os.path.join(args.output_dir, "temp_content.html")
    initial_html_path = os.path.join(args.output_dir, "initial_content.html")
    
    try:
        # Setup models based on configuration
        if args.use_separate_evaluator:
            print("Using separate models for coding and evaluation")
            
            # Setup coding model (can be text-only)
            coding_server_url = args.vllm_server_url
            coding_api_key = args.api_key
            print('<Coding-Model-setup>')
            coding_model = setup_model(
                args, 
                model_path=args.model_path,
                server_url=coding_server_url,
                api_key=coding_api_key
            )
            
            evaluator_server_url = args.evaluator_vllm_server_url or args.vllm_server_url
            evaluator_api_key = args.evaluator_api_key or args.api_key
            print('<Evaluator-Model-setup>')
            evaluator_model = setup_model(
                args, 
                model_path=args.evaluator_model_path,
                server_url=evaluator_server_url,
                api_key=evaluator_api_key
            )
            print(f"Using visual evaluator model: {args.evaluator_model_path}")
            if args.evaluator_vllm_server_url:
                print(f"Using evaluator model server URL: {evaluator_server_url}")
        else:
            coding_server_url = args.vllm_server_url
            coding_api_key = args.api_key
            print('<Coding-Model-setup>')
            # Use a single model for both coding and evaluation
            coding_model = evaluator_model = setup_model(args,
                model_path=args.model_path,
                server_url=coding_server_url,
                api_key=coding_api_key)
            print(f"Using single model for both coding and evaluation: {args.model_path}")
        
        # Setup assets agent (if specified, otherwise use coding model)
        if args.assets_agent_model_path:
            print("Using separate model for assets selection")
            assets_server_url = args.assets_agent_vllm_server_url or args.vllm_server_url
            assets_api_key = args.assets_agent_api_key or args.api_key
            print('<Assets-Agent-setup>')
            assets_model = setup_model(
                args,
                model_path=args.assets_agent_model_path,
                server_url=assets_server_url,
                api_key=assets_api_key
            )
            print(f"Using assets agent model: {args.assets_agent_model_path}")
            if args.assets_agent_vllm_server_url:
                print(f"Using assets agent server URL: {assets_server_url}")
        else:
            assets_model = coding_model
            print("Using coding model for assets selection")
                
        # Initialize variables for resume handling
        asset_tree = None
        selected_packs = []
        selected_assets = []
        asset_info = None
        conversation_history = None
        current_code = None
        start_iteration = 0
        memories = []
        
        # Initialize memory system if enabled
        if args.use_memory:
            print(f"Memory system enabled with memory length: {args.memory_len}")
            memories = load_memory_state(args.output_dir)
        
        # Handle resume cases
        if resume_info and resume_info["stage"] in ["iteration", "partial"]:
            resume_state = resume_info.get("resume_state")
            
            if resume_state:
                print(f"Resuming from saved state at iteration {resume_info['iteration']}")
                
                # Restore state from resume data
                current_code = resume_state["current_code"]
                conversation_history = resume_state["conversation_history"]
                current_seed = resume_state["current_seed"]
                asset_tree = resume_state.get("asset_tree")
                selected_assets = resume_state.get("selected_assets", [])
                asset_info = resume_state.get("asset_info")
                start_iteration = resume_info["iteration"]
                
                print(f"Restored state: iteration {resume_info['iteration']}, seed {current_seed}")
                print(f"Assets: {len(selected_assets)} selected assets")
                
                # Save current code to temp file for continuation
                with open(temp_html_path, "w", encoding="utf-8") as f:
                    f.write(current_code)
                    
            else:
                print(f"Found temp files up to iteration {resume_info['iteration']} but no resume state")
                print("Loading code from highest temp file and continuing without full state")
                
                # Load the highest iteration HTML file
                highest_iteration = resume_info["iteration"]
                temp_html_file = os.path.join(args.output_dir, f"temp_content_{highest_iteration}.html")
                
                if os.path.exists(temp_html_file):
                    with open(temp_html_file, "r", encoding="utf-8") as f:
                        current_code = f.read()
                    print(f"Loaded code from {temp_html_file}")
                    
                    # Copy to temp file for continuation
                    with open(temp_html_path, "w", encoding="utf-8") as f:
                        f.write(current_code)
                        
                    start_iteration = highest_iteration
                else:
                    print(f"Warning: Could not find {temp_html_file}, starting fresh")
                    resume_info = None
        
        # If not resuming, generate everything from scratch
        if not resume_info or resume_info["stage"] not in ["iteration", "partial"]:
                
            # Scan asset directory if provided
            if args.asset_dir:
                print(f"Scanning asset directory: {args.asset_dir}")
                
                # If select_assets or select_sample_packs is enabled, let the coding model choose asset packs and assets
                if args.select_assets or args.select_sample_packs:
                    if args.select_sample_packs:
                        print(f"Using model-selected asset packs mode with max {args.max_sample_packs} packs (using all assets in selected packs)")
                    else:
                        print(f"Using model-selected assets mode with max {args.max_sample_packs} packs and max {args.max_assets} assets")
                    
                    asset_tree, selected_assets, current_seed = select_assets_with_model(
                        system_prompt,
                        assets_model,
                        args.content_description,
                        args.asset_dir,
                        allowed_asset_types=args.allowed_asset_types,
                        max_sample_packs=args.max_sample_packs,
                        max_assets=args.max_assets,
                        max_tries=args.max_tries,
                        max_tokens=args.generation_max_tokens,
                        seed=current_seed,
                        select_sample_packs_only=args.select_sample_packs,
                        enable_audio=args.enable_audio,
                        assets_selection=args.assets_selection,
                        max_assets_per_pack=args.max_assets_per_pack,
                        temperature=args.temp_coding, top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty,
                    )
                else:
                    # Otherwise, use all assets
                    asset_tree = scan_asset_directory(args.asset_dir, allowed_asset_types=args.allowed_asset_types, assets_structure=args.assets_structure)
                
                if asset_tree:
                    print(f"Found assets in directory: {args.asset_dir}")
                    if args.select_assets:
                        print(f"Model selected {len(selected_assets)} assets")
                else:
                    print(f"No assets found in directory: {args.asset_dir}")
            
            # Getting info about assets
            if args.asset_dir and asset_tree:
                asset_info = prompts.get_asset_info(args.asset_dir, asset_tree, args.enable_audio, args.content_type)
                print(asset_info)

            # Generate initial content (with best-of-k if enabled)
            if args.best_of_k > 1 or args.initial_best_of_k > 1:
                if args.initial_best_of_k > 1:
                    best_of_k = args.initial_best_of_k
                else:
                    best_of_k = args.best_of_k
                print(f"Using best-of-{best_of_k} for initial content generation...")
                
                # Generate k initial candidates
                initial_candidates, current_seed = generate_k_initial_content(
                    args, system_prompt, coding_model, args.content_description,
                    args.content_type, asset_info,
                    current_seed, best_of_k
                )
                                
                # Select the best candidate
                best_idx, current_seed = select_best_candidate(
                    args, initial_candidates, current_seed, 0,
                    system_prompt=system_prompt, system_prompt_eval=system_prompt_eval,
                    coding_model=coding_model, evaluator_model=evaluator_model, current_code=None,
                    best_of_k_no_console_error=args.initial_best_of_k_no_console_error
                )
                
                # Use the best candidate
                initial_code, conversation_history = initial_candidates[best_idx]
                print(f"Selected initial candidate {best_idx} as the best")
                print(initial_code)
            else:
                # Standard single generation
                initial_code_raw, conversation_history, current_seed = generate_initial_content(
                    system_prompt,
                    coding_model, 
                    args.content_description,
                    content_type=args.content_type,
                    max_tokens=args.generation_max_tokens,
                    asset_info=asset_info,
                    seed=current_seed,
                    temp_coding=args.temp_coding,
                    top_p=args.top_p, top_k=args.top_k, repetition_penalty=args.repetition_penalty,
                    cdn_allowed=args.cdn_allowed
                )
                print(initial_code_raw)
                initial_code, success_code, _ = extract_html_code(initial_code_raw, current_code=None, use_memory=False)
                            
            # Save initial content
            with open(temp_html_path, "w", encoding="utf-8") as f:
                f.write(initial_code)
            with open(initial_html_path, "w", encoding="utf-8") as f:
                f.write(initial_code)

            current_code = initial_code
            
            start_iteration = 0
            
            # Save initial resume state
            if args.auto_resume:
                save_resume_state(args, 0, current_code, conversation_history, current_seed, 
                                asset_tree, selected_assets, asset_info)
        
        # Record content video and capture console logs
        temp_video_path = os.path.join(args.output_dir, "temp_content_0.mp4")
        temp_audio_path = os.path.splitext(temp_video_path)[0] + ".wav" if args.enable_audio else None
        temp_video_path, console_logs_path, error_free_console_log = record_content(
            temp_html_path,
            temp_video_path,
            args.video_duration,
            args.video_fps,
            enable_audio=args.enable_audio,
            sampling_rate=args.sampling_rate,
            display_num=args.display_num,
            monitor_source=args.monitor_source,
            server_root=args.server_root,
            chromium_path=args.chromium_path,
        )

        # Iterative improvement loop
        for i in range(start_iteration, args.max_iterations):
            print(f"\nIteration {i+1}")
            
            # Variables for video path and evaluator feedback
            evaluator_feedback = None

            # Get feedback using the fixed _generate_single_feedback function
            if args.use_separate_evaluator and args.best_of_k == 1:
                print("Getting feedback from evaluator model...")
                
                # Use the fixed _generate_single_feedback function for single feedback
                evaluator_feedback, current_seed = _generate_single_feedback(
                    0, args, system_prompt_feedback, evaluator_model, 
                    [temp_video_path], [temp_audio_path], args.content_description, 
                    [console_logs_path], current_seed, 
                    current_codes=[current_code], 
                    asset_info=asset_info,
                    system_prompt=system_prompt,
                    coding_model=coding_model
                )
                feedback_path = os.path.join(args.output_dir, f"feedback_iteration_{i+1}.txt")
                with open(feedback_path, "w", encoding="utf-8") as f:
                    f.write(evaluator_feedback)
            elif not args.use_separate_evaluator:
                evaluator_feedback = ""
            
            # Generate improvements (with best-of-k if enabled)
            if args.best_of_k > 1 or args.compare_to_current_code:
                print(f"Using best-of-{args.best_of_k} for improvement generation...")
                
                # Prepare data for k candidates
                current_codes = [current_code] * args.best_of_k
                video_paths = [temp_video_path] * args.best_of_k
                console_logs_paths = [console_logs_path] * args.best_of_k
                conversation_histories = [conversation_history] * args.best_of_k
                
                # Generate k feedback if using separate evaluator
                if args.use_separate_evaluator:
                    # Generate k feedback versions in parallel
                    audio_paths = [temp_audio_path] * args.best_of_k if args.enable_audio else [None] * args.best_of_k
                    feedbacks, current_seed = generate_k_feedback(
                        args, system_prompt_feedback, evaluator_model, video_paths, audio_paths,
                        args.content_description, console_logs_paths, current_seed, args.best_of_k,
                        current_codes=current_codes,
                        asset_info=asset_info,
                        system_prompt=system_prompt,
                        coding_model=coding_model
                    )
                    for j, feedback in enumerate(feedbacks):
                        feedback_path = os.path.join(args.output_dir, f"feedback_iteration_{i+1}_{j}.txt")
                        with open(feedback_path, "w", encoding="utf-8") as f:
                            f.write(feedback)
                else:
                    # Use the same feedback for all candidates
                    feedbacks = [evaluator_feedback] * args.best_of_k
                
                # Generate k improvement candidates
                improvement_candidates, current_seed = generate_k_improvements(
                    args, system_prompt, coding_model, current_codes, video_paths,
                    args.content_description, conversation_histories, feedbacks, console_logs_paths,
                    asset_info, current_seed, i, memories, args.best_of_k
                )
                                
                # Select the best candidate
                best_idx, current_seed = select_best_candidate(
                    args, improvement_candidates, current_seed, i+1,
                    system_prompt=system_prompt, system_prompt_eval=system_prompt_eval,
                    coding_model=coding_model, evaluator_model=evaluator_model, 
                    current_code=current_code if args.compare_to_current_code else None,
                    best_of_k_no_console_error=args.best_of_k_no_console_error
                )
                
                # Handle the result
                if best_idx == -1:
                    # Current code was selected as best, no change needed
                    print("Current code selected as best, no improvement applied")
                    success_code = False  # No change made
                    notes = None
                    improved_code_raw = ""  # No raw code since no change was made
                    # Keep current conversation_history unchanged
                else:
                    # Use the best candidate
                    current_code, conversation_history, notes, improved_code_raw = improvement_candidates[best_idx]
                    success_code = True  # Assume success since we generated candidates
                    print(f"Selected improvement candidate {best_idx} as the best")
            else:
                # Standard single generation
                # Use _generate_single_improvement to ensure technical feedback is properly handled
                improvement_result = _generate_single_improvement(
                    0, args, system_prompt, coding_model, 
                    [current_code], [temp_video_path if not args.without_video_feedback else None],
                    args.content_description, [None], 
                    [evaluator_feedback], [console_logs_path], asset_info, current_seed, i, memories
                )
                
                current_code, conversation_history, notes, improved_code_raw = improvement_result
                success_code = True  # Assume success since we generated a candidate
                current_seed += 1  # Increment seed for next generation
                print(improved_code_raw)
                            
            # Extract and store notes if memory is enabled
            if notes:
                memories = add_memory_entry(memories, i+1, notes, args.memory_len)
                # Save updated memory state
                save_memory_state(args.output_dir, memories)


            # Save improved content to temporary file
            with open(temp_html_path, "w", encoding="utf-8") as f:
                f.write(current_code)

            # Save temp HTML file with iteration number for resume functionality
            with open(os.path.join(args.output_dir, f"temp_content_{i+1}.html"), "w", encoding="utf-8") as f:
                f.write(current_code)

            # Record new video for next iteration's feedback
            temp_video_path = os.path.join(args.output_dir, f"temp_content_{i+1}.mp4")
            temp_audio_path = os.path.splitext(temp_video_path)[0] + ".wav" if args.enable_audio else None
            temp_video_path, console_logs_path, error_free_console_log = record_content(
                temp_html_path,
                temp_video_path,
                args.video_duration,
                args.video_fps,
                enable_audio=args.enable_audio,
                sampling_rate=args.sampling_rate,
                display_num=args.display_num,
                monitor_source=args.monitor_source,
                server_root=args.server_root,
                chromium_path=args.chromium_path,
            )

            # Early exit only if the model asks for it, the console log is error-free, and the code was left unmodified
            if args.early_exit and "EARLY_EXIT: TRUE" in improved_code_raw and error_free_console_log and not success_code:
                print(f"\nModel requested early exit")
                print(f"Stopping after {i+1} iterations")
                break
        
            # stop if error-free console and did at least min_iterations
            if error_free_console_log and i+1 >= args.min_iterations: # there is no console bug and we did enough steps, we stop there
                break

            # Save resume state after each iteration
            if args.auto_resume:
                save_resume_state(args, i+1, current_code, conversation_history, current_seed, 
                                asset_tree, selected_assets, asset_info)

            # Save iteration content
            iteration_html_path = os.path.join(args.output_dir, f"content_iteration_{i+1}.html")
            with open(iteration_html_path, "w", encoding="utf-8") as f:
                f.write(current_code)
        
        ########## Endgame ##########

        # Save final content
        final_html_path = os.path.join(args.output_dir, "final_content.html")
        with open(final_html_path, "w", encoding="utf-8") as f:
            f.write(current_code)
        
        # Record final content video and capture console logs
        final_video_path = os.path.join(args.output_dir, "final_content.mp4")
        final_audio_path = os.path.splitext(final_video_path)[0] + ".wav" if args.enable_audio else None
        final_video_path, final_console_logs_path, error_free_console_log = record_content(
            final_html_path,
            final_video_path,
            args.video_duration,
            args.video_fps,
            enable_audio=args.enable_audio,
            sampling_rate=args.sampling_rate,
            display_num=args.display_num,
            monitor_source=args.monitor_source,
            server_root=args.server_root,
            chromium_path=args.chromium_path,
        )
        
    finally:
        pass

# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
