import os
import random
import json
import re
import time
import cv2
import base64
import numpy as np
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any


class GPTPhysicsAnalyzer:
    """
    Comprehensive GPT Physics Property Analyzer
    Supports single-value prediction and relative comparison for friction, viscosity, and elasticity
    """
    
    def __init__(self, config_path: str = "vjepa_data_dirs.json"):
        """
        Initialize analyzer
        
        Args:
            config_path: path to vjepa data directory configuration file
        """
        # If config_path is relative, make it relative to current script directory
        if not os.path.isabs(config_path):
            script_dir = os.path.dirname(os.path.abspath(__file__))
            config_path = os.path.join(script_dir, config_path)
        
        self.config_path = config_path
        self.load_config()
        
        # Supported physical properties and analysis types
        self.properties = ["friction", "viscosity", "elasticity"]
        self.analysis_types = ["absolute", "relative"]
        
        # Mapping between dataset classes and GT files
        self.dataset_gt_mapping = {
            "WebVidv2Viscosity": "bounce_analysis_results.json",      # v119 viscosity absolute, v120 friction absolute, v124 elasticity absolute
            "WebVidv3ViscosityTest": "bounce_analysis_results.json",  # v110 viscosity relative - actually has GT file!
            "WebVidv3FrictionTest": "bounce_analysis_results.json",   # v121 friction relative - actually has GT file!  
            "WebVidv3ElasticityTest": "gt_retitution1.json"           # v125 elasticity relative
        }
        
        # Dataset classes corresponding to different versions
        self.version_dataset_mapping = {
            "v119_vjepa": "WebVidv2Viscosity",      # viscosity absolute
            "v110_vjepa": "WebVidv3ViscosityTest",  # viscosity relative
            "v120_vjepa": "WebVidv2Viscosity",      # friction absolute  
            "v121_vjepa": "WebVidv3FrictionTest",   # friction relative
            "v124_vjepa": "WebVidv2Viscosity",      # elasticity absolute
            "v125_vjepa": "WebVidv3ElasticityTest"  # elasticity relative
        }
        
        self.version_gt_override = {
            "v124_vjepa": "gt_retitution1.json"
        }
    
    def load_config(self):
        """Load vjepa data directory configuration"""
        with open(self.config_path, 'r') as f:
            self.config = json.load(f)
        print(f"Configuration loaded: {len(self.config)} test versions")
    
    def get_gt_data(self, data_dir: str, dataset_class: str, property_type: str, version: str = None) -> Optional[Dict]:
        """
        Load corresponding GT data based on dataset type
        
        Args:
            data_dir: data directory path
            dataset_class: dataset class name
            property_type: physical property type
            version: version name (for special case handling)
            
        Returns:
            GT data dictionary or None
        """
        # Check if there's a version-specific override
        if version and version in self.version_gt_override:
            gt_file = self.version_gt_override[version]
        else:
            gt_file = self.dataset_gt_mapping.get(dataset_class)
        
        if gt_file:
            gt_path = os.path.join(data_dir, gt_file)
            if os.path.exists(gt_path):
                with open(gt_path, 'r') as f:
                    return json.load(f)
            else:
                print(f"Warning: GT file does not exist - {gt_path}")
                return None
        else:
            # For WebVidv3 series, parse property values from filename
            print(f"Using filename parsing for GT data, dataset class: {dataset_class}")
            return {}
    
    def extract_property_from_filename(self, filename: str, property_type: str, dataset_class: str) -> float:
        """
        Extract physical property value from filename
        
        Args:
            filename: video filename
            property_type: physical property type
            dataset_class: dataset class name
            
        Returns:
            extracted property value
        """
        basename = filename.replace('_rgb.mp4', '')
        parts = basename.split('_')
        
        try:
            # For WebVidv3 series, extract uniformly from position after objFriction
            if dataset_class in ["WebVidv3ViscosityTest", "WebVidv3FrictionTest"]:
                # Find the position of "objFriction"
                if "objFriction" in parts:
                    obj_friction_index = parts.index("objFriction")
                    if obj_friction_index + 1 < len(parts):
                        return float(parts[obj_friction_index + 1])
                
                # If objFriction is not found, try other possible positions
                for i, part in enumerate(parts):
                    # Try to parse possible numeric values
                    try:
                        val = float(part)
                        # Filter out numbers that are obviously not property values (like id, version numbers, etc.)
                        if val > 0 and val not in [0, 1, 2] and '.' in part:
                            return val
                    except ValueError:
                        continue
                        
                return 0.0
            
            # For WebVidv2 series friction tests, also extract from after objFriction
            elif dataset_class == "WebVidv2Viscosity" and property_type == "friction":
                # Find the position of "objFriction"
                if "objFriction" in parts:
                    obj_friction_index = parts.index("objFriction")
                    if obj_friction_index + 1 < len(parts):
                        return float(parts[obj_friction_index + 1])
                        
                return 0.0
            else:
                return 0.0
        except (IndexError, ValueError):
            return 0.0
    
    def sample_icl_examples(self, train_dir: str, property_type: str, analysis_type: str, dataset_class: str, num_examples: int = None, frame_size: Optional[Tuple[int, int]] = None) -> List[Dict]:
        """
        Sample ICL examples from training set
        
        Args:
            train_dir: training set directory path
            property_type: physical property type
            analysis_type: analysis type (absolute/relative)
            dataset_class: dataset class name
            num_examples: number of examples (relative comparison tests recommend using 1-2 examples to avoid API limits)
            frame_size: frame size
            
        Returns:
            ICL example list, each example contains input and output
        """
        if not os.path.exists(train_dir):
            print(f"Warning: training set directory does not exist - {train_dir}")
            return []
        
        # Load training set GT data
        gt_data = self.get_gt_data(train_dir, dataset_class, property_type)
        
        if not gt_data:
            print(f"Warning: unable to load GT data - {train_dir}")
            return []
        
        # Only sample from video files corresponding to keys existing in GT file
        available_videos = []
        for basename in gt_data.keys():
            video_file = basename + '_rgb.mp4'
            video_path = os.path.join(train_dir, video_file)
            if os.path.exists(video_path):
                available_videos.append(video_file)
        
        available_videos.sort()
        print(f"Found {len(available_videos)} available video files from GT data (total GT keys: {len(gt_data)})")
        
        # Set default number of examples
        if num_examples is None:
            if analysis_type == "relative":
                num_examples = 1  # Relative comparison tests default to 1 example
            else:
                num_examples = 1  # Absolute value tests default to 1 example (to avoid API limits)
        
        icl_examples = []
        
        if analysis_type == "absolute":
            # Absolute value tests: sample from available videos
            if len(available_videos) == 0:
                print("Warning: No valid video files found for ICL sampling")
                return []
            
            # Deterministic sampling: uniform distribution selection
            sample_size = min(len(available_videos), num_examples)
            if sample_size == len(available_videos):
                selected_videos = available_videos
            else:
                step = len(available_videos) / sample_size
                indices = [int(i * step) for i in range(sample_size)]
                selected_videos = [available_videos[i] for i in indices]
            
            for video_file in selected_videos:
                try:
                    # Get GT value (should all have valid values since selected from GT data)
                    gt_value = self.get_gt_value(video_file, gt_data, dataset_class, property_type)
                    
                    # Double-check GT value validity
                    if gt_value == 0.0:
                        print(f"Unexpected: Video with GT value 0: {video_file} (GT=0.0)")
                        continue
                    
                    # Read video frames
                    video_path = os.path.join(train_dir, video_file)
                    frames = self.read_video_frames(video_path, target_frames=16, frame_size=frame_size)
                    
                    # Build absolute value test output format
                    if property_type == "friction":
                        output = f"Estimated friction coefficient value: {gt_value:.3f}"
                    elif property_type == "viscosity":
                        output = f"Estimated viscosity value: {gt_value:.1f} cP"
                    elif property_type == "elasticity":
                        output = f"Estimated restitution coefficient value: {gt_value:.3f}"
                    else:
                        output = f"Estimated {property_type} value: {gt_value:.3f}"
                    
                    icl_examples.append({
                        'video_file': video_file,
                        'frames': frames,
                        'gt_value': gt_value,
                        'output': output,
                        'type': 'absolute'
                    })
                    
                    print(f"Successfully sampled ICL example: {video_file} (GT={gt_value:.3f})")
                    
                except Exception as e:
                    print(f"Warning: Absolute value ICL example sampling failed - {video_file}: {str(e)}")
                    continue
            
            if len(icl_examples) < num_examples:
                print(f"Warning: Only found {len(icl_examples)}/{num_examples} valid ICL examples")
        
        else:  # relative comparison
            # Relative comparison test: sample paired videos
            print(f"Starting to sample ICL examples for relative comparison tests...")
            
            # For relative comparison tests, recommend using fewer examples to avoid API limits
            if num_examples > 2:
                print(f"Warning: Relative comparison tests recommend using 1-2 examples to avoid API limits, currently set to {num_examples}")
                print(f"Auto-adjusting to 1 example...")
                num_examples = 1
            
            # First group by video_x to find paired videos
            video_groups = {}
            for video_file in available_videos:
                if 'video_' in video_file and '_id_' in video_file:
                    parts = video_file.split('_')
                    
                    # Look for video_x_id_y pattern
                    for i, part in enumerate(parts):
                        if part == 'video' and i + 2 < len(parts) and parts[i + 2] == 'id':
                            video_num = f"video_{parts[i + 1]}"  # video_x
                            id_part = parts[i + 3]  # id_y
                            
                            if video_num not in video_groups:
                                video_groups[video_num] = []
                            video_groups[video_num].append((video_file, id_part))
                            break
            
            print(f"Found {len(video_groups)} video groups for ICL sampling")
            
            # Select paired videos from video groups as ICL examples
            sampled_pairs = 0
            video_group_list = list(video_groups.items())
            
            # Ensure at least enough groups to generate examples
            if len(video_group_list) < num_examples:
                print(f"Warning: insufficient video groups - {len(video_group_list)} < {num_examples}")
                num_examples = len(video_group_list)
            
            if num_examples == len(video_group_list):
                selected_groups = video_group_list
            else:
                step = len(video_group_list) / num_examples
                indices = [int(i * step) for i in range(num_examples)]
                selected_groups = [video_group_list[i] for i in indices]
            
            for video_num, videos in selected_groups:
                if sampled_pairs >= num_examples:
                    break
                    
                # Sort by id
                videos.sort(key=lambda x: x[1])
                
                # Find id_0 and id_1 as a pair (if not available, select first two)
                if len(videos) >= 2:
                    video_pair = videos[:2]  # Select first two videos
                    video_file1, id1 = video_pair[0]
                    video_file2, id2 = video_pair[1]
                    
                    try:
                        # Get GT values for both videos, check validity first
                        gt_value1 = self.get_gt_value(video_file1, gt_data, dataset_class, property_type)
                        gt_value2 = self.get_gt_value(video_file2, gt_data, dataset_class, property_type)
                        
                        # Skip video pairs with invalid GT values
                        if gt_value1 == 0.0 or gt_value2 == 0.0:
                            print(f"Skipping video pair with invalid GT values: {video_file1} (GT={gt_value1:.3f}) vs {video_file2} (GT={gt_value2:.3f})")
                            continue
                        
                        # Read frames from both videos
                        video_path1 = os.path.join(train_dir, video_file1)
                        video_path2 = os.path.join(train_dir, video_file2)
                        frames1 = self.read_video_frames(video_path1, target_frames=16, frame_size=frame_size)
                        frames2 = self.read_video_frames(video_path2, target_frames=16, frame_size=frame_size)
                        
                        # Calculate comparison result based on real GT values (using high confidence)
                        if gt_value1 > gt_value2:
                            gt_comparison = 1
                            confidence_score = 0.9  # High confidence favoring VIDEO 1
                        elif gt_value2 > gt_value1:
                            gt_comparison = 0
                            confidence_score = 0.1  # High confidence favoring VIDEO 2
                        else:
                            gt_comparison = 0.5
                            confidence_score = 0.5  # Equal case
                        
                        # Build confidence output format for relative comparison
                        output = f"Comparison result: {confidence_score}"
                        
                        icl_examples.append({
                            'video_file1': video_file1,
                            'video_file2': video_file2,
                            'frames1': frames1,
                            'frames2': frames2,
                            'gt_value1': gt_value1,
                            'gt_value2': gt_value2,
                            'gt_comparison': gt_comparison,
                            'output': output,
                            'type': 'relative'
                        })
                        
                        sampled_pairs += 1
                        print(f"Successfully sampled relative comparison ICL example {sampled_pairs}: {video_file1} vs {video_file2} (GT: {gt_value1:.3f} vs {gt_value2:.3f} → {gt_comparison}, confidence: {confidence_score})")
                        
                    except Exception as e:
                        print(f"Warning: relative comparison ICL example sampling failed - {video_file1} vs {video_file2}: {str(e)}")
                        continue
                else:
                    print(f"Warning: group {video_num} has insufficient videos to form pairs")
            
            # If no successful video group pairs, try direct video pairing
            if len(icl_examples) == 0 and len(available_videos) >= 2:
                print("Trying direct pairing from available videos...")
                num_pairs = min(num_examples, len(available_videos) // 2)
                
                # Direct pairing from available videos
                for i in range(num_pairs):
                    if len(icl_examples) >= num_examples:
                        break
                        
                    video_file1 = available_videos[i * 2]
                    video_file2 = available_videos[i * 2 + 1]
                    
                    try:
                        # Get GT values for both videos (should both be valid)
                        gt_value1 = self.get_gt_value(video_file1, gt_data, dataset_class, property_type)
                        gt_value2 = self.get_gt_value(video_file2, gt_data, dataset_class, property_type)
                        
                        # Double check GT value validity
                        if gt_value1 == 0.0 or gt_value2 == 0.0:
                            print(f"Unexpected: video pair with invalid GT values: {video_file1} (GT={gt_value1:.3f}) vs {video_file2} (GT={gt_value2:.3f})")
                            continue
                        
                        # Read frames from both videos
                        video_path1 = os.path.join(train_dir, video_file1)
                        video_path2 = os.path.join(train_dir, video_file2)
                        frames1 = self.read_video_frames(video_path1, target_frames=16, frame_size=frame_size)
                        frames2 = self.read_video_frames(video_path2, target_frames=16, frame_size=frame_size)
                        
                        # Calculate comparison result based on real GT values (using high confidence)
                        if gt_value1 > gt_value2:
                            gt_comparison = 1
                            confidence_score = 0.9  # High confidence favoring VIDEO 1
                        elif gt_value2 > gt_value1:
                            gt_comparison = 0
                            confidence_score = 0.1  # High confidence favoring VIDEO 2
                        else:
                            gt_comparison = 0.5
                            confidence_score = 0.5  # Equal case
                        
                        # Build confidence output format for relative comparison
                        output = f"Comparison result: {confidence_score}"
                        
                        icl_examples.append({
                            'video_file1': video_file1,
                            'video_file2': video_file2,
                            'frames1': frames1,
                            'frames2': frames2,
                            'gt_value1': gt_value1,
                            'gt_value2': gt_value2,
                            'gt_comparison': gt_comparison,
                            'output': output,
                            'type': 'relative'
                        })
                        
                        print(f"Successfully sampled direct pairing ICL example {i+1}: {video_file1} vs {video_file2} (GT: {gt_value1:.3f} vs {gt_value2:.3f} → {gt_comparison}, confidence: {confidence_score})")
                        
                    except Exception as e:
                        print(f"Warning: direct pairing ICL example sampling failed - {video_file1} vs {video_file2}: {str(e)}")
                        continue
        
        print(f"Successfully sampled {len(icl_examples)} ICL examples (type: {analysis_type})")
        return icl_examples

    def sample_videos(self, data_dir: str, max_samples: int = 50) -> List[str]:
        """
        Deterministically sample video files from data directory (uniform sampling after sorting by filename)
        
        Args:
            data_dir: Data directory path
            max_samples: Maximum number of samples
            
        Returns:
            List of sampled video filenames
        """
        if not os.path.exists(data_dir):
            print(f"Warning: data directory does not exist - {data_dir}")
            return []
        
        # Get all rgb.mp4 files and sort by filename
        all_videos = [f for f in os.listdir(data_dir) if f.endswith('_rgb.mp4')]
        all_videos.sort()  # Sort by filename to ensure consistent results across runs
        
        if len(all_videos) == 0:
            print(f"Warning: no rgb.mp4 files found in directory - {data_dir}")
            return []
        
        # Deterministic sampling: uniform distribution selection
        sample_size = min(len(all_videos), max_samples)
        if sample_size == len(all_videos):
            # If not enough samples, return all
            sampled_videos = all_videos
        else:
            # Uniform sampling
            step = len(all_videos) / sample_size
            indices = [int(i * step) for i in range(sample_size)]
            sampled_videos = [all_videos[i] for i in indices]
        
        print(f"Deterministically sampled {sample_size}/{len(all_videos)} videos from {data_dir}")
        return sampled_videos
    
    def read_video_frames(self, video_path: str, target_frames: int = 16, frame_size: Optional[Tuple[int, int]] = None) -> List[str]:
        """
        Read video file and convert frames to base64 encoding, sampling uniformly to reach target frame count

        Args:
            video_path: Video file path
            target_frames: Target number of frames; videos with at most this many
                frames are returned in full
            frame_size: Tuple for resizing frames (width, height); None keeps the
                decoded size

        Returns:
            List of base64 encoded JPEG frames (as UTF-8 strings)

        Raises:
            ValueError: if the video file cannot be opened
        """
        video = cv2.VideoCapture(video_path)
        all_frames = []
        try:
            if not video.isOpened():
                raise ValueError(f"Cannot open video file: {video_path}")

            # Decode and encode every frame; sampling happens afterwards
            while video.isOpened():
                success, frame = video.read()
                if not success:
                    break

                # Resize frame if frame_size is specified
                if frame_size is not None:
                    frame = cv2.resize(frame, frame_size)

                _, buffer = cv2.imencode(".jpg", frame)
                all_frames.append(base64.b64encode(buffer).decode("utf-8"))
        finally:
            # Release the capture even when resizing or encoding raises
            video.release()

        frame_count = len(all_frames)

        if frame_count <= target_frames:
            # If video has insufficient frames, keep all
            selected_frames = all_frames
            print(f"Video {os.path.basename(video_path)}: total frames {frame_count}, keeping all {len(selected_frames)} frames")
        else:
            # Uniform sampling from first to last frame. The divisor is clamped
            # to 1 so target_frames == 1 selects the first frame instead of
            # dividing by zero.
            span = max(target_frames - 1, 1)
            indices = [int(i * (frame_count - 1) / span) for i in range(target_frames)]
            selected_frames = [all_frames[i] for i in indices]
            print(f"Video {os.path.basename(video_path)}: total frames {frame_count}, uniform sampling {len(selected_frames)} frames")

        return selected_frames
    
    def build_icl_content(self, icl_examples: List[Dict], frame_index: bool = False, concat_mode: bool = False) -> List[Dict]:
        """
        Build ICL example content section
        
        Args:
            icl_examples: List of ICL examples
            frame_index: Whether to enable frame indexing
            concat_mode: Whether to use concatenation mode
            
        Returns:
            List of ICL content
        """
        icl_content = []
        
        for i, example in enumerate(icl_examples, 1):
            # Add example title
            icl_content.append({"type": "text", "text": f"Example {i}:"})
            
            if example.get('type') == 'absolute':
                # Absolute value test: add frames from single video
                if frame_index:
                    for j, frame in enumerate(example['frames'], 1):
                        icl_content.append({"type": "text", "text": f"frame{j}:"})
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                else:
                    for frame in example['frames']:
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            
            elif example.get('type') == 'relative':
                # Relative comparison test: add frames from two videos
                if concat_mode:
                    # Concatenation mode: video1 + black frames + video2, no additional labels
                    # Generate black frames (using default size)
                    black_frames = self.generate_black_frames(num_frames=3, frame_size=(640, 480))
                    
                    if frame_index:
                        # Concatenation mode + frame index: add all frames in order
                        frame_count = 1
                        
                        # Add video1 frames
                        for frame in example['frames1']:
                            icl_content.append({"type": "text", "text": f"frame{frame_count}:"})
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                            frame_count += 1
                        
                        # Add black frames (not counted in frame index)
                        for black_frame in black_frames:
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{black_frame}"}})
                        
                        # Add video2 frames
                        for frame in example['frames2']:
                            icl_content.append({"type": "text", "text": f"frame{frame_count}:"})
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                            frame_count += 1
                    else:
                        # Concatenation mode, no frame index: direct concatenation
                        for frame in example['frames1']:
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                        
                        for black_frame in black_frames:
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{black_frame}"}})
                        
                        for frame in example['frames2']:
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                else:
                    # Traditional mode: separately identify two videos
                    icl_content.append({"type": "text", "text": "VIDEO 1 frames:"})
                    
                    if frame_index:
                        for j, frame in enumerate(example['frames1'], 1):
                            icl_content.append({"type": "text", "text": f"frame{j}:"})
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                    else:
                        for frame in example['frames1']:
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                    
                    icl_content.append({"type": "text", "text": "VIDEO 2 frames:"})
                    
                    if frame_index:
                        for j, frame in enumerate(example['frames2'], 1):
                            icl_content.append({"type": "text", "text": f"frame{j}:"})
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                    else:
                        for frame in example['frames2']:
                            icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            
            # Compatible with old format (no type field) - need to determine if absolute or relative comparison test
            if 'frames1' in example and 'frames2' in example:
                # ICL examples for relative comparison test
                icl_content.append({"type": "text", "text": "VIDEO 1 frames:"})
                
                if frame_index:
                    for j, frame in enumerate(example['frames1'], 1):
                        icl_content.append({"type": "text", "text": f"frame{j}:"})
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                else:
                    for frame in example['frames1']:
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                
                icl_content.append({"type": "text", "text": "VIDEO 2 frames:"})
                
                if frame_index:
                    for j, frame in enumerate(example['frames2'], 1):
                        icl_content.append({"type": "text", "text": f"frame{j}:"})
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                else:
                    for frame in example['frames2']:
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            else:
                # ICL examples for absolute value test
                if frame_index:
                    for j, frame in enumerate(example["frames"], 1):
                        icl_content.append({"type": "text", "text": f"frame{j}:"})
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                else:
                    for frame in example['frames']:
                        icl_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            
            # Add output example
            icl_content.append({"type": "text", "text": f"Output: {example['output']}"})
            icl_content.append({"type": "text", "text": ""})  # Empty line separator
        
        # Add current task identifier
        icl_content.append({"type": "text", "text": "Now analyze the following video:"})
        
        return icl_content

    def get_analysis_prompt(self, property_type: str, analysis_type: str, icl_examples: Optional[List[Dict]] = None, follow_oracle_test: bool = False) -> str:
        """
        Build the instruction text sent to the model for one analysis request.

        Args:
            property_type: Physical property type (friction/viscosity/elasticity).
                It selects the prompt in "absolute" mode and is interpolated
                verbatim into the comparison prompts.
            analysis_type: "absolute" for a single-value estimate, "relative" for a
                two-video comparison with labelled frame groups. Any other value
                (callers in this file pass "relative_concat") selects the
                concatenated comparison prompt with black separator frames.
            icl_examples: List of ICL examples. Only its truthiness is used here:
                when non-empty, the task sentence is prefixed with
                "Based on the examples above, ".
            follow_oracle_test: Whether to include problem-solving step guidance
                (the "ANALYSIS METHOD" section for the given property).

        Returns:
            Prompt string. An empty string is returned when analysis_type is
            "absolute" and property_type is not one of the three known properties.
        """
        # Adjust prompt based on whether ICL examples are available
        icl_prefix = ""
        if icl_examples:
            icl_prefix = "Based on the examples above, "
        
        # Define problem-solving step guidance content.
        # Stays empty when guidance is disabled or the property has no entry.
        oracle_guidance = ""
        if follow_oracle_test:
            # NOTE: these literals start at column 0 on purpose; their text is
            # inserted into the prompts below exactly as written.
            oracle_guidance_dict = {
                "elasticity": """
ANALYSIS METHOD:
The intrinsic visual cue to reflect the elasticity property is the dropping height and bouncing height of the ball. You only need to calculate the ratio of the height difference in the camera coordinate. From this trajectory, you need to identify key points: the initial position, first ground contact, and bounce peak.

Follow these steps:
1. Identify the ball's initial dropping position (highest point before impact)
2. Locate the first ground contact point (lowest point during impact)
3. Find the bounce peak (highest point after first bounce)
4. Calculate the height ratio: (bounce height - ground level) / (initial height - ground level)
5. This ratio represents the restitution coefficient (elasticity)

""",
                "viscosity": """
ANALYSIS METHOD:
The intrinsic visual cue to reflect the viscosity property is the speed that the liquid expands on the ground. You can calculate the area size in the camera plane and then normalise the area sizes by the first area size when the liquid touches ground; the growth speed of the normalised area size sequence reflects the viscosity.

Follow these steps:
1. Identify when the liquid first touches the ground (initial contact frame)
2. Measure the liquid area size in each subsequent frame
3. Normalize all area sizes by dividing by the initial contact area
4. Calculate the growth rate of the normalized area sequence over time
5. Higher viscosity = slower area expansion, lower viscosity = faster area expansion

""",
                "friction": """
ANALYSIS METHOD:
The intrinsic visual cue to reflect the friction property is the acceleration of the sliding object, i.e., how fast is the velocity of the object reduced because of friction and can be inferred by the position of the object over time. You can estimate a homography transformation from the 4 corner points of the top surface of the object. You then use the homography transformation to project the object upper plane to bird eye view, and estimate the acceleration a via fitting a parabola ax² + bx + c to the position of the object over time.

Follow these steps:
1. Identify the 4 corner points of the object's top surface in each frame
2. Estimate homography transformation to convert to bird's eye view
3. Track the object's position over time in the transformed coordinate system
4. Fit a parabolic function (ax² + bx + c) to the position-time data
5. Extract the acceleration coefficient 'a' which reflects the friction level
6. Higher friction = larger deceleration (more negative 'a'), lower friction = smaller deceleration

"""
            }
            oracle_guidance = oracle_guidance_dict.get(property_type, "")
        
        # Single-value prompts: one dedicated template per property, each ending
        # with the exact "Estimated ... value: X" line the response must contain.
        if analysis_type == "absolute":
            if property_type == "friction":
                return f"""
                You are analyzing frames from a video showing object motion with friction. Your task is to estimate the coefficient of friction.
                
                IMPORTANT: You CAN analyze these images. You MUST provide a specific friction coefficient estimate.
                
                {oracle_guidance}The coefficient of friction is typically between 0 and 1:
                - Very smooth surfaces (ice on ice): 0.02-0.03
                - Steel on steel: 0.4-0.6  
                - Rubber on concrete: 0.6-0.8
                - Wood on wood: 0.3-0.5
                
                {icl_prefix}analyze the object's deceleration, sliding behavior, and surface interaction.
                
                YOUR RESPONSE MUST END WITH A SPECIFIC FRICTION COEFFICIENT VALUE IN THIS FORMAT:
                "Estimated friction coefficient value: X"
                
                Do not say you cannot analyze the images. You must provide your best estimate based on the visual information.
                """
                
            elif property_type == "viscosity":
                return f"""
                You are analyzing frames from a video showing liquid motion. Your task is to estimate the viscosity of this liquid.
                
                IMPORTANT: You CAN analyze these images. You MUST provide a specific viscosity estimate.
                
                {oracle_guidance}For reference:
                - Water: 1 cP
                - Vegetable oil: 100 cP
                - Motor oil: 300 cP
                - Honey: 5000 cP
                
                {icl_prefix}analyze the liquid's flow speed, surface waves, and interaction with objects.
                
                YOUR RESPONSE MUST END WITH A SPECIFIC VISCOSITY VALUE IN THIS FORMAT:
                "Estimated viscosity value: X cP"
                
                Do not say you cannot analyze the images. You must provide your best estimate based on the visual information.
                """
                
            elif property_type == "elasticity":
                return f"""
                You are analyzing frames from a video showing a bouncing object. Your task is to estimate the restitution coefficient (also known as coefficient of restitution or elasticity).
                
                IMPORTANT: You CAN analyze these images. You MUST provide a specific restitution coefficient estimate.
                
                {oracle_guidance}The restitution coefficient is the ratio of separation speed to approach speed during collision:
                - Perfectly elastic collision: 1.0 (ball bounces back with same speed)
                - Perfectly inelastic collision: 0.0 (ball sticks to surface, no bounce)
                - Typical values: 0.1-0.9 for most materials
                
                {icl_prefix}analyze the object's behavior:
                1. Look for the object falling towards a surface
                2. Observe the impact moment
                3. Check if and how the object bounces back
                4. Compare the approach speed vs. separation speed
                
                If the object bounces back significantly → restitution coefficient closer to 1.0
                If the object barely bounces or sticks → restitution coefficient closer to 0.0
                
                YOUR RESPONSE MUST END WITH A SPECIFIC RESTITUTION COEFFICIENT VALUE IN THIS FORMAT:
                "Estimated restitution coefficient value: X"
                
                Where X is a number between 0 and 1 (e.g., 0.7, 0.3, 0.85).
                
                Do not say you cannot analyze the images. You must provide your best estimate based on the visual information.
                """
        
        # Two-video comparison with explicit "VIDEO 1" / "VIDEO 2" frame groups.
        # The template is generic: property_type is only interpolated, not validated.
        elif analysis_type == "relative":  # relative comparison (original mode)
            return f"""
            You are comparing two {property_type} videos to determine which has a higher {property_type} value.
            
            IMPORTANT: You CAN analyze these images. You MUST provide a confidence score between 0 and 1.
            
            {oracle_guidance}The first set of frames is labeled "VIDEO 1" and the second set is labeled "VIDEO 2".
            
            {icl_prefix}compare the {property_type} characteristics between the two videos:
            - VIDEO 1 frames are shown first
            - VIDEO 2 frames are shown second
            
            YOUR RESPONSE MUST END WITH A SPECIFIC CONFIDENCE SCORE IN THIS FORMAT:
            "Comparison result: X"
            
            Where X is a decimal number between 0.0 and 1.0:
            - X = 1.0: Very confident that VIDEO 1 has higher {property_type}
            - X = 0.8: Quite confident that VIDEO 1 has higher {property_type}
            - X = 0.6: Somewhat confident that VIDEO 1 has higher {property_type}
            - X = 0.5: Equal or uncertain (no clear difference)
            - X = 0.4: Somewhat confident that VIDEO 2 has higher {property_type}
            - X = 0.2: Quite confident that VIDEO 2 has higher {property_type}
            - X = 0.0: Very confident that VIDEO 2 has higher {property_type}
            
            The closer to 1.0, the more confident you are that VIDEO 1 has higher {property_type}.
            The closer to 0.0, the more confident you are that VIDEO 2 has higher {property_type}.
            
            Do not say you cannot analyze the images. You must provide your best confidence score based on the visual information.
            """
        
        # NOTE(review): this is a catch-all, not an explicit "relative_concat" check,
        # so a misspelled analysis_type also ends up with the concatenated prompt.
        else:  # relative_concat - concatenated comparison mode
            return f"""
            You are comparing two {property_type} videos to determine which has a higher {property_type} value.
            
            IMPORTANT: You CAN analyze these images. You MUST provide a confidence score between 0 and 1.
            
            {oracle_guidance}The frames are arranged in sequence: VIDEO 1 frames first, followed by 3 BLACK SEPARATOR frames, then VIDEO 2 frames.
            
            {icl_prefix}compare the {property_type} characteristics between the two videos:
            - The frames BEFORE the black separator frames belong to VIDEO 1
            - The frames AFTER the black separator frames belong to VIDEO 2
            - The black frames are just separators and should be ignored for analysis
            
            Analyze the motion, physics, and {property_type} characteristics in both video segments.
            
            YOUR RESPONSE MUST END WITH A SPECIFIC CONFIDENCE SCORE IN THIS FORMAT:
            "Comparison result: X"
            
            Where X is a decimal number between 0.0 and 1.0:
            - X = 1.0: Very confident that VIDEO 1 has higher {property_type}
            - X = 0.8: Quite confident that VIDEO 1 has higher {property_type}
            - X = 0.6: Somewhat confident that VIDEO 1 has higher {property_type}
            - X = 0.5: Equal or uncertain (no clear difference)
            - X = 0.4: Somewhat confident that VIDEO 2 has higher {property_type}
            - X = 0.2: Quite confident that VIDEO 2 has higher {property_type}
            - X = 0.0: Very confident that VIDEO 2 has higher {property_type}
            
            The closer to 1.0, the more confident you are that VIDEO 1 has higher {property_type}.
            The closer to 0.0, the more confident you are that VIDEO 2 has higher {property_type}.
            
            Do not say you cannot analyze the images. You must provide your best confidence score based on the visual information.
            """
        
        # Only reached for analysis_type == "absolute" with an unrecognized property_type.
        return ""
    
    def analyze_single_video_with_gpt(self, video_path: str, property_type: str, max_frames: int = 16, frame_size: Optional[Tuple[int, int]] = None, model_version: str = "gpt-4o", frame_index: bool = False, icl_examples: Optional[List[Dict]] = None, follow_oracle_test: bool = False, max_retries: int = 5, retry_delay: int = 15, validation_retries: int = 3) -> str:
        """
        Use GPT to analyze physical properties of a single video (enhanced retry mechanism)
        
        Args:
            video_path: Video file path
            property_type: Physical property type
            max_frames: Maximum number of frames
            frame_size: Adjust frame size
            model_version: GPT model version
            frame_index: Whether to add an index identifier to each frame...
            
        Returns:
            GPT analysis result
        """
        # Read video frames, directly specify target frame count
        selected_frames = self.read_video_frames(video_path, target_frames=max_frames, frame_size=frame_size)
        
        # Get prompt
        prompt_text = self.get_analysis_prompt(property_type, "absolute", icl_examples, follow_oracle_test)
        
        # Build message content
        content = [{"type": "text", "text": prompt_text}]
        
        # Add ICL examples (if available)
        if icl_examples:
            icl_content = self.build_icl_content(icl_examples, frame_index, concat_mode=False)  # Absolute value test does not use concat_mode
            content.extend(icl_content)
        
        if frame_index:
            # Add frame index identifier
            for i, frame in enumerate(selected_frames, 1):
                content.append({"type": "text", "text": f"frame{i}:"})
                content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
        else:
            # Original method, directly add frames
            for frame in selected_frames:
                content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
        
        prompt_messages = [
            {
                "role": "system",
                "content": f"You are an expert in {property_type} analysis with the ability to analyze video frames and determine {property_type} values. You MUST provide a specific {property_type} estimate. Never say you cannot analyze the images."
            },
            {
                "role": "user",
                "content": content,
            },
        ]
        
        # Enhanced retry mechanism: first try API call retry, then try result validation retry
        best_result = None
        best_predicted_value = None
        for validation_attempt in range(validation_retries):
            print(f"🔄 Validation retry {validation_attempt + 1}/{validation_retries}")
            
            # API call retry loop
            for api_attempt in range(max_retries):
                try:
                    print(f" 📡 API call {api_attempt + 1}/{max_retries}")
                    
                    # Call third-party GPT API (internal retry mechanism already exists)
                    result = self.call_third_party_gpt_api(prompt_messages, model_version)
                    
                    # Check if result is empty or error message
                    if not result or len(result.strip()) == 0:
                        print(f" ⚠️ API returned empty result")
                        if api_attempt < max_retries - 1:
                            print(f" ⏳ Waiting {retry_delay} seconds before retrying API call...")
                            time.sleep(retry_delay)
                            continue
                        else:
                            break  # Break out of API retry loop, enter next round of validation retry
                    
                    # Check if it is API error message
                    error_keywords = [
                        "API call failed",
                        "call failed",
                        "Server is busy",
                        "Please try again later",
                        "All retries failed",
                        "GPT API call error",
                        "GPT Vision API call failed",
                        "chatanywhere_error",
                        "INTERNAL_SERVER_ERROR",
                        "Third-party API call failed"
                    ]
                    is_error_response = any(keyword in result for keyword in error_keywords)
                    if is_error_response:
                        print(f" ❌ Detected API error response")
                        if api_attempt < max_retries - 1:
                            print(f" ⏳ Waiting {retry_delay} seconds before retrying API call...")
                            time.sleep(retry_delay)
                            continue
                        else:
                            break  # Break out of API retry loop, enter next round of validation retry
                    
                    # Try to extract predicted value for validation
                    predicted_value = self.extract_predicted_value(result, property_type)
                    
                    if predicted_value is not None:
                        # Successfully extracted valid predicted value
                        print(f" ✅ Successfully obtained valid predicted value: {predicted_value}")
                        return result
                    else:
                        # Cannot extract predicted value, but keep result as backup
                        print(f" ⚠️ Unable to extract predicted value from response")
                        if best_result is None or len(result) > len(best_result):
                            best_result = result
                            
                        # If there are still API retry attempts, continue retrying
                        if api_attempt < max_retries - 1:
                            print(f" 🔄 Predicted value extraction failed, retrying API call...")
                            time.sleep(retry_delay)
                            continue
                        else:
                            # API retry attempts exhausted, break out of loop and enter next round of validation retry
                            break
                            
                except Exception as e:
                    print(f" ❌ API call exception: {e}")
                    if api_attempt < max_retries - 1:
                        print(f" ⏳ Waiting {retry_delay} seconds before retrying API call...")
                        time.sleep(retry_delay)
                        continue
                    else:
                        break
            
            # If this round of validation retry was not successful, wait and enter next round
            if validation_attempt < validation_retries - 1:
                wait_time = retry_delay * (validation_attempt + 1)
                print(f" ⏳ Validation retry failed, waiting {wait_time} seconds before next validation retry...")
                time.sleep(wait_time)
        
        # All retries failed, return the best result (if any)
        if best_result:
            print(f" ⚠️ All retries failed to extract valid predicted value, returning best response")
            return best_result
        else:
            print(f" ❌ All retries failed, returning empty string")
            return ""
    
    def generate_black_frames(self, num_frames: int = 3, frame_size: Optional[Tuple[int, int]] = (640, 480)) -> List[str]:
        """
        Generate a specified number of black frames (base64 encoded)

        Args:
            num_frames: Number of black frames to generate; zero or a negative
                number yields an empty list
            frame_size: Frame size (width, height). ``None`` is accepted and
                treated as the default (640, 480), so callers holding an
                optional frame size can forward it unchanged.

        Returns:
            List of base64 encoded black frames (JPEG), all identical
        """
        if num_frames <= 0:
            return []

        if frame_size is None:
            frame_size = (640, 480)

        # Image arrays are indexed (rows, columns), i.e. (height, width)
        black_image = np.zeros((frame_size[1], frame_size[0], 3), dtype=np.uint8)

        # Every frame is the same image, so encode it once and repeat the result
        _, buffer = cv2.imencode(".jpg", black_image)
        black_frame_b64 = base64.b64encode(buffer).decode("utf-8")

        return [black_frame_b64] * num_frames
    
    def compare_two_videos_with_gpt(self, video_path1: str, video_path2: str, property_type: str, max_frames_per_video: int = 16, frame_size: Optional[Tuple[int, int]] = None, model_version: str = "gpt-4o", concat_mode: bool = False, frame_index: bool = False, icl_examples: Optional[List[Dict]] = None, follow_oracle_test: bool = False, max_retries: int = 5, retry_delay: int = 15, validation_retries: int = 3) -> str:
        """
        Use GPT to compare physical properties of two videos (enhanced retry mechanism)
        
        Args:
            video_path1: path to video file 1
            video_path2: path to video file 2
            property_type: Physical property type
            max_frames_per_video: Maximum number of frames per video
            frame_size: Adjust frame size
            model_version: GPT model version
            concat_mode: Whether to use concatenation mode
            frame_index: Whether to add frame index identifier
            icl_examples: List of ICL examples
            follow_oracle_test: Whether to include problem-solving step guidance
            max_retries: Maximum number of API call retries
            retry_delay: Retry interval (seconds)
            validation_retries: Result validation retry count (re-call API if valid value cannot be extracted)
            
        Returns:
            GPT analysis result
        """
        # Read video frames for both videos
        frames1 = self.read_video_frames(video_path1, target_frames=max_frames_per_video, frame_size=frame_size)
        frames2 = self.read_video_frames(video_path2, target_frames=max_frames_per_video, frame_size=frame_size)
        
        # Get prompt based on concat mode
        if concat_mode:
            prompt_text = self.get_analysis_prompt(property_type, "relative_concat", icl_examples, follow_oracle_test)
        else:
            prompt_text = self.get_analysis_prompt(property_type, "relative", icl_examples, follow_oracle_test)
            
        # Build message content
        content = [{"type": "text", "text": prompt_text}]
        
        # Add ICL examples (if available)
        if icl_examples:
            icl_content = self.build_icl_content(icl_examples, frame_index, concat_mode)
            content.extend(icl_content)
        
        # Add frames of the videos to be analyzed
        if concat_mode:
            # Concatenation mode: video1 + black frames + video2
            black_frames = self.generate_black_frames(num_frames=3, frame_size=frame_size)
            
            if frame_index:
                # Add video1 frames with index
                frame_count = 1
                for frame in frames1:
                    content.append({"type": "text", "text": f"frame{frame_count}:"})
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                    frame_count += 1
                
                # Add black frames
                for black_frame in black_frames:
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{black_frame}"}})
                
                # Add video2 frames with index
                for frame in frames2:
                    content.append({"type": "text", "text": f"frame{frame_count}:"})
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                    frame_count += 1
            else:
                # Concatenation mode without index
                for frame in frames1:
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
                for black_frame in black_frames:
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{black_frame}"}})
                for frame in frames2:
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
        
        else:
            # Traditional mode: separately identify two videos
            content.append({"type": "text", "text": "VIDEO 1 frames:"})
            if frame_index:
                for i, frame in enumerate(frames1, 1):
                    content.append({"type": "text", "text": f"frame{i}:"})
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            else:
                for frame in frames1:
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            
            content.append({"type": "text", "text": "VIDEO 2 frames:"})
            if frame_index:
                for i, frame in enumerate(frames2, 1):
                    content.append({"type": "text", "text": f"frame{i}:"})
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
            else:
                for frame in frames2:
                    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}})
        
        prompt_messages = [
            {
                "role": "system",
                "content": f"You are an expert in {property_type} analysis with the ability to analyze video frames and compare {property_type} values. You MUST provide a specific confidence score. Never say you cannot analyze the images."
            },
            {
                "role": "user",
                "content": content,
            },
        ]
        
        # Enhanced retry mechanism
        best_result = None
        for validation_attempt in range(validation_retries):
            print(f"🔄 Validation retry {validation_attempt + 1}/{validation_retries}")
            
            for api_attempt in range(max_retries):
                try:
                    print(f" 📡 API call {api_attempt + 1}/{max_retries}")
                    result = self.call_third_party_gpt_api(prompt_messages, model_version)
                    
                    if not result or len(result.strip()) == 0:
                        print(f" ⚠️ API returned empty result")
                        if api_attempt < max_retries - 1:
                            print(f" ⏳ Waiting {retry_delay} seconds before retrying API call...")
                            time.sleep(retry_delay)
                            continue
                        else:
                            break
                    
                    error_keywords = [
                        "API call failed",
                        "call failed",
                        "Server is busy",
                        "Please try again later",
                        "All retries failed",
                        "GPT API call error",
                        "GPT Vision API call failed",
                        "chatanywhere_error",
                        "INTERNAL_SERVER_ERROR",
                        "Third-party API call failed"
                    ]
                    is_error_response = any(keyword in result for keyword in error_keywords)
                    if is_error_response:
                        print(f" ❌ Detected API error response")
                        if api_attempt < max_retries - 1:
                            print(f" ⏳ Waiting {retry_delay} seconds before retrying API call...")
                            time.sleep(retry_delay)
                            continue
                        else:
                            break
                            
                    predicted_value = self.extract_predicted_confidence(result)
                    if predicted_value is not None:
                        print(f" ✅ Successfully obtained valid confidence score: {predicted_value}")
                        return result
                    else:
                        print(f" ⚠️ Unable to extract confidence score from response")
                        if best_result is None or len(result) > len(best_result):
                            best_result = result
                        if api_attempt < max_retries - 1:
                            print(f" 🔄 Confidence score extraction failed, retrying API call...")
                            time.sleep(retry_delay)
                            continue
                        else:
                            break
                            
                except Exception as e:
                    print(f" ❌ API call exception: {e}")
                    if api_attempt < max_retries - 1:
                        print(f" ⏳ Waiting {retry_delay} seconds before retrying API call...")
                        time.sleep(retry_delay)
                        continue
                    else:
                        break
            
            if validation_attempt < validation_retries - 1:
                wait_time = retry_delay * (validation_attempt + 1)
                print(f" ⏳ Validation retry failed, waiting {wait_time} seconds before next validation retry...")
                time.sleep(wait_time)
        
        if best_result:
            print(f" ⚠️ All retries failed to extract a valid confidence score, returning best response")
            return best_result
        else:
            print(f" ❌ All retries failed, returning empty string")
            return ""

    def call_third_party_gpt_api(self, messages: List[Dict], model_version: str) -> str:
        """
        Stand-in for the real third-party GPT request.

        Neither argument is used: the method logs one line and hands back a
        fixed, comparison-style reply. A real implementation would send
        ``messages`` to the external service here.
        """
        # Canned reply that simulates a successful response
        canned_reply = "Comparison result: 0.8"
        print("Mock: Calling third-party GPT API...")
        return canned_reply

    def extract_predicted_value(self, response: str, property_type: str) -> Optional[float]:
        """
        Extract the predicted value from the GPT response.

        Looks for the first occurrence of the answer line requested by the
        absolute-value prompts ("Estimated ... value: X", with a trailing
        "cP" for viscosity).

        Args:
            response: Raw response text
            property_type: Physical property type (friction/viscosity/elasticity)

        Returns:
            The number as a float, or None when the response is empty, the
            property type is unknown, or no answer line is found.
        """
        # Accept "0.45", whole numbers such as "100" and a bare ".5". Requiring
        # a decimal point would reject integer answers like "100 cP".
        number = r"(\d+(?:\.\d+)?|\.\d+)"
        patterns = {
            "friction": rf"Estimated friction coefficient value:\s*{number}",
            "viscosity": rf"Estimated viscosity value:\s*{number}\s*cP",
            "elasticity": rf"Estimated restitution coefficient value:\s*{number}",
        }

        pattern = patterns.get(property_type)
        if pattern is None or not response:
            return None

        match = re.search(pattern, response)
        return float(match.group(1)) if match else None
        
    def extract_predicted_confidence(self, response: str) -> Optional[float]:
        """
        Extract the predicted confidence score from the GPT response.

        Looks for the first "Comparison result: X" line requested by the
        comparison prompts.

        Args:
            response: Raw response text

        Returns:
            The score as a float, or None when the response is empty or has no
            such line. The value is not checked against the 0-1 range.
        """
        if not response:
            return None

        # Accept "0.8", whole numbers such as "1" or "0" and a bare ".5".
        # Requiring a decimal point would reject the fully confident answers.
        match = re.search(r"Comparison result:\s*(\d+(?:\.\d+)?|\.\d+)", response)
        return float(match.group(1)) if match else None