import os.path as osp
import json
import pickle
from encoder import encode_sentences
from utils import compute_cosine_similarity, top_k_indices
import os
import cv2
#from database import DataBase
from moviepy.editor import VideoFileClip
import socket
import sys
from io import StringIO
import torch
from eva_clip_extractor import EVACLIPExtractor
import requests
import numpy as np
import base64
import gc
from captioning import Captioning

from ultralytics import YOLO, FastSAM, SAM, RTDETR, NAS

import tempfile

from segment_feature import SegmentFeature

import re
from sklearn.cluster import KMeans

sys.path.append('/root/autodl-tmp/VideoAgent-main/LLaVA')
from LLaVA.llava.model.builder import load_pretrained_model
from LLaVA.llava.mm_utils import get_model_name_from_path
from LLaVA.llava.eval.run_llava import eval_model
import PIL.Image as Image
import io
import cv2

# Registry of feature-extractor configurations, keyed by a short model id.
# Each entry records the model's display name and its checkpoint directory.
model_cfgs = {
    'eva-clip-8b': dict(
        model_name='EVA-CLIP-8B',
        model_path='tool_models/EVA-CLIP-8B',
    ),
}


class ToolKit:
    """Video-analysis tools bound to a single video file.

    The public tools all return plain-text reports:

    * ``time_focused_analysis`` - caption k-means-selected key frames of a
      time span.
    * ``key_frame_selection`` - caption the frames most similar to a text
      query.
    * ``frame_analysis`` - answer questions about individual frames with
      LLaVA.

    Heavy models live in the class-level ``_shared_models`` cache, which is
    populated once by ``preload_models()`` and can be passed to every
    instance.
    """

    # Process-wide model cache shared by all instances.
    _shared_models = {
        'eva_clip': None,
        'tracking_model': None,
        'captioning': None,
        'llava_model': None,        # LLaVA model
        'llava_tokenizer': None,    # LLaVA tokenizer
        'llava_processor': None     # LLaVA image processor
    }
    _initialized = False  # ensures the models are loaded only once

    # Checkpoint locations used by preload_models() and the LLaVA helper.
    _LLAVA_MODEL_PATH = "/root/autodl-tmp/VideoAgent-main/tool_models/llava-v1.6-vicuna-7b"
    _TRACKING_WEIGHTS_PATH = 'tool_models/tracking/rtdetr-l.pt'

    _SUPPORTED_CAPTION_MODELS = ('lavila', 'llava-next')

    @classmethod
    def preload_models(cls):
        """Load every shared model once and return the shared model dict.

        Subsequent calls return the already-populated cache without
        reloading anything.
        """
        if not cls._initialized:
            # EVA-CLIP feature extractor.
            eva_clip = EVACLIPExtractor()

            # LLaVA model, tokenizer and image processor.
            llava_tokenizer, llava_model, llava_processor, llava_context_len = load_pretrained_model(
                model_path=cls._LLAVA_MODEL_PATH,
                model_base=None,
                model_name=get_model_name_from_path(cls._LLAVA_MODEL_PATH)
            )

            cls._shared_models.update({
                'eva_clip': eva_clip,
                'tracking_model': RTDETR(cls._TRACKING_WEIGHTS_PATH),
                'captioning': Captioning(),
                'llava_tokenizer': llava_tokenizer,
                'llava_model': llava_model,
                'llava_processor': llava_processor,
                'llava_context_len': llava_context_len
            })
            cls._initialized = True
        return cls._shared_models

    def __init__(self, video_path, base_dir='preprocess', vqa_tool='videollava', use_reid=True, openai_api_key='your_openai_api_key', shared_models=None, caption_model='lavila'):
        """Bind the toolkit to ``video_path`` and pick up the shared models.

        Args:
            video_path: Path of the video file to analyse.
            base_dir: Root directory holding per-video preprocessing output.
            vqa_tool: Either ``"videollava"`` or ``"gpt-4v"``.
            use_reid: Accepted for interface compatibility; not used here.
            openai_api_key: Stored on the instance as ``openai_api_key``.
            shared_models: Dict as returned by ``preload_models()``. When
                omitted, ``preload_models()`` is called to obtain it.
            caption_model: ``'lavila'`` (default) or ``'llava-next'``.

        Raises:
            AssertionError: If ``vqa_tool`` is not a supported value.
        """
        # Kept as an assertion so callers see the same exception type as before.
        assert vqa_tool in ["videollava", "gpt-4v"]

        if shared_models is None:
            # The default used to crash on subscripting None; use the cache instead.
            shared_models = type(self).preload_models()

        self.video_path = video_path
        self.base_dir = base_dir
        self.vqa_tool = vqa_tool
        self.openai_api_key = openai_api_key
        self.caption_model = caption_model

        self.eva_clip = shared_models['eva_clip']
        # .get(): tolerate caller-built dicts that carry no tracking model.
        self.tracking_model = shared_models.get('tracking_model')

        if caption_model == 'lavila':
            # Load LaViLa lazily and store it back so other instances reuse it.
            if shared_models.get('captioning') is None:
                shared_models['captioning'] = Captioning()
            self.captioning = shared_models['captioning']
        else:
            # Other caption models (e.g. llava-next) do not need LaViLa.
            self.captioning = None

        self.llava_tokenizer = shared_models['llava_tokenizer']
        self.llava_model = shared_models['llava_model']
        self.llava_processor = shared_models['llava_processor']
        self.llava_context_len = shared_models.get('llava_context_len', 2048)

        base_name = os.path.basename(video_path).replace(".mp4", "")
        self.video_dir = os.path.join(base_dir, base_name)

        cap = cv2.VideoCapture(video_path)
        try:
            self.fps = round(cap.get(cv2.CAP_PROP_FPS))
            self.total_frames = round(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            cap.release()

        if self.fps > 0:
            self.max_seconds = self.total_frames / self.fps
        else:
            # An unreadable video reports 0 fps; avoid a ZeroDivisionError here.
            print(f"Warning: could not determine FPS for {video_path}; treating duration as 0s")
            self.max_seconds = 0.0

        if self.captioning is not None:
            self.seconds_per_caption = self.captioning.seconds_per_caption
        else:
            # Default caption stride when LaViLa is not in use.
            self.seconds_per_caption = 2.0

        self.segment_feature = SegmentFeature(video_path=video_path,
                                              base_dir=base_dir)
        # Seconds covered by one entry of the visual embedding.
        self.seconds_per_feat = 1.0 / self.segment_feature.fps_sampling

        self.frames_per_object_tracking_segment = self.seconds_per_caption * self.fps

    def _merge_continuous_spans(self, indices):
        """Collapse integer indices into sorted inclusive ``(start, end)`` runs.

        Consecutive values (differing by exactly 1) are merged into one span.
        Returns an empty list for empty input.
        """
        if not indices:
            return []

        ordered = sorted(indices)
        spans = []
        span_start = span_end = ordered[0]
        for idx in ordered[1:]:
            if idx == span_end + 1:
                span_end = idx
                continue
            spans.append((span_start, span_end))
            span_start = span_end = idx
        spans.append((span_start, span_end))
        return spans

    # ------------------------------------------------------------------
    # Private helpers shared by the public tools
    # ------------------------------------------------------------------

    def _unsupported_caption_model_message(self):
        """Return the error text for an unknown ``caption_model``."""
        return f"Unsupported caption model: {self.caption_model}. Supported models are 'lavila' and 'llava-next'."

    def _load_visual_embedding(self):
        """Return the cached visual embedding, creating the cache if missing."""
        visual_embedding_path = osp.join(self.video_dir, 'visual_embedding.pkl')
        if not osp.exists(visual_embedding_path):
            self.segment_feature.create_visual_embedding(eva_clip_model=self.eva_clip)

        # NOTE(security): pickle.load executes arbitrary code from the file.
        # This is presumably a locally generated cache; never point
        # video_dir at untrusted data.
        with open(visual_embedding_path, 'rb') as f:
            return pickle.load(f)

    def _feature_indices_in_range(self, num_features, start_time, end_time):
        """Return embedding indices whose timestamp is in [start_time, end_time]."""
        time_points = np.arange(num_features) * self.seconds_per_feat
        return np.where((time_points >= start_time) & (time_points <= end_time))[0]

    def _read_frame(self, cap, frame_idx):
        """Read frame ``frame_idx`` from ``cap``.

        If the read fails for an index at or past the end of the video, the
        last frame is tried instead.

        Returns:
            ``(frame, used_last_frame)``; ``frame`` is ``None`` on failure.
        """
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        success, frame = cap.read()
        used_last_frame = False
        if not success and frame_idx >= self.total_frames:
            cap.set(cv2.CAP_PROP_POS_FRAMES, self.total_frames - 1)
            success, frame = cap.read()
            used_last_frame = success
        return (frame if success else None), used_last_frame

    def _ask_llava_about_frame(self, frame, prompt, max_new_tokens, error_prefix):
        """Run LLaVA on one BGR frame and return the text it printed.

        The frame is written to a uniquely named temporary JPEG because
        ``eval_model`` is given an image file path. ``eval_model`` output is
        collected by capturing stdout. Inference errors are returned as
        ``"<error_prefix>: <message>"`` rather than raised.
        """
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # mkstemp avoids name clashes between concurrent runs.
        fd, temp_image_path = tempfile.mkstemp(prefix="frame_", suffix=".jpg")
        os.close(fd)
        try:
            Image.fromarray(frame_rgb).save(temp_image_path)

            model_path = self._LLAVA_MODEL_PATH + "/"
            args = type('Args', (), {
                "model_path": model_path,
                "model_base": None,
                "model_name": get_model_name_from_path(model_path),
                "query": prompt,
                "conv_mode": None,
                "image_file": temp_image_path,
                "sep": ",",
                "temperature": 0,
                "top_p": None,
                "num_beams": 1,
                "max_new_tokens": max_new_tokens
            })()

            old_stdout = sys.stdout
            sys.stdout = captured = StringIO()
            try:
                # Pass the preloaded model so nothing is reloaded per frame.
                eval_model(
                    args,
                    tokenizer=self.llava_tokenizer,
                    model=self.llava_model,
                    image_processor=self.llava_processor,
                    context_len=self.llava_context_len
                )
                return captured.getvalue().strip()
            except Exception as e:
                return f"{error_prefix}: {str(e)}"
            finally:
                sys.stdout = old_stdout
        finally:
            try:
                os.remove(temp_image_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file is harmless.
                pass

    def _lavila_captions(self, feature_indices):
        """Caption each embedding index with LaViLa; returns ``(time, caption)`` pairs."""
        captions = []
        for idx in feature_indices:
            segment_start_time = idx * self.seconds_per_feat
            caption = self.captioning.generate_caption_for_single_segment(
                self.video_path, segment_start_time)
            captions.append((segment_start_time, caption))
        return captions

    def _llava_captions(self, feature_indices, prompt):
        """Describe the frame at each embedding index with LLaVA.

        Returns:
            A list of ``(time_point, description)`` pairs in input order, or
            ``None`` if the video file cannot be opened.
        """
        cap = cv2.VideoCapture(self.video_path)
        try:
            if not cap.isOpened():
                return None

            results = []
            for idx in feature_indices:
                time_point = idx * self.seconds_per_feat
                frame, _ = self._read_frame(cap, int(time_point * self.fps))
                if frame is None:
                    results.append((time_point, "Unable to read frame"))
                    continue
                description = self._ask_llava_about_frame(
                    frame, prompt, 256, "Error generating description")
                results.append((time_point, description))
            return results
        finally:
            cap.release()

    @staticmethod
    def _is_valid_time_point(value):
        """Return True for int/float values that are not NaN."""
        return isinstance(value, (int, float)) and value == value

    @staticmethod
    def _frame_question_prompt(time_point, question):
        """Build the LLaVA prompt used by ``frame_analysis``."""
        return f"""Analyze the scene at {time_point} seconds in the video and answer the following question:

            {question}

            Focus on:
            1. Visual attributes (colors, shapes, sizes, textures)
            2. Spatial relationships between objects
            3. Scene composition and layout
            4. Object identification and properties

            IMPORTANT: Provide a brief and concise answer using only a few words or short phrases."""

    # ------------------------------------------------------------------
    # Public tools
    # ------------------------------------------------------------------

    def time_focused_analysis(self, start_time, end_time, k=5):
        """Identify key scenes within a time span and caption them.

        The visual features inside the span are clustered with k-means; the
        feature closest to each cluster centre is captioned.

        Args:
            start_time: Start of the span in seconds.
            end_time: End of the span in seconds (clamped to the video length).
            k: Number of clusters (reduced when the span has fewer features).

        Returns:
            A string of timestamped captions, or an error message. Exceptions
            are reported in the returned string rather than raised.
        """
        try:
            print(f'Time focused analysis - start_time: {start_time}s, end_time: {end_time}s, k={k}')

            if start_time >= end_time or start_time < 0:
                return "Invalid time span: start time must be less than end time and non-negative"

            # Fail fast instead of clustering first and rejecting afterwards.
            if self.caption_model not in self._SUPPORTED_CAPTION_MODELS:
                return self._unsupported_caption_model_message()

            if end_time > self.max_seconds:
                end_time = self.max_seconds
                print(f"Warning: end_time adjusted to video duration {end_time:.2f}s")

            video_visual_emb = self._load_visual_embedding()
            valid_indices = self._feature_indices_in_range(
                len(video_visual_emb), start_time, end_time)

            if len(valid_indices) == 0:
                return f"No segments found in the specified time range {start_time:.2f}s to {end_time:.2f}s"

            if len(valid_indices) < k:
                k = len(valid_indices)
                print(f"Warning: Adjusted k to {k} due to limited segments in time range")

            segment_features = video_visual_emb[valid_indices]

            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            cluster_labels = kmeans.fit_predict(segment_features)
            cluster_centers = kmeans.cluster_centers_

            # Pick, for every cluster, the member closest to its centre.
            selected_frames = []
            for cluster_id in range(k):
                cluster_mask = cluster_labels == cluster_id
                members = segment_features[cluster_mask]
                if len(members) == 0:
                    continue
                distances = np.linalg.norm(members - cluster_centers[cluster_id], axis=1)
                selected_frames.append(valid_indices[cluster_mask][np.argmin(distances)])

            # Report in chronological order.
            selected_frames.sort()

            header = f"Key scenes analysis for time range {start_time:.2f}s to {end_time:.2f}s:\n\n"

            if self.caption_model == 'lavila':
                captions = self._lavila_captions(selected_frames)
                return header + "".join(
                    f"[{time:.2f}s]: {caption}\n" for time, caption in captions)

            results = self._llava_captions(
                selected_frames,
                "Describe this image briefly, focusing on the main activities and objects.")
            if results is None:
                return "Unable to open video file"
            return header + "".join(
                f"[{time_point:.2f}s]: {description}\n\n" for time_point, description in results)

        except Exception as e:
            return f"Error in time_focused_analysis: {str(e)}"

    def key_frame_selection(self, query, start_time, end_time, k=5):
        """Find the frames most relevant to ``query`` in a span and caption them.

        Query/frame similarity scores are smoothed with a 3-wide moving
        average; runs of above-average scores are treated as peaks and the
        best frame of each peak is kept. The selection is then padded or
        trimmed to ``k`` frames.

        Args:
            query: Text query.
            start_time: Start of the span in seconds.
            end_time: End of the span in seconds (clamped to the video length).
            k: Number of frames to return.

        Returns:
            A string of timestamped captions, or an error message.
        """
        print('query, start_time, end_time:', query, start_time, end_time)

        if start_time >= end_time or start_time < 0:
            return "Invalid time span: start time must be less than end time and non-negative"

        # Fail fast instead of scoring first and rejecting afterwards.
        if self.caption_model not in self._SUPPORTED_CAPTION_MODELS:
            return self._unsupported_caption_model_message()

        if end_time > self.max_seconds:
            end_time = self.max_seconds

        video_visual_emb = self._load_visual_embedding()

        with torch.no_grad():
            des2visual_emb = self.eva_clip.get_text_features(query).cpu().numpy()

        visual_scores = compute_cosine_similarity(des2visual_emb, video_visual_emb)

        valid_indices = self._feature_indices_in_range(
            len(video_visual_emb), start_time, end_time)
        if len(valid_indices) == 0:
            return f"No segments found in the specified time range"

        valid_scores = visual_scores[valid_indices]

        # 1. Smooth the scores with an edge-padded moving average.
        window_size = 3
        half_window = window_size // 2
        padded_scores = np.pad(valid_scores, (half_window, half_window), mode='edge')
        smoothed_scores = np.zeros_like(valid_scores)
        for i in range(len(valid_scores)):
            smoothed_scores[i] = np.mean(padded_scores[i:i + window_size])

        # 2. Peaks are runs of positions scoring above the mean.
        peak_mask = smoothed_scores > np.mean(smoothed_scores)
        peak_regions = self._merge_continuous_spans(np.flatnonzero(peak_mask).tolist())

        # 3. Keep the best-scoring position of every peak region.
        peak_positions = [
            region_start + int(np.argmax(smoothed_scores[region_start:region_end + 1]))
            for region_start, region_end in peak_regions
        ]
        top_frames = [valid_indices[pos] for pos in peak_positions]

        if len(top_frames) < k:
            # Too few peaks: top up with the best remaining positions.
            taken = set(peak_positions)
            remaining = [pos for pos in range(len(valid_indices)) if pos not in taken]
            order = np.argsort(-smoothed_scores[remaining])
            missing = k - len(top_frames)
            top_frames.extend(valid_indices[remaining[j]] for j in order[:missing])
        elif len(top_frames) > k:
            # Too many peaks: keep the k frames with the highest raw score.
            frame_scores = np.array([visual_scores[idx] for idx in top_frames])
            order = np.argsort(-frame_scores)
            top_frames = [top_frames[j] for j in order[:k]]

        if self.caption_model == 'lavila':
            captions = self._lavila_captions(top_frames)
            captions.sort(key=lambda x: x[0])
            return "".join(f"[{time:.2f}s]: {caption}\n" for time, caption in captions)

        results = self._llava_captions(
            top_frames,
            f"Describe this image briefly, focusing on elements related to '{query}'.")
        if results is None:
            return "Unable to open video file"
        results.sort(key=lambda x: x[0])

        output = f"LLava-NeXT captions related to '{query}' in time range {start_time:.2f}s to {end_time:.2f}s:\n\n"
        return output + "".join(
            f"[{time_point:.2f}s]: {description}\n\n" for time_point, description in results)

    def frame_analysis(self, input_list):
        """
        Analyze spatial relationships and visual attributes at specific time points in the video.

        Args:
            input_list: List of ``(question, time_point)`` tuples, with
                ``time_point`` in seconds.

        Returns:
            A string with one "Question/Answer" section per entry, ordered by
            time point. Invalid entries are reported in place (after the
            valid ones) instead of aborting the whole request.
        """
        if not isinstance(input_list, list):
            return "Please provide a valid list of questions and time points in the format: [(question1, time_point1), (question2, time_point2), ...]"

        if not input_list:
            return "The provided list of questions and time points is empty"

        all_results = []
        cap = None  # opened lazily: invalid-only requests never touch the video
        try:
            for item in input_list:
                if not (isinstance(item, (list, tuple)) and len(item) == 2):
                    all_results.append((None, repr(item), "Invalid entry: expected a (question, time_point) pair"))
                    continue
                question, time_point = item

                if not self._is_valid_time_point(time_point):
                    all_results.append((time_point, question, "Invalid time point"))
                    continue

                if time_point < 0 or time_point > self.max_seconds:
                    all_results.append((time_point, question, f"Time point {time_point}s is outside the video range (0-{self.max_seconds}s)"))
                    continue

                if cap is None:
                    cap = cv2.VideoCapture(self.video_path)
                    if not cap.isOpened():
                        return "Unable to open video file"

                frame, used_last_frame = self._read_frame(cap, int(time_point * self.fps))
                if frame is None:
                    all_results.append((time_point, question, "Unable to read frame"))
                    continue
                if used_last_frame:
                    print(f"Warning: Time point {time_point}s corresponds to the total number of frames, using the last frame instead")

                answer = self._ask_llava_about_frame(
                    frame,
                    self._frame_question_prompt(time_point, question),
                    512,
                    "LLaVA inference error")
                all_results.append((time_point, question, answer))
        finally:
            if cap is not None:
                cap.release()

        # Numeric time points first (ascending); invalid ones keep input order.
        is_valid = self._is_valid_time_point
        all_results.sort(key=lambda r: (0, r[0]) if is_valid(r[0]) else (1, 0))

        sections = []
        for time_point, question, answer in all_results:
            # ':.2f' would raise for the non-numeric values reported above.
            label = f"{time_point:.2f}s" if is_valid(time_point) else repr(time_point)
            sections.append(f"Question at {label}: {question}\nAnswer: {answer}\n\n")

        return "".join(sections).strip()
    


