from __future__ import annotations

import os
import warnings
import base64
import json
import time
import json_repair
import re
from typing import Optional, Tuple

from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.vlm.qwen2_vl.prompt import Qwen2VLPromptMixin
from vlmeval.api.qwen_api import QwenAPI


def ensure_image_url(image: str) -> str:
    """Return *image* in the URL form placed into message content.

    Strings that already start with ``http://``, ``https://``, ``file://`` or
    ``data:image`` are returned unchanged. Any other string is treated as a
    local path: if it exists on disk it is returned with a ``file://`` prefix.

    Raises:
        ValueError: if *image* is neither a recognised URL nor an existing path.
    """
    # ``data:image`` (instead of ``data:image;``) also accepts standard data
    # URLs such as ``data:image/png;base64,...``.
    url_prefixes = ('http://', 'https://', 'file://', 'data:image')
    if image.startswith(url_prefixes):
        return image
    if os.path.exists(image):
        return 'file://' + image
    raise ValueError(f'Invalid image: {image}')


class Qwen2VLAPI(Qwen2VLPromptMixin, BaseAPI):
    """API wrapper that sends image/text prompts to a Qwen-VL model via DashScope.

    Requests are issued with ``dashscope.MultiModalConversation.call`` using the
    sampling parameters captured at construction time.
    """

    is_api: bool = True

    def __init__(
        self,
        model: str = 'qwen-vl-max-0809',
        key: str | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        max_length=4096,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
        presence_penalty=0.0,
        seed=3407,
        use_custom_prompt: bool = True,
        **kwargs,
    ):
        """Store generation settings and register the DashScope API key.

        Args:
            model: DashScope model name.
            key: API key; when ``None`` the ``DASHSCOPE_API_KEY`` environment
                variable is used instead.
            min_pixels: optional ``min_pixels`` value attached to each image item.
            max_pixels: optional ``max_pixels`` value attached to each image item.
            max_length, top_p, top_k, temperature, repetition_penalty,
            presence_penalty, seed: passed as keyword arguments on every call.
            use_custom_prompt: forwarded to the base-class constructor.
            **kwargs: forwarded to the base-class constructor.

        Raises:
            AssertionError: if no API key is available.
        """
        import dashscope

        self.model = model
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.generate_kwargs = dict(
            max_length=max_length,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            presence_penalty=presence_penalty,
            seed=seed,
        )

        key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
        assert key is not None, (
            'Please set the API Key (obtain it here: '
            'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
        )
        # The key is set on the dashscope module itself, i.e. process-wide.
        dashscope.api_key = key
        super().__init__(use_custom_prompt=use_custom_prompt, **kwargs)

    def _prepare_content(
        self, inputs: list[dict[str, str]], dataset: str | None = None
    ) -> list[dict[str, str | int]]:
        """Convert ``{'type', 'value'}`` items into message content items.

        Image items become ``{'type': 'image', 'image': <url>, ...}`` with
        optional ``min_pixels`` / ``max_pixels``; text items become
        ``{'type': 'text', 'text': <str>}``.

        Raises:
            ValueError: for an unknown item type or an unresolvable image.
        """
        content = []
        for s in inputs:
            if s['type'] == 'image':
                item = {'type': 'image', 'image': ensure_image_url(s['value'])}
                if dataset == 'OCRBench':
                    # OCRBench overrides the configured lower bound.
                    min_pixels = 10 * 10 * 28 * 28
                    warnings.warn(f"OCRBench dataset uses custom min_pixels={min_pixels}")
                else:
                    min_pixels = self.min_pixels
                if min_pixels is not None:
                    item['min_pixels'] = min_pixels
                if self.max_pixels is not None:
                    item['max_pixels'] = self.max_pixels
            elif s['type'] == 'text':
                item = {'type': 'text', 'text': s['value']}
            else:
                raise ValueError(f"Invalid message type: {s['type']}, {s}")
            content.append(item)
        return content

    def generate_inner(self, inputs, **kwargs) -> tuple[int, str, str]:
        """Run one request and return ``(status, answer, log)``.

        ``dataset`` (if present in ``kwargs``) only influences content
        preparation; every other keyword overrides the stored generation
        settings for this call.

        Returns:
            ``(0, answer, 'Succeeded! ')`` on success, otherwise
            ``(-1, '', <error description>)``.
        """
        import dashscope

        dataset = kwargs.pop('dataset', None)

        messages = []
        if self.system_prompt is not None:
            messages.append({'role': 'system', 'content': self.system_prompt})
        messages.append({'role': 'user', 'content': self._prepare_content(inputs, dataset=dataset)})
        if self.verbose:
            print(f'\033[31m{messages}\033[0m')

        generation_kwargs = {**self.generate_kwargs, **kwargs}
        try:
            response = dashscope.MultiModalConversation.call(
                model=self.model,
                messages=messages,
                **generation_kwargs,
            )
            if self.verbose:
                print(response)
            answer = response.output.choices[0]['message']['content'][0]['text']
            return 0, answer, 'Succeeded! '
        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(f'The input messages are {inputs}.')
            # Hand the error back in the log slot so it is not lost when
            # verbose logging is off.
            return -1, '', f'{type(err)}: {err}'


class QVQAPI(Qwen2VLPromptMixin, BaseAPI):
    """API wrapper for QVQ models on DashScope, called in streaming mode.

    The request is sent with ``stream=True`` and ``enable_thinking=True``; the
    streamed answer chunks are concatenated into the returned answer, while
    reasoning chunks are only echoed when ``verbose`` is set.
    """

    is_api: bool = True

    def __init__(
        self,
        model: str = 'qvq-max',
        key: str | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        max_length=4096,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
        presence_penalty=0.0,
        seed=3407,
        use_custom_prompt: bool = True,
        verbose: bool = False,
        **kwargs,
    ):
        """Store generation settings and register the DashScope API key.

        Args:
            model: DashScope model name.
            key: API key; when ``None`` the ``DASHSCOPE_API_KEY`` environment
                variable is used instead.
            min_pixels: optional ``min_pixels`` value attached to each image item.
            max_pixels: optional ``max_pixels`` value attached to each image item.
            max_length, top_p, top_k, temperature, repetition_penalty,
            presence_penalty, seed: passed as keyword arguments on every call.
            use_custom_prompt, verbose, **kwargs: forwarded to the base-class
                constructor.

        Raises:
            AssertionError: if no API key is available.
        """
        import dashscope

        self.model = model
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.generate_kwargs = dict(
            max_length=max_length,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            presence_penalty=presence_penalty,
            seed=seed,
        )

        key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
        assert key is not None, (
            'Please set the API Key (obtain it here: '
            'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
        )
        # The key is set on the dashscope module itself, i.e. process-wide.
        dashscope.api_key = key
        super().__init__(use_custom_prompt=use_custom_prompt, verbose=verbose, **kwargs)

    def _prepare_content(
        self, inputs: list[dict[str, str]], dataset: str | None = None
    ) -> list[dict[str, str | int]]:
        """Convert ``{'type', 'value'}`` items into message content items.

        Image items become ``{'image': <url>, ...}`` with optional
        ``min_pixels`` / ``max_pixels``; text items become ``{'text': <str>}``.

        Raises:
            ValueError: for an unknown item type or an unresolvable image.
        """
        content = []
        for s in inputs:
            if s['type'] == 'image':
                item = {'image': ensure_image_url(s['value'])}
                if dataset == 'OCRBench':
                    # OCRBench overrides the configured lower bound.
                    min_pixels = 10 * 10 * 28 * 28
                    warnings.warn(f"OCRBench dataset uses custom min_pixels={min_pixels}")
                else:
                    min_pixels = self.min_pixels
                if min_pixels is not None:
                    item['min_pixels'] = min_pixels
                if self.max_pixels is not None:
                    item['max_pixels'] = self.max_pixels
            elif s['type'] == 'text':
                item = {'text': s['value']}
            else:
                raise ValueError(f"Invalid message type: {s['type']}, {s}")
            content.append(item)
        return content

    def generate_inner(self, inputs, **kwargs) -> tuple[int, str, str]:
        """Stream one request and return ``(status, answer, log)``.

        ``dataset`` (if present in ``kwargs``) only influences content
        preparation; every other keyword overrides the stored generation
        settings for this call.

        Returns:
            ``(0, answer, 'Succeeded! ')`` on success, otherwise
            ``(-1, '', <error description>)``.
        """
        import dashscope

        dataset = kwargs.pop('dataset', None)

        messages = []
        if self.system_prompt is not None:
            messages.append({'role': 'system', 'content': self.system_prompt})
        messages.append({'role': 'user', 'content': self._prepare_content(inputs, dataset=dataset)})
        if self.verbose:
            print(f'\033[31m{messages}\033[0m')

        generation_kwargs = {**self.generate_kwargs, **kwargs}
        try:
            response = dashscope.MultiModalConversation.call(
                model=self.model,
                messages=messages,
                stream=True,
                enable_thinking=True,
                **generation_kwargs,
            )

            answer_content = ""
            # Flips to True once the first answer chunk arrives.
            is_answering = False
            if self.verbose:
                print("=" * 20 + "思考过程" + "=" * 20)

            for chunk in response:
                message = chunk.output.choices[0].message
                reasoning_chunk = message.get("reasoning_content", None)
                content = message.content

                # Skip chunks that carry neither reasoning nor answer text.
                if content == [] and reasoning_chunk == "":
                    continue

                if reasoning_chunk is not None and content == []:
                    # Reasoning phase: echoed for debugging, not returned.
                    if self.verbose:
                        print(reasoning_chunk, end="")
                elif content != []:
                    # Answer phase: accumulate the streamed text.
                    if not is_answering:
                        if self.verbose:
                            print("\n" + "=" * 20 + "完整回复" + "=" * 20)
                        is_answering = True
                    if self.verbose:
                        print(content[0]["text"], end="")
                    answer_content += content[0]["text"]

            return 0, answer_content, 'Succeeded! '
        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(f'The input messages are {inputs}.')
            # Hand the error back in the log slot so it is not lost when
            # verbose logging is off.
            return -1, '', f'{type(err)}: {err}'

class Qwen3VLAPI(Qwen2VLPromptMixin, BaseAPI):
    """API wrapper for Qwen3-VL models on DashScope, called in streaming mode.

    Unlike the sibling wrappers in this file, the API key is kept on the
    instance and passed with each call instead of being set on the
    ``dashscope`` module.
    """

    is_api: bool = True

    def __init__(
        self,
        model: str = 'qwen3-vl-plus',
        key: str | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        max_length=4096,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
        presence_penalty=0.0,
        seed=3407,
        thinking_budget: int | None = 500,
        enable_thinking: bool = True,
        use_custom_prompt: bool = True,
        verbose: bool = False,
        **kwargs,
    ):
        """Store generation settings and the DashScope API key.

        Args:
            model: DashScope model name.
            key: API key; when ``None`` the ``DASHSCOPE_API_KEY`` environment
                variable is used instead.
            min_pixels: optional ``min_pixels`` value attached to each image item.
            max_pixels: optional ``max_pixels`` value attached to each image item.
            max_length, top_p, top_k, temperature, repetition_penalty,
            presence_penalty, seed: passed as keyword arguments on every call.
            thinking_budget: passed as ``thinking_budget`` on every call;
                omitted from the request when ``None``.
            enable_thinking: passed as ``enable_thinking`` on every call.
            use_custom_prompt, verbose, **kwargs: forwarded to the base-class
                constructor.

        Raises:
            AssertionError: if no API key is available.
        """
        # Not otherwise used here; keeps a missing-package ImportError at
        # construction time rather than at the first request.
        import dashscope  # noqa: F401

        self.model = model
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.enable_thinking = enable_thinking
        self.thinking_budget = thinking_budget

        self.generate_kwargs = dict(
            max_length=max_length,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            presence_penalty=presence_penalty,
            seed=seed,
        )

        # Resolve the API key: explicit argument first, then the environment.
        self.api_key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
        assert self.api_key is not None, (
            'Please set the API Key (obtain it here: '
            'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
        )

        super().__init__(use_custom_prompt=use_custom_prompt, verbose=verbose, **kwargs)

    def _prepare_content(
        self, inputs: list[dict[str, str]], dataset: str | None = None
    ) -> list[dict[str, str | int]]:
        """Convert ``{'type', 'value'}`` items into message content items.

        Content items carry no ``type`` field here: images become
        ``{'image': <url>, ...}`` with optional ``min_pixels`` / ``max_pixels``
        and text becomes ``{'text': <str>}``.

        Raises:
            ValueError: for an unknown item type or an unresolvable image.
        """
        content = []
        for s in inputs:
            if s['type'] == 'image':
                item = {'image': ensure_image_url(s['value'])}
                if dataset == 'OCRBench':
                    # OCRBench overrides the configured lower bound.
                    min_pixels = 10 * 10 * 28 * 28
                    warnings.warn(f"OCRBench dataset uses custom min_pixels={min_pixels}")
                else:
                    min_pixels = self.min_pixels
                if min_pixels is not None:
                    item['min_pixels'] = min_pixels
                if self.max_pixels is not None:
                    item['max_pixels'] = self.max_pixels
            elif s['type'] == 'text':
                item = {'text': s['value']}
            else:
                raise ValueError(f"Invalid message type: {s['type']}, {s}")
            content.append(item)
        return content

    def generate_inner(self, inputs, **kwargs) -> tuple[int, str, str]:
        """Stream one request and return ``(status, answer, log)``.

        ``dataset`` (if present in ``kwargs``) only influences content
        preparation; every other keyword overrides the stored generation
        settings for this call.

        Returns:
            ``(0, answer, 'Succeeded! ')`` on success, otherwise
            ``(-1, '', <error description>)``.
        """
        from dashscope import MultiModalConversation

        dataset = kwargs.pop('dataset', None)

        messages = []
        if self.system_prompt is not None:
            messages.append({'role': 'system', 'content': [{"text": self.system_prompt}]})
        messages.append({
            'role': 'user',
            'content': self._prepare_content(inputs, dataset=dataset),
        })

        if self.verbose:
            print(f'\033[31m{messages}\033[0m')

        generation_kwargs = {**self.generate_kwargs, **kwargs}

        # Only send a thinking budget when one is configured, instead of
        # putting an explicit None into the request.
        thinking_kwargs = {'enable_thinking': self.enable_thinking}
        if self.thinking_budget is not None:
            thinking_kwargs['thinking_budget'] = self.thinking_budget

        try:
            response = MultiModalConversation.call(
                api_key=self.api_key,
                model=self.model,
                messages=messages,
                stream=True,
                **thinking_kwargs,
                **generation_kwargs,
            )

            answer_content = ""
            # Flips to True once the first answer chunk arrives.
            is_answering = False

            if self.verbose:
                print("=" * 20 + "思考过程" + "=" * 20)

            for chunk in response:
                message = chunk.output.choices[0].message
                reasoning_chunk = message.get("reasoning_content", None)
                content = message.content

                # Skip chunks that carry neither reasoning nor answer text.
                if content == [] and reasoning_chunk == "":
                    continue

                if reasoning_chunk is not None and content == []:
                    # Reasoning phase: echoed for debugging, not returned.
                    if self.verbose:
                        print(reasoning_chunk, end="")
                elif content != []:
                    # Answer phase: accumulate the streamed text.
                    if not is_answering:
                        if self.verbose:
                            print("\n" + "=" * 20 + "完整回复" + "=" * 20)
                        is_answering = True
                    if self.verbose:
                        print(content[0]["text"], end="")
                    answer_content += content[0]["text"]

            return 0, answer_content, 'Succeeded! '

        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(f'The input messages are {inputs}.')
            # Hand the error back in the log slot so it is not lost when
            # verbose logging is off.
            return -1, '', f'{type(err)}: {err}'



class QwenVLWrapper(BaseAPI):
    """Wrapper around DashScope ``MultiModalConversation`` for Qwen-VL models.

    Accepts either a flat list of ``{'type', 'value'}`` items (a single user
    turn) or a list of ``{'role', 'content'}`` turns whose ``content`` is such
    a list.
    """

    is_api: bool = True

    def __init__(self,
                 model: str = 'qwen-vl-plus',
                 retry: int = 5,
                 wait: int = 5,
                 key: str = None,
                 verbose: bool = False,
                 temperature: float = 0.0,
                 system_prompt: str = None,
                 max_tokens: int = 8196,
                 proxy: str = None,
                 **kwargs):
        """Store settings, register the DashScope API key and optional proxy.

        Args:
            model: DashScope model name.
            retry, wait, system_prompt, verbose, **kwargs: forwarded to the
                base-class constructor.
            key: API key; when ``None`` the ``DASHSCOPE_API_KEY`` environment
                variable is used instead.
            temperature, max_tokens: stored on the instance.
            proxy: when given, passed to ``proxy_set``.

        Raises:
            AssertionError: if no API key is available.
        """
        # assert model in ['qwen-vl-plus', 'qwen-vl-max']
        self.model = model
        import dashscope
        self.fail_msg = 'Failed to obtain answer via API. '
        self.max_tokens = max_tokens
        self.temperature = temperature
        if key is None:
            key = os.environ.get('DASHSCOPE_API_KEY', None)
        assert key is not None, (
            'Please set the API Key (obtain it here: '
            'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
        )
        # The key is set on the dashscope module itself, i.e. process-wide.
        dashscope.api_key = key
        if proxy is not None:
            proxy_set(proxy)
        super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

    @staticmethod
    def _to_image_url(value: str) -> str:
        """Prefix a bare path with ``file://``; leave existing URLs untouched."""
        if value.startswith(('http://', 'https://', 'file://', 'data:image')):
            return value
        return 'file://' + value

    # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
    # content can be a string or a list of image & text
    def prepare_itlist(self, inputs):
        """Convert ``{'type', 'value'}`` items into DashScope content items.

        With at least one image, each text/image item maps to its own
        ``{'text': ...}`` / ``{'image': ...}`` entry (other types are skipped).
        Without images, all text values are joined with newlines into a single
        ``{'text': ...}`` entry.
        """
        assert all(isinstance(x, dict) for x in inputs)
        has_images = any(x['type'] == 'image' for x in inputs)
        if not has_images:
            assert all(x['type'] == 'text' for x in inputs)
            text = '\n'.join(x['value'] for x in inputs)
            return [dict(text=text)]

        content_list = []
        for msg in inputs:
            if msg['type'] == 'text':
                content_list.append(dict(text=msg['value']))
            elif msg['type'] == 'image':
                # Values that are already URLs must not get a second scheme.
                content_list.append(dict(image=self._to_image_url(msg['value'])))
        return content_list

    def prepare_inputs(self, inputs):
        """Build the message list (optional system turn + user/assistant turns)."""
        input_msgs = []
        if self.system_prompt is not None:
            input_msgs.append(dict(role='system', content=self.system_prompt))
        assert isinstance(inputs, list) and isinstance(inputs[0], dict)
        assert all('type' in x for x in inputs) or all('role' in x for x in inputs), inputs
        if 'role' in inputs[0]:
            # Multi-turn input: the conversation has to end with a user turn.
            assert inputs[-1]['role'] == 'user', inputs[-1]
            for item in inputs:
                input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
        else:
            input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
        return input_msgs

    def generate_inner(self, inputs, **kwargs) -> tuple[int, str, str]:
        """Run one request and return ``(status, answer, log)``.

        Returns:
            ``(0, answer, 'Succeeded! ')`` on success, ``(-1, '', '')`` on
            failure.

        Raises:
            AssertionError: if the input contains no image at all.
        """
        from dashscope import MultiModalConversation
        assert isinstance(inputs, (str, list))

        if 'type' in inputs[0]:
            pure_text = all(x['type'] == 'text' for x in inputs)
        else:
            pure_text = all(
                x['type'] == 'text' for inp in inputs for x in inp['content']
            )

        # This wrapper only handles requests that contain at least one image.
        assert not pure_text
        messages = self.prepare_inputs(inputs)
        # NOTE(review): ``self.temperature``, ``self.max_tokens`` and ``kwargs``
        # are not forwarded to the call below (the original assembled a
        # ``gen_config`` dict that was never used). Left as-is because
        # forwarding them would change the requests being sent — confirm the
        # intended parameter names before wiring them in.
        try:
            response = MultiModalConversation.call(model=self.model, messages=messages)
            if self.verbose:
                print(response)
            answer = response.output.choices[0]['message']['content'][0]['text']
            return 0, answer, 'Succeeded! '
        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(f'The input messages are {inputs}.')

            return -1, '', ''


class QwenVLAPI(QwenVLWrapper):
    """``QwenVLWrapper`` whose ``generate`` accepts, and ignores, a ``dataset`` argument."""

    def generate(self, message, dataset=None):
        """Delegate to the parent ``generate``; ``dataset`` is not passed on."""
        parent = super()
        return parent.generate(message)


class QwenVLMultiStepAPI(Qwen2VLPromptMixin, BaseAPI):
    """Multi-step pipeline pairing a vision-language model with a text LLM.

    ``generate_inner`` runs three stages:

    1. the VL model describes the image(s);
    2. for ``round`` iterations, the LLM proposes up to three follow-up
       questions and the VL model answers each one against the same image(s);
    3. the LLM produces the final answer from the description plus the
       accumulated follow-up analysis.

    A JSON trace of all prompts and responses is written to ``log_dir``.
    """

    is_api: bool = True

    # Prompt used for the initial image description.
    # NOTE: an earlier variant asked the VL model for a JSON
    # ``reasoning``/``answer`` pair at this step; it was disabled in favour of
    # a plain description.
    _DESCRIBE_PROMPT = (
        "Please provide a detailed description of the image directly. "
        "If there are multiple images, please explain the differences and the related aspects between them. "
        "Do not provide or infer information that is not present in the images."
    )

    def __init__(self,
                 vl_model: str = 'qwen-vl-max',
                 lm_model: str = 'qwen3-32b',
                 use_custom_prompt: bool = True,
                 verbose: bool = False,
                 enable_thinking: bool = True,
                 visual_enable_thinking: bool = False,
                 round: int = 2,
                 retry: Optional[int] = 10,
                 log_dir: Optional[str] = 'MathVista_QwenVLMaxQwen3_9_22',
                 **kwargs):
        """Create the VL and LM sub-clients.

        Args:
            vl_model: vision-language model name; names containing
                ``"qwen3-vl"`` use ``Qwen3VLAPI``, all others ``Qwen2VLAPI``.
            lm_model: text model name passed to ``QwenAPI``.
            use_custom_prompt, verbose: forwarded to the base-class constructor.
            enable_thinking: passed to the LM client.
            visual_enable_thinking: passed to the VL client (``Qwen3VLAPI`` only).
            round: number of question/answer refinement iterations.
            retry: maximum attempts per sub-call; at least one attempt is
                always made.
            log_dir: directory for the JSON trace files; a falsy value
                disables trace writing.
            **kwargs: forwarded to the base class and to both sub-clients.
        """
        super().__init__(use_custom_prompt=use_custom_prompt, verbose=verbose, **kwargs)
        if "qwen3-vl" in vl_model:
            self.vl_api = Qwen3VLAPI(
                model=vl_model, retry=retry, verbose=verbose,
                enable_thinking=visual_enable_thinking, **kwargs,
            )
        else:
            self.vl_api = Qwen2VLAPI(model=vl_model, retry=retry, verbose=verbose, **kwargs)
        self.lm_api = QwenAPI(
            model=lm_model, retry=retry, verbose=verbose,
            enable_thinking=enable_thinking, **kwargs,
        )
        self.retry = retry
        self.round = round
        self.log_dir = log_dir

    def parse_json_text_with_remaining(
        self, raw_response: str, print_fail_resp: bool = False,
    ) -> Tuple[Optional[object], Optional[str]]:
        """Extract JSON from ```` ```json ```` fences in *raw_response*.

        With no fenced block the whole response is parsed; with exactly one,
        that block is parsed; with several, a list with one parsed value per
        block is returned. For a dict result, literal ``\\n`` sequences in
        string values are turned into real newlines.

        Returns:
            ``(parsed, remaining_text)`` where ``remaining_text`` is the
            response with all fenced blocks removed, or ``(None, None)`` if
            anything raised.
        """
        pattern = r'```json(.*?)```'
        try:
            matches = re.findall(pattern, raw_response, re.DOTALL)
            if not matches:
                matches = [raw_response]
            if len(matches) == 1:
                formatted_response = json_repair.loads(matches[0].strip())
                if not formatted_response and print_fail_resp:
                    print("=====")
                    print(f"{raw_response}")
                    print("=====")
            else:
                formatted_response = [json_repair.loads(match_text) for match_text in matches]

            if isinstance(formatted_response, dict):
                for k, v in formatted_response.items():
                    if v and isinstance(v, str):
                        formatted_response[k] = v.replace("\\n", "\n")
            remaining_text = re.sub(pattern, '', raw_response, flags=re.DOTALL).strip()
            return formatted_response, remaining_text
        except Exception:
            print("Fail to parse one output.")
            if print_fail_resp:
                print("=====")
                print(f"{raw_response}")
                print("=====")
            return None, None

    @staticmethod
    def _first_dict(parsed) -> Optional[dict]:
        """Return *parsed* if it is a dict, the first dict in a list, else None."""
        if isinstance(parsed, dict):
            return parsed
        if isinstance(parsed, list):
            return next((p for p in parsed if isinstance(p, dict)), None)
        return None

    def _call_with_retry(self, api, inputs, label: str) -> Tuple[int, str]:
        """Call ``api.generate_inner(inputs)`` until it succeeds or attempts run out.

        Returns the ``(status, response)`` of the last attempt.
        """
        # Always try at least once, even when ``retry`` is 0 or None.
        attempts = max(1, self.retry or 1)
        status, response = -1, ''
        for _ in range(attempts):
            status, response, _log = api.generate_inner(inputs)
            if status == 0:
                break
            time.sleep(1)
            print(f'{label} response failed, retrying...')
        return status, response

    @staticmethod
    def _vl_inputs(prompt: str, image_path: list) -> list:
        """Build VL-model input: the text prompt followed by every image."""
        return [{"type": "text", "value": prompt}] + [
            {"type": "image", "value": image} for image in image_path
        ]

    @staticmethod
    def _question_prompt(text: str, initial_response: str, analysis_history: str) -> str:
        """Prompt asking the LLM for three follow-up questions."""
        return (
            f"Question: {text}\n\n"
            f"Image Descriptions: {initial_response}\n"
            f"{analysis_history}\n"
            "Please question whether the parts of the above image descriptions that are most relevant to the problem are correct and whether more visual information needs to be added. Based on the above problem and image descriptions, pose 3 questions that require further image analysis by a VLM.\n\n"
            "Please return the questions in the following JSON format:\n"
            "```json\n"
            "{\n"
            '    "question1": "first question",\n'
            '    "question2": "second question",\n'
            '    "question3": "third question"\n'
            "}\n"
            "```"
        )

    @staticmethod
    def _follow_up_prompt(sub_question: str) -> str:
        """Prompt asking the VL model to answer one follow-up question as JSON."""
        return (
            "You are a professional visual analysis assistant. Please provide your analysis in JSON format with two parts:\n\n"
            "1. reasoning: Detailed analysis considering:\n"
            "   - Main visual elements in the image(s)\n"
            "   - Spatial relationships between these elements\n"
            "   - Any details that might help answer the question\n"
            "   - If multiple images, describe each and their relationships\n"
            "2. answer: Direct response to the question\n\n"
            f"Question: {sub_question}\n\n"
            "Please return your response in this format:\n"
            "```json\n"
            "{\n"
            '    "reasoning": "detailed analysis process",\n'
            '    "answer": "your answer",\n'
            "}\n"
            "```"
        )

    @staticmethod
    def _final_prompt(text: str, initial_response: str, analysis_history: str) -> str:
        """Prompt asking the LLM for the final answer."""
        return (
            "Please answer the question through clear reasoning steps based on all the following visual information.\n\n"
            "Reasoning Steps:\n"
            "1. Information Integration:\n"
            "   - Organize all visual observations\n"
            "   - Identify key information points\n"
            "   - Establish connections between information\n\n"
            "2. Logical Reasoning:\n"
            "   - Conduct reasoning based on integrated information\n"
            "   - Explain the reasoning process\n"
            "   - Demonstrate conclusion reliability\n\n"
            f"Question: {text}\n\n"
            f"Image Descriptions: {initial_response}\n"
            f"{analysis_history}\n"
        )

    def _answer_follow_up(self, sub_question: str, image_path: list) -> dict:
        """Ask the VL model one follow-up question and normalise its reply."""
        vl_inputs = self._vl_inputs(self._follow_up_prompt(sub_question), image_path)
        _, vlm_response = self._call_with_retry(self.vl_api, vl_inputs, 'VLM')
        parsed, _ = self.parse_json_text_with_remaining(vlm_response)
        response_dict = self._first_dict(parsed)
        if response_dict:
            answer = response_dict.get('answer', '')
            reasoning = response_dict.get('reasoning', '')
        else:
            # Unparseable reply: keep the raw text as the reasoning.
            answer = ''
            reasoning = vlm_response
        return {
            'focus_point': sub_question,
            'answer': answer,
            'reasoning': reasoning,
        }

    def _save_trace(self, qa_process: dict, image_path: list) -> None:
        """Write the QA trace as JSON into ``self.log_dir`` (best effort)."""
        if not self.log_dir:
            return
        # Text-only inputs have no image to name the file after.
        stem = os.path.basename(image_path[0]) if image_path else 'text_only'
        out_path = os.path.join(self.log_dir, f'{stem}_{qa_process["timestamp"]}.json')
        try:
            os.makedirs(self.log_dir, exist_ok=True)
            # ensure_ascii=False emits raw non-ASCII text, so pin the encoding.
            with open(out_path, 'w', encoding='utf-8') as f:
                json.dump(qa_process, f, indent=4, ensure_ascii=False)
        except OSError as err:
            # A failed trace write must not discard the computed answer.
            print(f'Failed to write QA trace to {out_path}: {err}')

    def generate_inner(self, inputs, **kwargs):
        """Run the describe -> question -> answer pipeline.

        Args:
            inputs: list of dicts, each with ``type`` (``'text'`` or
                ``'image'``) and ``value``.

        Returns:
            ``(0, final_answer, 'Succeeded!')`` when the final LLM call
            succeeded, otherwise its failure status with the last response
            and a failure message.

        Raises:
            AssertionError: if *inputs* contains neither text nor an image.
        """
        # 1. Split the input into question text and image paths.
        text = ''
        image_path = []
        for item in inputs:
            if item['type'] == 'text':
                text += item['value']
            elif item['type'] == 'image':
                image_path.append(item['value'])
        assert text or image_path, '输入必须包含文本或图片'

        qa_process = {
            "timestamp": time.strftime("%Y%m%d_%H%M%S"),
            "images": image_path,
            "question": text,
            "process": []
        }

        # 2. Initial image description (skipped for text-only input).
        initial_prompt = ""
        initial_response = ''
        if image_path:
            initial_prompt = self._DESCRIBE_PROMPT
            _, initial_response = self._call_with_retry(
                self.vl_api, self._vl_inputs(initial_prompt, image_path), 'Initial'
            )

        qa_process["process"].append({
            "step": "initial_vision",
            "initial_prompt": initial_prompt,
            "initial_answer": initial_response,
        })

        # 3. Iterative follow-up questioning.
        analysis_history = ''
        for round_no in range(1, self.round + 1):
            llm_prompt = self._question_prompt(text, initial_response, analysis_history)
            _, llm_response = self._call_with_retry(self.lm_api, [llm_prompt], 'LLM')

            parsed, _ = self.parse_json_text_with_remaining(llm_response)
            # The parser may return a list or a string; only a dict has questions.
            questions_dict = self._first_dict(parsed)
            questions = []
            if questions_dict:
                for i in range(1, 4):
                    question = questions_dict.get(f'question{i}', '')
                    if question:
                        questions.append(question)

            follow_up_answers = [
                self._answer_follow_up(sub_question, image_path)
                for sub_question in questions
            ]

            # Record this iteration in the trace.
            qa_process["process"].append({
                "iteration": round_no,
                "llm_prompt": llm_prompt,
                "3_questions": llm_response,
                "vlm_analysis": follow_up_answers,
            })

            # Append this round to the history shown to the LLM next time.
            current_round_analysis = f"\nRound {round_no} Analysis:\n"
            for i, answer in enumerate(follow_up_answers, 1):
                current_round_analysis += (
                    f"Focus Point {i}: {answer['focus_point']}\n"
                    f"Reasoning Process: {answer['reasoning']}\n"
                    f"Analysis Result: {answer['answer']}\n\n"
                )
            analysis_history += current_round_analysis

        # 4. Final answer from the LLM.
        final_prompt = self._final_prompt(text, initial_response, analysis_history)
        final_status, final_answer = self._call_with_retry(self.lm_api, [final_prompt], 'Final')

        qa_process["process"].append({
            "step": "final_answer",
            "final_prompt": final_prompt,
            "final_answer": final_answer,
        })

        # Persist the complete QA trace.
        self._save_trace(qa_process, image_path)

        if final_status != 0:
            # Report the failure instead of claiming success.
            return final_status, final_answer, 'Failed to obtain the final answer. '
        return 0, final_answer, 'Succeeded!'
