import logging
import yaml
import sys
import os
from pathlib import Path

from google import genai

from utils_api import (
    OPENAI_MODELS,
    ANTHROPIC_MODELS,
    GOOGLE_MODELS,
    QWEN_MODELS,
    INTERNVL_MODELS,
    MIMO_MODELS,
)

from utils import (
    encode_image,
    sample_frame,
)


def get_duration(start: float, end: float):
    """Format the span between two timestamps as ``m:ss``.

    Args:
        start: Start timestamp (treated as seconds: the span is split by 60).
        end: End timestamp, in the same unit as ``start``.

    Returns:
        A string with the whole minutes, a colon, and the remaining seconds
        zero-padded to two digits (e.g. ``"1:05"``). Fractional seconds are
        truncated.
    """
    minutes, seconds = divmod(end - start, 60)
    # Zero-pad the seconds so that 65 s reads "1:05" and not the ambiguous "1:5".
    return f"{int(minutes)}:{int(seconds):02d}"


def load_prompt_template(filepath):
    """Load the prompt template components from a YAML file.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    logging.info("Load prompt template ...")
    # Pin the encoding: without it, decoding depends on the platform locale
    # and non-ASCII template text can be garbled or fail to load.
    with open(filepath, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def format_benchmark_openai_response_api(
    args, example, template_components, instructions
):
    """
    Default benchmarking prompt format for Response API

    Sections are emitted in this order: prefix, parts (text + parts image),
    assembly manual, recording (text + sampled frames), task.

    Returns a tuple ``(content, text_prompt)``: the list of
    ``input_text`` / ``input_image`` parts and a stripped plain-text
    rendering in which every image appears as ``<filepath>``.
    """
    content, lines = [], []

    def add_text(text):
        # One text part plus the matching line of the plain-text rendering.
        content.append({"type": "input_text", "text": text})
        lines.append(text)

    def add_image(path):
        # Image embedded as a data URL; the rendering only records its path.
        data_url = f"data:image/png;base64,{encode_image(path)}"
        content.append({"type": "input_image", "image_url": data_url})
        lines.append(f"<{str(path)}>")

    # prefix
    add_text(template_components["prefix"])

    # parts
    add_text(template_components["parts"])
    add_image(args.dirpath_parts_image / f"{example['toy_id']}-all_360p.png")

    # assembly manual
    add_text(template_components["manual"]["dot"].replace("{dot}", instructions["dot"]))

    # recording
    add_text(template_components["recording"])
    frame_paths = sample_frame(
        args.dirpath_frame / example["sequence_id"] / args.angle,
        example["video"]["start"],
        example["video"]["end"],
        args.max_frames,
    )
    for frame_path in frame_paths:
        add_image(frame_path)

    # task
    add_text(template_components["task"].replace("{question}", example["question"]))

    return content, "\n".join(lines).strip()


def format_benchmark_anthropic_api(args, example, template_components, instructions):
    """
    Default benchmarking prompt format for the Anthropic API

    Sections are emitted in this order: prefix, parts (text + parts image),
    assembly manual, recording (text + sampled frames), task.

    Returns a tuple ``(content, content_text)``: the list of ``text`` /
    ``image`` blocks and a stripped plain-text rendering in which every
    image appears as ``<filepath>``.
    """
    blocks, lines = [], []

    def add_text(text):
        # One text block plus the matching line of the plain-text rendering.
        blocks.append({"type": "text", "text": text})
        lines.append(text)

    def add_image(path):
        # Image sent inline as a base64 source declared as PNG.
        source = {
            "type": "base64",
            "media_type": "image/png",
            "data": encode_image(path),
        }
        blocks.append({"type": "image", "source": source})
        lines.append(f"<{str(path)}>")

    # prefix
    add_text(template_components["prefix"])

    # parts
    add_text(template_components["parts"])
    add_image(args.dirpath_parts_image / f"{example['toy_id']}-all_360p.png")

    # assembly manual
    add_text(template_components["manual"]["dot"].replace("{dot}", instructions["dot"]))

    # recording
    add_text(template_components["recording"])
    frame_paths = sample_frame(
        args.dirpath_frame / example["sequence_id"] / args.angle,
        example["video"]["start"],
        example["video"]["end"],
        args.max_frames,
    )
    for frame_path in frame_paths:
        add_image(frame_path)

    # task
    add_text(template_components["task"].replace("{question}", example["question"]))

    return blocks, "\n".join(lines).strip()


def format_benchmark_google_gemini_api(args, example, template_components, instructions):
    """
    Default benchmarking prompt format for the Gemini API

    A client is created from the ``GEMINI_API_KEY`` environment variable and
    every image (parts image and sampled frames) is uploaded through
    ``client.files.upload``.

    Returns a tuple ``(content, text_prompt, uploaded_files)``: the list of
    plain strings and upload results in prompt order, a stripped plain-text
    rendering in which every image appears as ``<filepath>``, and the list
    of upload results on their own.
    """
    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

    content, lines, uploaded_files = [], [], []

    def add_text(text):
        # Text goes into the content as a bare string.
        content.append(text)
        lines.append(text)

    def add_upload(path):
        # Upload the file and reference the returned object in the content.
        uploaded = client.files.upload(file=path)
        content.append(uploaded)
        uploaded_files.append(uploaded)
        lines.append(f"<{str(path)}>")

    # prefix
    add_text(template_components["prefix"])

    # parts
    add_text(template_components["parts"])
    add_upload(args.dirpath_parts_image / f"{example['toy_id']}-all_360p.png")

    # assembly manual
    add_text(template_components["manual"]["dot"].replace("{dot}", instructions["dot"]))

    # recording
    add_text(template_components["recording"])
    frame_paths = sample_frame(
        args.dirpath_frame / example["sequence_id"] / args.angle,
        example["video"]["start"],
        example["video"]["end"],
        args.max_frames,
    )
    for frame_path in frame_paths:
        add_upload(frame_path)

    # task
    add_text(template_components["task"].replace("{question}", example["question"]))

    return content, "\n".join(lines).strip(), uploaded_files


def format_benchmark_qwen_vllm_server(args, example, template_components, instructions):
    """
    Default benchmarking prompt format for Qwen's vLLM server mode

    Sections are emitted in this order: prefix, parts (text + parts image),
    assembly manual, recording (text + sampled frames), task. The task text
    comes from the ``task-cot`` template when ``args.reasoning`` is truthy
    and from ``task`` otherwise.

    Returns a tuple ``(content, text_prompt)``: the list of ``text`` /
    ``image_url`` parts and a stripped plain-text rendering in which every
    image appears as ``<filepath>``.
    """
    content, lines = [], []

    def add_text(text):
        # One text part plus the matching line of the plain-text rendering.
        content.append({"type": "text", "text": text})
        lines.append(text)

    def add_image(path):
        # Image embedded as a data URL; the rendering only records its path.
        data_url = f"data:image/png;base64,{encode_image(path)}"
        content.append({"type": "image_url", "image_url": {"url": data_url}})
        lines.append(f"<{str(path)}>")

    # prefix
    add_text(template_components["prefix"])

    # parts
    add_text(template_components["parts"])
    add_image(args.dirpath_parts_image / f"{example['toy_id']}-all_360p.png")

    # assembly manual
    add_text(template_components["manual"]["dot"].replace("{dot}", instructions["dot"]))

    # recording
    add_text(template_components["recording"])
    frame_paths = sample_frame(
        args.dirpath_frame / example["sequence_id"] / args.angle,
        example["video"]["start"],
        example["video"]["end"],
        args.max_frames,
    )
    for frame_path in frame_paths:
        add_image(frame_path)

    # task (chain-of-thought template when reasoning is requested)
    task_key = "task-cot" if args.reasoning else "task"
    add_text(template_components[task_key].replace("{question}", example["question"]))

    return content, "\n".join(lines).strip()


def format_benchmark_mimo_vllm_server(args, example, template_components, instructions):
    """
    Default benchmarking prompt format for MiMo's vLLM server mode

    Sections are emitted in this order: prefix, parts (text + parts image),
    assembly manual, recording (text + sampled frames), task. When
    ``args.reasoning`` is falsy, ``/no_think`` is appended to the task text.

    Returns a tuple ``(content, text_prompt)``: the list of ``text`` /
    ``image_url`` parts and a stripped plain-text rendering in which every
    image appears as ``<filepath>``.
    """
    content, lines = [], []

    def add_text(text):
        # One text part plus the matching line of the plain-text rendering.
        content.append({"type": "text", "text": text})
        lines.append(text)

    def add_image(path):
        # Image embedded as a data URL; the rendering only records its path.
        data_url = f"data:image/png;base64,{encode_image(path)}"
        content.append({"type": "image_url", "image_url": {"url": data_url}})
        lines.append(f"<{str(path)}>")

    # prefix
    add_text(template_components["prefix"])

    # parts
    add_text(template_components["parts"])
    add_image(args.dirpath_parts_image / f"{example['toy_id']}-all_360p.png")

    # assembly manual
    add_text(template_components["manual"]["dot"].replace("{dot}", instructions["dot"]))

    # recording
    add_text(template_components["recording"])
    frame_paths = sample_frame(
        args.dirpath_frame / example["sequence_id"] / args.angle,
        example["video"]["start"],
        example["video"]["end"],
        args.max_frames,
    )
    for frame_path in frame_paths:
        add_image(frame_path)

    # task
    task_text = template_components["task"].replace("{question}", example["question"])
    if not args.reasoning:
        # the suffix is only added when reasoning is not requested
        task_text += "/no_think"
    add_text(task_text)

    return content, "\n".join(lines).strip()


def format_benchmark_input(args, example, template_components, instructions):
    """
    Build the benchmark prompt with the formatter matching ``args.model_id``.

    The model lists are checked in this order: OpenAI, Anthropic, Google,
    Qwen/InternVL, MiMo. An id found in none of them terminates the process
    through ``sys.exit``.

    Returns a tuple ``(content, text_prompt, uploaded_files)``;
    ``uploaded_files`` is an empty list for every formatter except the
    Google one.
    """
    model_id = args.model_id
    builder_args = (args, example, template_components, instructions)
    uploaded_files = []

    if model_id in OPENAI_MODELS:
        content, text_prompt = format_benchmark_openai_response_api(*builder_args)
    elif model_id in ANTHROPIC_MODELS:
        content, text_prompt = format_benchmark_anthropic_api(*builder_args)
    elif model_id in GOOGLE_MODELS:
        # only this formatter reports the files it uploaded
        content, text_prompt, uploaded_files = format_benchmark_google_gemini_api(
            *builder_args
        )
    elif model_id in QWEN_MODELS + INTERNVL_MODELS:
        content, text_prompt = format_benchmark_qwen_vllm_server(*builder_args)
    elif model_id in MIMO_MODELS:
        content, text_prompt = format_benchmark_mimo_vllm_server(*builder_args)
    else:
        sys.exit(f"Undefined (format_input) {args.model_id}")

    return content, text_prompt, uploaded_files


def format_initial_user_message(
    model_id, template_components, example, pre_sample=False, args=None
):
    """
    Build the first user message for ``model_id``.

    The text is the ``user.initial`` template with ``{question}`` and
    ``{duration}`` filled in from ``example``. Google models get a
    ``genai.types.Content`` object; every other known model gets a
    ``{"role": "user", "content": ...}`` dict.

    ``pre_sample`` and ``args`` are accepted but not used here.

    Returns a tuple ``(message, text_prompt)``; for an unknown ``model_id``
    an error is logged and ``(None, "")`` is returned.
    """

    def render_initial():
        # Rendered lazily so an unknown model id never touches the template.
        with_question = template_components["user"]["initial"].replace(
            "{question}", example["question"]
        )
        duration = get_duration(example["video"]["start"], example["video"]["end"])
        return with_question.replace("{duration}", duration)

    if model_id in OPENAI_MODELS or model_id in ANTHROPIC_MODELS:
        text_prompt = render_initial()
        return {"role": "user", "content": text_prompt}, text_prompt

    if model_id in GOOGLE_MODELS:
        text_prompt = render_initial()
        message = genai.types.Content(
            role="user", parts=[genai.types.Part(text=text_prompt)]
        )
        return message, text_prompt

    if model_id in QWEN_MODELS + MIMO_MODELS + INTERNVL_MODELS:
        text_prompt = render_initial()
        return {"role": "user", "content": text_prompt}, text_prompt

    logging.error(f"Undefined (format_initial_user_message) {model_id=}")
    return None, ""


# todo, refactoring idea: add eval prompt here


def format_input_tcot_selection_google(args, example, segment, template_components):
    """
    Build the frame-selection prompt for Google models.

    Every path in ``segment`` is uploaded with ``client.files.upload`` and
    announced by a ``FrameID <n>:`` label, numbered from 1.

    Returns a tuple ``(contents, contents_text, files_uploaded)``:
    ``contents`` is a one-element list holding the prompt items,
    ``contents_text`` is ``[["user", <plain-text rendering>]]`` and
    ``files_uploaded`` maps each frame id to
    ``{"filepath": <str>, "object": <upload result>}``.
    """
    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
    selection = template_components["selection"]

    # 1. prefix
    prefix = selection["prefix"]
    content = [prefix]
    lines = [prefix]
    files_uploaded = {}

    # 2. files
    for frame_id, filepath in enumerate(segment, start=1):
        label = f"FrameID {frame_id}:"
        uploaded = client.files.upload(file=filepath)
        content += [label, uploaded]
        # label and path share a single line in the text rendering
        lines.append(f"{label}{str(filepath)}")
        files_uploaded[frame_id] = {"filepath": str(filepath), "object": uploaded}

    # 3. main
    main_text = selection["main"].replace("{question}", example["question"])
    content.append(main_text)
    lines.append(main_text)

    # 4. format
    format_text = selection["format"]
    content.append(format_text)
    lines.append(format_text)

    content_text = "\n".join(lines) + "\n"
    return [content], [["user", content_text]], files_uploaded


def format_input_tcot_selection_openai(args, example, segment, template_components):
    """
    Build the frame-selection prompt for OpenAI models.

    Every path in ``segment`` is embedded as a data URL and announced by a
    ``FrameID <n>:`` label, numbered from 1.

    Returns a tuple ``(messages, messages_text, id2filepath)``: a single
    user message with ``input_text`` / ``input_image`` parts,
    ``[["user", <plain-text rendering>]]`` and a mapping from frame id to
    the frame's path as a string.
    """
    selection = template_components["selection"]
    content, lines = [], []
    id2filepath = {}

    def add_text(text):
        content.append({"type": "input_text", "text": text})

    # 1. prefix
    prefix = selection["prefix"]
    add_text(prefix)
    lines.append(prefix)

    # 2. files
    for frame_id, filepath in enumerate(segment, start=1):
        label = f"FrameID {frame_id}:"
        add_text(label)
        # NOTE(review): the data URL is labelled image/jpeg here while the
        # Anthropic selection variant declares image/png -- confirm the
        # actual format of the frames.
        data_url = f"data:image/jpeg;base64,{encode_image(filepath)}"
        content.append({"type": "input_image", "image_url": data_url})
        # label and path share a single line in the text rendering
        lines.append(f"{label}{str(filepath)}")
        id2filepath[frame_id] = str(filepath)

    # 3. main
    main_text = selection["main"].replace("{question}", example["question"])
    add_text(main_text)
    lines.append(main_text)

    # 4. format
    format_text = selection["format"]
    add_text(format_text)
    lines.append(format_text)

    content_text = "\n".join(lines) + "\n"
    return [{"role": "user", "content": content}], [["user", content_text]], id2filepath


def format_input_tcot_selection_anthropic(args, example, segment, template_components):
    """
    Build the frame-selection prompt for Anthropic models.

    Every path in ``segment`` is embedded as a base64 image source declared
    as PNG and announced by a ``FrameID <n>:`` label, numbered from 1.

    Returns a tuple ``(messages, messages_text, id2filepath)``: a single
    user message with ``text`` / ``image`` blocks,
    ``[["user", <plain-text rendering>]]`` and a mapping from frame id to
    the frame's path as a string.
    """
    selection = template_components["selection"]
    content, lines = [], []
    id2filepath = {}

    def add_text(text):
        content.append({"type": "text", "text": text})

    # 1. prefix
    prefix = selection["prefix"]
    add_text(prefix)
    lines.append(prefix)

    # 2. files
    for frame_id, filepath in enumerate(segment, start=1):
        label = f"FrameID {frame_id}:"
        add_text(label)
        source = {
            "type": "base64",
            "media_type": "image/png",
            "data": encode_image(filepath),
        }
        content.append({"type": "image", "source": source})
        # label and path share a single line in the text rendering
        lines.append(f"{label}{str(filepath)}")
        id2filepath[frame_id] = str(filepath)

    # 3. main
    main_text = selection["main"].replace("{question}", example["question"])
    add_text(main_text)
    lines.append(main_text)

    # 4. format
    format_text = selection["format"]
    add_text(format_text)
    lines.append(format_text)

    content_text = "\n".join(lines) + "\n"
    return [{"role": "user", "content": content}], [["user", content_text]], id2filepath


def format_input_tcot_selection_qwen(args, example, segment, template_components):
    """
    Build the frame-selection prompt in the ``text`` / ``image_url`` format.

    Every path in ``segment`` is embedded as a data URL and announced by a
    ``FrameID <n>:`` label, numbered from 1.

    Returns a tuple ``(messages, messages_text, id2filepath)``: a single
    user message, ``[["user", <plain-text rendering>]]`` and a mapping from
    frame id to the frame's path as a string.
    """
    selection = template_components["selection"]
    content, lines = [], []
    id2filepath = {}

    def add_text(text):
        content.append({"type": "text", "text": text})

    # 1. prefix
    prefix = selection["prefix"]
    add_text(prefix)
    lines.append(prefix)

    # 2. files
    for frame_id, filepath in enumerate(segment, start=1):
        label = f"FrameID {frame_id}:"
        add_text(label)
        data_url = f"data:image/jpeg;base64,{encode_image(filepath)}"
        content.append({"type": "image_url", "image_url": {"url": data_url}})
        # label and path share a single line in the text rendering
        lines.append(f"{label}{str(filepath)}")
        id2filepath[frame_id] = str(filepath)

    # 3. main
    main_text = selection["main"].replace("{question}", example["question"])
    add_text(main_text)
    lines.append(main_text)

    # 4. format
    format_text = selection["format"]
    add_text(format_text)
    lines.append(format_text)

    content_text = "\n".join(lines) + "\n"
    return [{"role": "user", "content": content}], [["user", content_text]], id2filepath


def format_input_tcot_selection(args, example, segment, template_components):
    """
    Build the frame-selection prompt with the formatter matching
    ``args.model_id``.

    The model lists are checked in this order: Google, OpenAI, Anthropic,
    Qwen/InternVL/MiMo.

    Returns the chosen formatter's ``(outputs, outputs_text, files)``. For
    an unknown model id an error is logged and ``(None, "", [])`` is
    returned.
    """
    model_id = args.model_id

    if model_id in GOOGLE_MODELS:
        builder = format_input_tcot_selection_google
    elif model_id in OPENAI_MODELS:
        builder = format_input_tcot_selection_openai
    elif model_id in ANTHROPIC_MODELS:
        builder = format_input_tcot_selection_anthropic
    elif model_id in QWEN_MODELS + INTERNVL_MODELS + MIMO_MODELS:
        builder = format_input_tcot_selection_qwen
    else:
        logging.error(f"Undefined {args.model_id=} (format_input_tcot_selection)")
        return None, "", []

    outputs, outputs_text, files = builder(args, example, segment, template_components)
    return outputs, outputs_text, files


def format_input_tcot_answer_google(
    args, example, frames_input, template_components, instructions
):
    """
    Build the open-ended answering prompt for Google models.

    Sections are emitted in this order: prefix, labelled frames, parts
    (text + uploaded parts image), instruction, main question. Each frame
    label is the integer taken from the frame's ``stem`` minus
    ``int(example["video"]["start"])``, zero-padded to four digits.

    Returns a tuple ``(contents, contents_text, files_uploaded)``:
    ``contents`` is a one-element list holding the prompt items,
    ``contents_text`` is ``[["user", <plain-text rendering>]]`` and
    ``files_uploaded`` holds the upload result of the parts image.
    """
    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
    open_ended = template_components["answer"]["open-ended"]

    # 1. prefix
    prefix = open_ended["prefix"]
    content = [prefix]
    lines = [prefix]
    files_uploaded = []

    # 2. frames w/ id
    # NOTE(review): frames are put into the content exactly as passed in (no
    # upload happens here, unlike the parts image below) and must expose
    # `.stem` -- confirm what callers pass and that the client accepts it.
    for frame in frames_input:
        offset = int(frame.stem) - int(example["video"]["start"])
        label = f"{offset:04d}"
        content += [f"Frame: {label}", frame]
        lines.append(f"Frame: {label}, {str(frame)}")

    # 3. parts
    parts_text = open_ended["parts"]
    content.append(parts_text)
    lines.append(parts_text)

    parts_path = instructions["parts"]
    parts_uploaded = client.files.upload(file=parts_path)
    content.append(parts_uploaded)
    files_uploaded.append(parts_uploaded)
    lines.append(f"[{parts_path}]")

    # 4. instruction
    instruction_text = open_ended["instruction"].replace("{dot}", instructions["dot"])
    content.append(instruction_text)
    lines.append(instruction_text)

    # 5. main
    main_text = open_ended["main"].replace("{question}", example["question"])
    content.append(main_text)
    lines.append(main_text)

    content_text = "\n".join(lines) + "\n"
    return [content], [["user", content_text]], files_uploaded


def format_input_tcot_answer_openai(
    args, example, frames_input, template_components, instructions
):
    """
    Build the open-ended answering prompt for OpenAI models.

    Sections are emitted in this order: prefix, labelled frames, parts
    (text + parts image), instruction, main question. Each frame label is
    the integer file stem minus ``int(example["video"]["start"])``,
    zero-padded to four digits.

    Returns a tuple ``(messages, messages_text)``: a single user message
    with ``input_text`` / ``input_image`` parts and
    ``[["user", <plain-text rendering>]]``.
    """
    open_ended = template_components["answer"]["open-ended"]
    content, lines = [], []

    def add_text(text):
        content.append({"type": "input_text", "text": text})

    def add_image(path):
        data_url = f"data:image/jpeg;base64,{encode_image(path)}"
        content.append({"type": "input_image", "image_url": data_url})

    # 1. prefix
    prefix = open_ended["prefix"]
    add_text(prefix)
    lines.append(prefix)

    # 2. frames
    for frame_path in frames_input:
        offset = int(Path(frame_path).stem) - int(example["video"]["start"])
        label = f"{offset:04d}"
        add_text(f"Frame: {label}")
        add_image(frame_path)
        lines.append(f"Frame: {label}, {str(frame_path)}")

    # 3. parts
    parts_text = open_ended["parts"]
    add_text(parts_text)
    lines.append(parts_text)

    parts_path = instructions["parts"]
    add_image(parts_path)
    lines.append(f"{str(parts_path)}")

    # 4. instruction
    instruction_text = open_ended["instruction"].replace("{dot}", instructions["dot"])
    add_text(instruction_text)
    lines.append(instruction_text)

    # 5. main
    main_text = open_ended["main"].replace("{question}", example["question"])
    add_text(main_text)
    lines.append(main_text)

    content_text = "\n".join(lines) + "\n"
    return [{"role": "user", "content": content}], [["user", content_text]]


def format_input_tcot_answer_anthropic(
    args, example, frames_input, template_components, instructions
):
    """
    Build the open-ended answering prompt for Anthropic models.

    Sections are emitted in this order: prefix, labelled frames, parts
    (text + parts image), instruction, main question. Each frame label is
    the integer file stem minus ``int(example["video"]["start"])``,
    zero-padded to four digits.

    Returns a tuple ``(messages, messages_text)``: a single user message
    with ``text`` / ``image`` blocks and
    ``[["user", <plain-text rendering>]]``.
    """
    open_ended = template_components["answer"]["open-ended"]
    content, lines = [], []

    def add_text(text):
        content.append({"type": "text", "text": text})

    def add_image(path):
        # Image sent inline as a base64 source declared as PNG.
        source = {
            "type": "base64",
            "media_type": "image/png",
            "data": encode_image(path),
        }
        content.append({"type": "image", "source": source})

    # 1. prefix
    prefix = open_ended["prefix"]
    add_text(prefix)
    lines.append(prefix)

    # 2. frames
    for frame_path in frames_input:
        offset = int(Path(frame_path).stem) - int(example["video"]["start"])
        label = f"{offset:04d}"
        add_text(f"Frame: {label}")
        add_image(frame_path)
        lines.append(f"Frame: {label}, {str(frame_path)}")

    # 3. parts
    parts_text = open_ended["parts"]
    add_text(parts_text)
    lines.append(parts_text)

    parts_path = instructions["parts"]
    add_image(parts_path)
    lines.append(f"{str(parts_path)}")

    # 4. instruction
    instruction_text = open_ended["instruction"].replace("{dot}", instructions["dot"])
    add_text(instruction_text)
    lines.append(instruction_text)

    # 5. main
    main_text = open_ended["main"].replace("{question}", example["question"])
    add_text(main_text)
    lines.append(main_text)

    content_text = "\n".join(lines) + "\n"
    return [{"role": "user", "content": content}], [["user", content_text]]


def format_input_tcot_answer_qwen(
    args, example, frames_input, template_components, instructions
):
    """
    Build the open-ended answering prompt in the ``text`` / ``image_url``
    format.

    Sections are emitted in this order: prefix, labelled frames, parts
    (text + parts image), instruction, main question. Each frame label is
    the integer file stem minus ``int(example["video"]["start"])``,
    zero-padded to four digits.

    Returns a tuple ``(messages, messages_text)``: a single user message
    and ``[["user", <plain-text rendering>]]``.
    """
    open_ended = template_components["answer"]["open-ended"]
    content, lines = [], []

    def add_text(text):
        content.append({"type": "text", "text": text})

    def add_image(path):
        data_url = f"data:image/jpeg;base64,{encode_image(path)}"
        content.append({"type": "image_url", "image_url": {"url": data_url}})

    # 1. prefix
    prefix = open_ended["prefix"]
    add_text(prefix)
    lines.append(prefix)

    # 2. frames
    for frame_path in frames_input:
        offset = int(Path(frame_path).stem) - int(example["video"]["start"])
        label = f"{offset:04d}"
        add_text(f"Frame: {label}")
        add_image(frame_path)
        lines.append(f"Frame: {label}, {str(frame_path)}")

    # 3. parts
    parts_text = open_ended["parts"]
    add_text(parts_text)
    lines.append(parts_text)

    parts_path = instructions["parts"]
    add_image(parts_path)
    lines.append(f"{str(parts_path)}")

    # 4. instruction
    instruction_text = open_ended["instruction"].replace("{dot}", instructions["dot"])
    add_text(instruction_text)
    lines.append(instruction_text)

    # 5. main
    main_text = open_ended["main"].replace("{question}", example["question"])
    add_text(main_text)
    lines.append(main_text)

    content_text = "\n".join(lines) + "\n"
    return [{"role": "user", "content": content}], [["user", content_text]]


def format_input_tcot_answer(
    args, example, frames_input, template_components, instructions
):
    outputs, outputs_text = None, ""
    files_uploaded = []
    match args.model_id:
        case x if x in GOOGLE_MODELS:
            outputs, outputs_text, files_uploaded = format_input_tcot_answer_google(
                args, example, frames_input, template_components, instructions
            )
        case x if x in OPENAI_MODELS:
            outputs, outputs_text = format_input_tcot_answer_openai(
                args, example, frames_input, template_components, instructions
            )
        case x if x in ANTHROPIC_MODELS:
            outputs, outputs_text = format_input_tcot_answer_anthropic(
                args, example, frames_input, template_components, instructions
            )
        case x if x in QWEN_MODELS + INTERNVL_MODELS + MIMO_MODELS:
            outputs, outputs_text = format_input_tcot_answer_qwen(
                args, example, frames_input, template_components, instructions
            )
        case _:
            logging.error(f"Undefined {args.model_id=} (format_input_tcot_selection)")

    return outputs, outputs_text, files_uploaded
