import os
import json
import base64
import random
import shortuuid
from openai import OpenAI
import argparse
from tqdm import tqdm
from llava.utils import disable_torch_init

# Category-specific prompt fragments for egocentric (camera-wearer) QA generation.
# Keys are the question categories ("action", "object", "numerical", "spatial");
# each value is spliced into the ego prompt via PROMPTS_PER_CATEGORIES_EGO[category].
# The values rely on implicit concatenation of adjacent string literals, so every
# sentence must carry its own trailing "\n" -- otherwise it runs into the next line.
PROMPTS_PER_CATEGORIES_EGO = {
    "action": (
        "\nInstructions:\n"
        "Each question must focus on my actions, body posture, or gestures.\n"
        "The answer must be a verb or verb phrase (e.g., writing, stretching, crossing arms).\n"
        "Do not generate QA pairs with overly generic answers like 'standing' or 'reaching'.\n"
        "Question Categories & Templates:\n"
        "Actions (What am I doing?)\n"
        "- What am I doing?\n"
        "- What am I doing with my [body part]?\n"
        "\n"
        "Body Posture (How am I positioned?)\n"
        "- How is my body positioned?\n"
        "- How am I sitting/standing/lying?\n"
        "- What is my posture?\n"
        "\n"
        "Gestures (What movement am I making?)\n"
        "- What am I doing with my hands?\n"
        "- What gesture am I making?\n"
        "- How am I moving my arms/legs/head?\n"
        "\nExamples:\n"
        "Q: How is my body positioned?\n"
        "A: Sitting cross-legged\n\n"
        "Q: What am I doing with my left hand?\n"
        "A: Holding a book\n\n"
        "Q: What gesture am I making?\n"
        "A: Waving\n\n"
    ),
    "object": (
        "\nInstructions:\n"
        "Each question must focus on identifying a specific object (e.g., 'mug cup', 'laptop') or describing an attribute of an object (e.g., 'navy blue', 'striped pattern') associated with me.\n"
        "The answer must be a noun or noun phrase, avoiding overly generic responses such as 'something' or 'object'.\n"
        "Question Categories & Templates:\n"
        "Object Identification (What am I interacting with?)\n"
        "- What am I holding?\n"
        "- What object is on the table beside me?\n"
        "- Which item am I picking up?\n"
        "\n"
        "Object Attributes (What does it look like?)\n"
        "- What color is the shirt I am wearing?\n"
        "- What pattern is on my jacket?\n"
        "- What type of shoes am I wearing?\n"
        "\n"
        "\nExamples:\n"
        "Q: What color is the shirt I am wearing?\n"
        "A: Navy blue\n\n"
        "Q: Which object am I holding in my right hand?\n"
        "A: A small notebook\n\n"
        "Q: What pattern does my sweater have?\n"
        "A: Checkered pattern\n\n"
    ),

    "numerical": (
        "\nInstructions:\n"
        "Each question must focus on numerical reasoning by counting or quantifying specific elements directly related to me.\n"
        "This may include the number of people, objects, or other countable items present in my surroundings.\n"
        "The answer must be a numerical value that accurately represents the count of the indicated elements.\n"
        "Do not generate questions about overly generic objects (e.g., items, objects).\n"
        "All numerical answers must be within the range of 0 to 5.\n"
        "Question Categories & Templates:\n"
        "Counting People (How many people are around me?)\n"
        "- How many people are in the image excluding me?\n"
        "- How many individuals are facing the same direction as I am?\n"
        "\n"
        "Counting Objects (How many things are near or with me?)\n"
        "- How many [objects] am I holding?\n"
        "- How many [items] are on the table beside me?\n"
        "\n"
        "Quantitative Comparisons (How do the numbers compare to what I have?)\n"
        "- How many more books are on my desk than on the shelf?\n"
        "- By how much does the number of items in my hands exceed the number on the table?\n"
        "\nExamples:\n"
        "Q: How many people are in the image excluding me?\n"
        "A: 3\n\n"
        "Q: How many more bowls are on my table compared to the table behind me?\n"
        "A: 2\n\n"
        "Q: How many apples am I holding?\n"
        "A: 3\n\n"
    ),

    "spatial": (
        "\nInstructions:\n"
        "Each question must focus on the spatial relationships between me and objects in my surroundings.\n"
        "The answer must be a specific object or location descriptor (e.g., coffee cup, bookshelf, under the table).\n"
        "Do not generate QA pairs with overly generic answers.\n"
        "Question Categories & Templates:\n"
        "Object Proximity (What is closest or farthest?):\n"
        "- What object is closest to me?\n"
        "- Which object is the farthest from me?\n"
        "- What is the nearest object to my [body part]?\n\n"
        "Relative Positioning (Where are objects located?)\n"
        "- What object is to my left/right/front/behind?\n"
        "- Which object is above/below me?\n"
        "Spatial Relations (How are objects arranged?)\n"
        "- Which object is between me and [another object]?\n"
        "\nExamples:\n"
        "Q: What object is closest to my left hand?\n"
        "A: Coffee cup\n\n"
        "Q: Which object is the farthest from me?\n"
        "A: Bookshelf\n\n"
        "Q: What object is on my right side?\n"
        "A: Tissue\n\n"
    ),
}

# Category-specific prompt fragments for exocentric (fixed external camera) QA generation.
# Keys are the question categories ("action", "object", "numerical", "spatial");
# each value is spliced into the exo prompt via PROMPTS_PER_CATEGORIES_EXO[category].
# The values rely on implicit concatenation of adjacent string literals, so every
# sentence/bullet must carry its own trailing "\n" -- otherwise it fuses with the next one.
PROMPTS_PER_CATEGORIES_EXO = {
    "action": (
        "\nInstructions:\n"
        "Each question must focus on the actions, body posture, or gestures within the scene.\n"
        "The answer must be a verb or verb phrase (e.g., writing, stretching, crossing arms).\n"
        "Do not generate QA pairs with overly generic answers like 'standing' or 'reaching'.\n"
        "Question Categories & Templates:\n"
        "Actions (What is the person doing?)\n"
        "- What is the [descriptive] person doing?\n"
        "- What is the [descriptive] person doing with their [body part]?\n"
        "Body Posture (How is the person positioned?)\n"
        "- How is the [descriptive] person positioned?\n"
        "- What is the posture of the [descriptive] person?\n"
        "Gestures (What movements is the person making?)\n"
        "- What kind of gesture is the [descriptive] person making?\n"
        "- How is the [descriptive] person moving their arms/legs/head?\n"
        "\nExamples:\n"
        "Q: What is the man sitting in the chair doing?\n"
        "A: Watching a phone\n\n"
        "Q: What is the posture of the person wearing a green shirt?\n"
        "A: Raising one arm\n\n"
        "Q: What is the woman in the black jacket doing with their right hand?\n"
        "A: Holding a book\n\n"
    ),

    "object": (
        "\nInstructions:\n"
        "Each question must focus on identifying a specific object in the scene (e.g., 'mug cup', 'laptop') or describing an attribute of an object (e.g., 'navy blue', 'striped pattern').\n"
        "Questions should reference people or objects by descriptors (e.g., 'the woman in the white top', 'the man with the striped shirt').\n"
        "The answer must be a noun or noun phrase, avoiding overly generic responses such as 'something' or 'object'.\n"
        "Question Categories & Templates:\n"
        "Object Identification (What is present?)\n"
        "- What is the man with the striped shirt holding?\n"
        "- What object is placed on the table?\n"
        "- Which item is the woman wearing blue top picking up?\n"
        "\n"
        "Object Attributes (What does it look like?)\n"
        "- What color is the shirt worn by the man wearing a cap?\n"
        "- What pattern is on the jacket worn by the woman carrying a handbag?\n"
        "- What type of shoes is the man standing near the window wearing?\n"
        "\n"
        "\nExamples:\n"
        "Q: What color is the top worn by the woman holding the towel?\n"
        "A: White\n\n"
        "Q: Which object is the man in the black shirt holding in his right hand?\n"
        "A: Smartphone\n\n"
        "Q: What pattern does the sweater worn by the person holding a cup have?\n"
        "A: Checkered pattern\n\n"
    ),

    "numerical": (
        "\nInstructions:\n"
        "Each question must focus on numerical reasoning by counting or quantifying specific elements within the scene.\n"
        "This may include the number of people, objects, or other countable items present in the image.\n"
        "The answer must be a numerical value that accurately represents the count of the indicated elements.\n"
        "Do not generate questions about overly generic objects (e.g., items, objects).\n"
        "All numerical answers must be within the range of 0 to 5.\n"
        "Question Categories & Templates:\n"
        "Counting People (How many are there?)\n"
        "- How many people are in the scene?\n"
        "- How many individuals are facing the camera?\n"
        "\n"
        "Counting Objects (How many things are visible?)\n"
        "- How many objects is [person descriptor] holding?\n"
        "- How many items are on the table?\n"
        "\n"
        "Quantitative Comparisons (How do the numbers compare?)\n"
        "- How many more books are on the table than on the shelf?\n"
        "- By how much does the number of items in the man's hands exceed the number on the table?\n"
        "\nExamples:\n"
        "Q: How many people are in the scene?\n"
        "A: 3\n\n"
        "Q: How many objects is the woman in the striped shirt holding?\n"
        "A: 2\n\n"
        "Q: How many oranges are placed on the table?\n"
        "A: 5\n\n"
    ),
    "spatial": (
        "\nInstructions:\n"
        "Each question must explicitly reference an object's or a person's spatial relationship within the scene.\n"
        "The answer must be a specific object or location descriptor (e.g., scissors, frying pan, under the table).\n"
        "Do not generate QA pairs with overly generic answers.\n\n"
        "Question Categories & Templates:\n"
        "Object Proximity (What is closest or farthest?)\n"
        "- Which object is closest to the person wearing [specific item]?\n"
        "- Which object is the farthest from [reference point]?\n"
        "- What is the nearest object to [specific location or object]?\n\n"
        "Relative Positioning (Where are objects located?)\n"
        "- What object is to the left/right/front/behind of the man with [specific item]?\n"
        "- What object is to the left/right/front/behind [reference object]?\n"
        "- Which object is positioned above/below [reference object]?\n"
        "Spatial Relations (How are objects arranged?)\n"
        "- Which object is positioned between [object A] and [object B]?\n"
        "- What item is placed underneath/inside [object]?\n"
        "- Which object is located between the two people sitting on the bench?\n"
        "\nExamples:\n"
        "Q: What is the object on the far right of the desk?\n"
        "A: Scissors\n\n"
        # The examples are the format the model is told to imitate, so keep the
        # punctuation clean (a doubled "??" here would be copied into outputs).
        "Q: Which cookware is closest to the woman wearing a striped shirt?\n"
        "A: Frying pan\n\n"
        "Q: What object is placed directly in front of the man wearing a cap?\n"
        "A: Backpack\n\n"
        "Q: What object is placed underneath the table?\n"
        "A: Storage box\n\n"
    ),

}

# Short per-category answer-format instructions for the ego perspective, used
# when answering an already generated question (looked up by category key).
# Each value ends with one "Q:/A:" example that demonstrates the output format.
# NOTE: the section header must be written "\nOutput format"; "\O" is not a valid
# escape sequence and would leave a literal backslash in the prompt.
PROMPTS_PER_CATEGORIES_EGO_2 = {
    "action": (
        "\nInstructions:\n"
        "The answer must be a verb or verb phrase (e.g., writing, stretching, crossing arms).\n"
        "Do not generate overly generic answers like 'standing' or 'reaching'."
        "\nOutput format:\n"
        "Q: How is my body positioned?\n"
        "A: Sitting cross-legged\n\n"
    ),
    "object": (
        "\nInstructions:\n"
        "The answer must be a noun or noun phrase, avoiding overly generic responses such as 'something' or 'object'.\n"
        "\nOutput format:\n"
        "Q: What color is the shirt I am wearing?\n"
        "A: Navy blue\n\n"
    ),

    "numerical": (
        "\nInstructions:\n"
        "The answer must be a numerical value that accurately represents the count of the indicated elements.\n"
        "All numerical answers must be within the range of 0 to 5.\n"
        "\nOutput format:\n"
        "Q: How many people are in the image excluding me?\n"
        "A: 3\n\n"
    ),

    "spatial": (
        "\nInstructions:\n"
        "The answer must be a specific object or location descriptor (e.g., coffee cup, bookshelf, under the table).\n"
        "Do not generate overly generic answers."
        "\nOutput format:\n"
        "Q: What object is closest to my left hand?\n"
        "A: Coffee cup\n\n"
    ),
}

# Short per-category answer-format instructions for the exo perspective, used
# when answering an already generated question (looked up by category key).
# Each value ends with one "Q:/A:" example that demonstrates the output format.
# NOTE: the section header must be written "\nOutput format"; "\O" is not a valid
# escape sequence and would leave a literal backslash in the prompt.
PROMPTS_PER_CATEGORIES_EXO_2 = {
    "action": (
        "\nInstructions:\n"
        "The answer must be a verb or verb phrase (e.g., writing, stretching, crossing arms).\n"
        "Avoid overly generic answers like 'standing' or 'reaching'.\n\n"
        "\nOutput format:\n"
        "Q: What is the man sitting in the chair doing?\n"
        "A: Watching a phone\n\n"
    ),
    "object": (
        "\nInstructions:\n"
        "The answer must be a noun or noun phrase, avoiding overly generic responses such as 'something' or 'object'.\n"
        "\nOutput format:\n"
        "Q: What color is the top worn by the woman holding the towel?\n"
        "A: White\n\n"
    ),
    "numerical": (
        "\nInstructions:\n"
        "The answer must be a numerical value that accurately represents the count of the indicated elements.\n"
        "All numerical answers must be within the range of 0 to 5.\n"
        "\nOutput format:\n"
        "Q: How many people are in the scene?\n"
        "A: 3\n\n"
    ),
    "spatial": (
        "\nInstructions:\n"
        "The answer must be a specific object or location descriptor (e.g., scissors, frying pan, under the table).\n"
        "Do not generate with overly generic answers.\n\n"
        "\nOutput format:\n"
        "Q: What is the object on the far right of the desk?\n"
        "A: Scissors\n\n"
    ),
}


def output_parser(out):
    """Extract the answer text from a chat-completion response.

    Args:
        out: A chat-completion response object; only
            ``out.choices[0].message.content`` is read.

    Returns:
        str: The stripped text after the ``"A:"`` marker when the content
        contains exactly one such marker. When the marker is missing or
        appears more than once the raw content is returned unchanged, since
        the answer cannot be isolated unambiguously. An empty string is
        returned when the response carries no text content (``None``).
    """
    content = out.choices[0].message.content
    if content is None:
        # No text payload at all; avoid a TypeError on the membership test.
        return ""

    pieces = content.split("A:")
    if len(pieces) != 2:
        # Zero markers -> one piece; several markers -> more than two pieces.
        return content

    return pieces[1].strip()


def client_generation(key=None):
    """Create an OpenAI client.

    Args:
        key: API key to authenticate with. When ``None``, the value of the
            ``OPENAI_API_KEY`` environment variable is used; if that variable
            is unset the key falls back to an empty string.

    Returns:
        An ``OpenAI`` client constructed with the resolved key.
    """
    if key is None:
        # Read the credential from the environment instead of hard-coding it
        # in the source file.
        key = os.environ.get("OPENAI_API_KEY", "")
    return OpenAI(api_key=key)


def encode_image(image_path):
    """Return the contents of the file at *image_path* as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def _single_qa_prompt(target_perspective, category):
    """Assemble the QA-generation prompt for one perspective and category."""
    if target_perspective == "ego":
        return (
            "You are given the visual input from the camera worn by the user (referred to as 'I').\n"
            "Based on this visual input, generate three question-answer pairs.\n"
            "Ensure that the generated question-answer pairs are directly based on the visual input.\n"
            f"{PROMPTS_PER_CATEGORIES_EGO[category]}"
            "Requirements:\n"
            "Each question must explicitly include the pronoun 'I' or 'me' to ensure the focus remains on the user.\n"
            "Each answer should be a single word or a short phrase.\n"
            "Ensure that all three question-answer pairs meet these criteria and are relevant to the visual input.\n"
            "Strictly adhere to the format of the provided examples."
        )
    return (
        "You are given with the visual input from a fixed-position camera capturing a scene.\n"
        # The trailing "\n" keeps this sentence from fusing with the next one.
        "Based on this visual input, generate three question-answer pairs.\n"
        "Ensure that the generated question-answer pairs are directly based on the visual input.\n"
        f"{PROMPTS_PER_CATEGORIES_EXO[category]}"
        "Requirements:\n"
        "Each answer should be a single word or a short phrase.\n"
        "Ensure that all three question-answer pairs meet these criteria and are relevant to the visual input.\n"
        "Strictly adhere to the format of the provided examples."
    )


def _parse_qa_pairs(text):
    """Split model output into a list of (question, answer) tuples.

    The text is cut at every "Q:" marker and a chunk is kept only when it
    contains exactly one "A:" marker; every other chunk is dropped. Text that
    is empty/None or has no "Q:" marker at all yields no pairs.
    """
    if not text or "Q:" not in text:
        return []
    pairs = []
    for chunk in text.split("Q:"):
        parts = chunk.split("A:")
        if len(parts) != 2:
            # No answer marker, or an ambiguous chunk with several of them.
            continue
        pairs.append((parts[0].strip(), parts[1].strip()))
    return pairs


def single_qa_generation(args):
    """Generate question-answer pairs from sampled frames and append them to a JSONL file.

    For every clip folder, one camera sub-folder of ``frame_aligned_videos`` is
    chosen (ego: the first folder whose name contains '214'; exo: a random
    folder whose name contains 'cam' or 'gp'), up to ``args.frames`` of its
    ``.jpg`` frames are sampled, and the model is asked for QA pairs per frame.
    Each parsed pair is appended as one JSON line to
    ``<args.output_path>/single_qa_gen/<category>_<perspective>.jsonl``.

    Args:
        args: Namespace providing ``target_perspective`` ("ego" or "exo"),
            ``category``, ``image_folder``, ``output_path``, ``frames`` and
            ``model``.

    Raises:
        NotImplementedError: If ``args.target_perspective`` is neither "ego"
            nor "exo" (raised while processing the first clip).
    """
    disable_torch_init()

    # Clip folder names (relative to args.image_folder) to process. This is an
    # empty placeholder in the source and has to be filled in before running.
    clip_names = []  # Video clip name
    client = client_generation()
    target_perspective = args.target_perspective
    # Fixed seed so camera choice and frame sampling are reproducible.
    random.seed(2025)

    for clip_name in tqdm(clip_names, desc="Processing Folders"):
        if target_perspective not in ("ego", "exo"):
            raise NotImplementedError

        cams_root = os.path.join(args.image_folder, clip_name, "frame_aligned_videos")
        cam_dirs = os.listdir(cams_root)
        if target_perspective == "ego":
            cam = [item for item in cam_dirs if '214' in item][0]
        else:
            cam = random.choice([item for item in cam_dirs if 'cam' in item or 'gp' in item])
        frame_dir = os.path.join(cams_root, cam)

        category = args.category  # action, object, numerical, spatial
        output_path = os.path.join(args.output_path, "single_qa_gen",
                                   f"{category}_{target_perspective}.jsonl")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        prompt = _single_qa_prompt(target_perspective, category)

        # Append mode: all clips of one run (and earlier runs) share the file.
        with open(output_path, "a") as out_handle:
            frame_names = sorted(f for f in os.listdir(frame_dir) if f.endswith(".jpg"))
            num_frames = len(frame_names)
            if num_frames < args.frames:
                selected_indices = list(range(num_frames))
            else:
                selected_indices = random.sample(range(num_frames), args.frames)
            selected_indices.sort()
            print(selected_indices)

            for frame_idx in selected_indices:
                frame_name = frame_names[frame_idx]
                image = encode_image(os.path.join(frame_dir, frame_name))

                response = client.chat.completions.create(  # single Turn
                    model=args.model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{image}",
                                        "detail": "high"
                                    },
                                },
                                {"type": "text", "text": prompt},
                            ],
                        }
                    ],
                    stream=False,
                    temperature=0,
                    max_tokens=4096,
                    # seed=2024
                )

                for question, answer in _parse_qa_pairs(response.choices[0].message.content):
                    out_handle.write(json.dumps({
                        "question": question,
                        "answer_init": answer,
                        "image_folder": clip_name,
                        "cam": cam,
                        "perspective": target_perspective,
                        "frame": frame_name,
                        "id": shortuuid.uuid()
                    }) + "\n")

def _image_content(encoded_image):
    """Wrap a base64-encoded JPEG as a high-detail ``image_url`` content part."""
    return {
        "type": "image_url",
        "image_url": {
            # In a data URL the payload follows the comma directly (no space).
            "url": f"data:image/jpeg;base64,{encoded_image}",
            "detail": "high",
        },
    }


def _chat_once(client, args, text, encoded_images=()):
    """Send one single-turn user message: optional images followed by the text."""
    content = [_image_content(img) for img in encoded_images]
    content.append({"type": "text", "text": text})
    return client.chat.completions.create(
        model=args.model,
        messages=[{"role": "user", "content": content}],
        stream=False,
        temperature=0,
        max_tokens=args.max_tokens,
    )


def _same_meaning(client, args, question, answer, label):
    """Ask the model whether *answer* matches *label*; True if the reply contains 'yes'."""
    judge_prompt = (
        f"Here is the question: '{question}'. The provided answer is '{answer}', and the given label is '{label}'.\n"
        f"Do they convey the same meaning based on the question? Respond with a single word or phrase.")
    reply = _chat_once(client, args, judge_prompt).choices[0].message.content
    # A reply without text content is treated as "not the same".
    return "yes" in (reply or "").lower()


def multi_answer_generation_with_filtering(args):
    """Answer each generated question under several input conditions and flag easy ones.

    Reads ``<args.output_path>/single_qa_gen/<category>_<perspective>.jsonl``
    and writes one JSON line per question to
    ``<args.output_path>/filtered_out/<category>_<perspective>.jsonl``.

    For every question the checks run in this order and stop at the first
    match, which sets ``filter_out`` to True:

    1. Answer from both views; compare with the initial answer.
    2. Answer from the question's own view; compare the both-view answer with it.
    3. Answer from text only; compare with the own-view answer.

    If none matches, the other view is answered as well and ``filter_out`` is
    False. Answers that were never requested are stored as empty strings.

    Args:
        args: Namespace providing ``output_path``, ``category``,
            ``target_perspective``, ``image_folder``, ``model`` and
            ``max_tokens``.

    Raises:
        NotImplementedError: If a record's "perspective" is neither "ego" nor "exo".
    """
    client = client_generation()
    file_name = f"{args.category}_{args.target_perspective}.jsonl"
    single_qa_path = os.path.join(args.output_path, "single_qa_gen", file_name)
    output_path = os.path.join(args.output_path, "filtered_out", file_name)

    with open(single_qa_path) as qa_file:
        questions = [json.loads(q) for q in qa_file]

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # The context manager guarantees rows already written are flushed and the
    # handle is closed even if a request fails part-way through.
    with open(output_path, "w") as out_file:
        for line in tqdm(questions, total=len(questions)):
            image_path = os.path.join(args.image_folder, line["image_folder"], "frame_aligned_videos")
            dir_list = os.listdir(image_path)

            target = line["perspective"]
            if target == "ego":
                # Pair the recorded ego camera with a random external camera.
                cam_exo = random.choice([item for item in dir_list if 'cam' in item or 'gp' in item])
                cam_ego = line["cam"]
                prompt = PROMPTS_PER_CATEGORIES_EGO_2[args.category]
                other = "exo"
            elif target == "exo":
                # Pair the recorded external camera with the '214' (ego) folder.
                cam_ego = [item for item in dir_list if '214' in item][0]
                cam_exo = line["cam"]
                prompt = PROMPTS_PER_CATEGORIES_EXO_2[args.category]
                other = "ego"
            else:
                raise NotImplementedError

            images = {
                "ego": encode_image(os.path.join(image_path, cam_ego, line["frame"])),
                "exo": encode_image(os.path.join(image_path, cam_exo, line["frame"])),
            }

            qs = f"{line['question']}\n"
            qs_text = (
                "Based on the question, generate the best possible answer.\n"
                f"{prompt}"
                "Requirements:\n"
                "Each answer should be a single word or a short phrase.\n"
                "Follow the provided format strictly.\n\n"
                "Q: "
            ) + qs
            qs_ego = (
                "You are given a visual input from a camera worn by the user (referred to as 'I') along with a corresponding question.\n"
                "Based on the visual input, generate the best possible answer.\n"
                f"{prompt}"
                "Requirements:\n"
                "Each answer options should be a single word or a short phrase.\n"
                "Follow the provided format strictly.\n\n"
                "Q: "
            ) + qs
            qs_exo = (
                "You are given a visual input from a fixed-position camera capturing a scene along with a corresponding question.\n"
                "Based on the visual input, generate the best possible answer.\n"
                f"{prompt}"
                "Requirements:\n"
                "Each answer should be a single word or a short phrase.\n"
                "Follow the provided format strictly.\n\n"
                "Q: "
            ) + qs
            qs_both = (
                "You are provided with two visual inputs in sequence, each captured from a different perspective:\n"
                "1.The view from the camera worn by the user ('I').\n"
                "2.The view captured by an external camera observing the user ('I').\n"
                "These two images capture the same event at the same time.\n"
                "Based on the visual inputs, generate the best possible answer.\n"
                f"{prompt}"
                "Requirements:\n"
                "Each answer should be a single word or a short phrase.\n"
                "Follow the provided format strictly.\n\n"
                "Q: "
            ) + qs
            view_prompts = {"ego": qs_ego, "exo": qs_exo}

            answers = {"ego": "", "exo": ""}
            answer_text = ""

            # Step 1: both views (ego image first, then exo) vs. the initial answer.
            answer_both = output_parser(
                _chat_once(client, args, qs_both, (images["ego"], images["exo"])))
            filter_out = _same_meaning(client, args, qs, answer_both, line["answer_init"])

            if not filter_out:
                # Step 2: the question's own view vs. the both-view answer.
                answers[target] = output_parser(
                    _chat_once(client, args, view_prompts[target], (images[target],)))
                filter_out = _same_meaning(client, args, qs, answer_both, answers[target])

            if not filter_out:
                # Step 3: text-only answer vs. the own-view answer.
                answer_text = output_parser(_chat_once(client, args, qs_text))
                filter_out = _same_meaning(client, args, qs, answer_text, answers[target])

            if not filter_out:
                # Question survived every check: also record the other view's answer.
                answers[other] = output_parser(
                    _chat_once(client, args, view_prompts[other], (images[other],)))

            out_file.write(json.dumps({
                "frame": line["frame"],
                "perspective": line["perspective"],
                "question": line["question"],
                "answer_init": line["answer_init"],
                "answer_ego": answers["ego"],
                "answer_exo": answers["exo"],
                "answer_both": answer_both,
                "answer_text": answer_text,
                "filter_out": filter_out,
                "image_folder": line["image_folder"],
                "cam_ego": cam_ego,
                "cam_exo": cam_exo,
                "id": line["id"]
            }) + "\n")



def _build_option_prompt(intro, question, answer):
    """Return the option-generation prompt for one camera view.

    Args:
        intro: First sentence of the prompt describing the camera view
            (without the trailing newline).
        question: Question text, inserted verbatim after ``Question: ``.
        answer: Reference answer, inserted verbatim after ``Answer: ``.
    """
    return (
        f"{intro}\n"
        "Based on the following question and answer, generate four multiple-choice options.\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n"
        "\n"
        "Ensure that each incorrect option is closely related to the visual content, making it challenging to easily identify the correct answer.\n"
        "Follow the format below exactly:\n"
        "\n"
        "Options:\n"
        "[Option1]\n"
        "[Option2]\n"
        "[Option3]\n"
        "[Option4]\n"
    )


def _request_options(client, model, image_b64, prompt, max_tokens):
    """Send one image plus prompt to the chat model and return the reply text."""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64, {image_b64}",
                            "detail": "high"
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        stream=False,
        temperature=0,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content


def option_generation(args):
    """Generate multiple-choice options for every QA pair that was not filtered out.

    Reads ``<output_path>/filtered_out/<category>_<target_perspective>.jsonl``,
    skips records whose ``filter_out`` flag is truthy, and for each remaining
    record asks the model twice (once with the ego frame and ``answer_ego``,
    once with the exo frame and ``answer_exo``) for four options. The raw
    model replies are written, together with the original record fields, to
    ``<output_path>/option_gen/<category>_<target_perspective>.jsonl`` as one
    JSON object per line.

    Args:
        args: Parsed command-line namespace. This function reads
            ``output_path``, ``category``, ``target_perspective``,
            ``image_folder``, ``model`` and ``max_tokens``.
    """
    client = client_generation()
    file_name = f"{args.category}_{args.target_perspective}.jsonl"
    filtered_qa_path = os.path.join(args.output_path, "filtered_out", file_name)
    output_path = os.path.join(args.output_path, "option_gen", file_name)

    # Load all records up front and release the input handle immediately.
    with open(filtered_qa_path) as in_file:
        questions = [json.loads(q) for q in in_file]

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # The context manager guarantees the output is flushed and closed even if
    # an API call or image read fails part-way through the loop.
    with open(output_path, "w") as out_file:
        for line in tqdm(questions, total=len(questions)):
            if line["filter_out"]:
                continue

            image_path = os.path.join(args.image_folder, line["image_folder"], "frame_aligned_videos")
            image_path_ego = os.path.join(image_path, line["cam_ego"], line["frame"])
            image_path_exo = os.path.join(image_path, line["cam_exo"], line["frame"])

            ego_image = encode_image(image_path_ego)
            exo_image = encode_image(image_path_exo)

            # The trailing newline is part of the prompt text sent to the model.
            qs = f"{line['question']}\n"

            option_prompt_ego = _build_option_prompt(
                "You are given a visual input from a camera worn by the user (referred to as 'I').",
                qs,
                line["answer_ego"],
            )
            option_prompt_exo = _build_option_prompt(
                "You are given a visual input from a fixed-position camera capturing a scene.",
                qs,
                line["answer_exo"],
            )

            out_ego = _request_options(
                client, args.model, ego_image, option_prompt_ego, args.max_tokens
            )
            out_exo = _request_options(
                client, args.model, exo_image, option_prompt_exo, args.max_tokens
            )

            out_file.write(json.dumps({
                "frame": line["frame"],
                "perspective": line["perspective"],
                "question": line["question"],
                "options_ego": out_ego,
                "options_exo": out_exo,
                "answer_init": line["answer_init"],
                "answer_ego": line["answer_ego"],
                "answer_exo": line["answer_exo"],
                "answer_both": line["answer_both"],
                "answer_text": line["answer_text"],
                "image_folder": line["image_folder"],
                "cam_ego": line["cam_ego"],
                "cam_exo": line["cam_exo"],
                "id": line["id"]
            }) + "\n")




if __name__ == "__main__":
    # Command-line options, declared as (flag, type, extra add_argument kwargs).
    _cli_options = (
        ("--model", str, {"default": "gpt-4o-2024-08-06"}),
        ("--image-folder", str, {}),
        ("--output-path", str, {}),
        ("--target-perspective", str, {"default": None}),
        ("--category", str, {"default": None}),
        ("--frames", int, {"default": 8}),
        ("--max-tokens", int, {"default": 4096}),
    )

    parser = argparse.ArgumentParser()
    for _flag, _arg_type, _extra in _cli_options:
        parser.add_argument(_flag, type=_arg_type, **_extra)
    args = parser.parse_args()

    # Run the pipeline stages in order; each stage receives the same
    # parsed arguments.
    _stages = (
        single_qa_generation,
        multi_answer_generation_with_filtering,
        option_generation,
    )
    for _stage in _stages:
        _stage(args)
