import json
import uuid
import mimetypes
import os

import PIL


def encode_image(image_path):
    """Return the base64 text of an image given a local path or an HTTP(S) URL.

    For a URL, the image is first downloaded into a ``downloads`` directory
    (relative to the current working directory, created on demand) under a
    random UUID-based file name; that saved copy is then encoded.

    Args:
        image_path: Local file path, or a URL starting with ``http://`` or
            ``https://``.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the local file or the download target cannot be opened.
    """
    # Match the URL scheme rather than the bare letters "http", so a local
    # file such as "http_cat.png" is read from disk instead of being fetched.
    if image_path.startswith(("http://", "https://")):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,
            # Without a timeout a stalled server would block the caller forever.
            "timeout": 30,
        }

        # The context manager releases the streamed connection even when
        # the status check or the write to disk fails.
        with requests.get(image_path, **request_kwargs) as response:
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")

            # Drop parameters such as "; charset=utf-8": guess_extension()
            # only recognises the bare "type/subtype" part.
            mime_type = content_type.split(";", 1)[0].strip()
            extension = mimetypes.guess_extension(mime_type)
            if extension is None:
                extension = ".download"

            # Create the target directory on first use; opening the file
            # for writing does not create missing parent directories.
            download_dir = os.path.abspath("downloads")
            os.makedirs(download_dir, exist_ok=True)
            download_path = os.path.join(download_dir, str(uuid.uuid4()) + extension)

            with open(download_path, "wb") as fh:
                for chunk in response.iter_content(chunk_size=8192):
                    fh.write(chunk)

        image_path = download_path

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


# Default HTTP headers for JSON API requests. The bearer token is read from
# the OPENAI_API_KEY environment variable once, when this module is imported.
headers = {"Content-Type": "application/json"}
headers["Authorization"] = "Bearer " + str(os.getenv("OPENAI_API_KEY"))

import base64
import requests
from PIL import Image
from io import BytesIO


def pil_image_to_base64(image):
    """Serialise ``image`` as PNG in memory and return it as base64 text.

    Args:
        image: Object exposing ``save(fp, format=...)``, such as a PIL image.

    Returns:
        The PNG bytes, base64-encoded and decoded to a ``str``.
    """
    with BytesIO() as png_buffer:
        image.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")


def resize_image(image_path):
    """Write a half-size copy of an image next to the original.

    The copy is saved in the same directory as ``image_path`` under the name
    ``resized_<original file name>``.

    Args:
        image_path: Path of the image file to shrink.

    Returns:
        Path of the newly written, resized image.
    """
    # Prefix only the file name. Prefixing the whole path would turn
    # "dir/a.png" into "resized_dir/a.png", a directory that does not exist.
    # For a bare file name the result is unchanged ("resized_a.png").
    directory, filename = os.path.split(image_path)
    new_image_path = os.path.join(directory, f"resized_{filename}")

    # The context manager closes the source file once the copy is written.
    with Image.open(image_path) as img:
        width, height = img.size
        # Clamp to 1 so a 1-pixel side does not shrink to an invalid 0.
        half_size = (max(1, width // 2), max(1, height // 2))
        img.resize(half_size).save(new_image_path)
    return new_image_path


class Conversation:
    """Build chat message lists for a four-choice question about an image.

    ``image_type`` selects how the image is referenced in the user message:

    * ``'base64'``: an ``image_url`` part holding a base64 data URL.
    * ``'path'``: an ``image`` part holding a ``file://`` path.
    * ``'placeholder'``: plain text that starts with the placeholder token.
    """

    # Leading whitespace carried by every line of the question text in the
    # 'base64' and 'path' layouts; kept so the emitted prompt is unchanged.
    _TEXT_INDENT = " " * 12
    # Whitespace-only line between the placeholder token and the question in
    # the 'placeholder' layout; kept for the same reason.
    _PLACEHOLDER_GAP = " " * 16
    # Identity order, used when the caller does not shuffle the options.
    _DEFAULT_PERMUTATION = (0, 1, 2, 3)

    def __init__(self,
                 image_type,
                 system_prompt_available,
                 system_content=None,
                 placeholder='<image>'):
        """Store the layout settings.

        Args:
            image_type: ``'base64'``, ``'path'`` or ``'placeholder'``.
            system_prompt_available: When falsy, the system content is sent
                with the ``"user"`` role instead of ``"system"``.
            system_content: Content of the first message.
            placeholder: Token that stands for the image in the
                ``'placeholder'`` layout.
        """
        self.system_prompt_available = system_prompt_available
        self.system_content = system_content
        self.image_type = image_type
        self.PLACE_HOLDER = placeholder

    def _question_lines(self, sample, permutation):
        """Return the question, the four lettered choices and the answer cue."""
        lines = [f"Question:{sample['question']}", "Choices:"]
        options = sample['options']
        lines.extend(
            f"{letter}. {options[permutation[position]]}"
            for position, letter in enumerate("ABCD")
        )
        lines.append("Your answer:")
        return lines

    def _indented_question_text(self, sample, permutation):
        """Return the question text used by the 'base64' and 'path' layouts."""
        return "".join(
            "\n" + self._TEXT_INDENT + line
            for line in self._question_lines(sample, permutation)
        )

    def format(self,
               sample,
               image_path=None,
               permutation=None):
        """Build the message list for one sample.

        Args:
            sample: Mapping with a ``'question'`` entry and an ``'options'``
                sequence of at least four entries.
            image_path: Path (or URL, for ``'base64'``) of the image.
                Required for every layout.
            permutation: Indexable of four positions into
                ``sample['options']`` giving the options shown as A-D.
                Defaults to the original order.

        Returns:
            A two-element list: the system message followed by the user
            message.

        Raises:
            ValueError: If ``image_path`` is ``None`` or ``image_type`` is
                not one of the supported layouts.
        """
        if image_path is None:
            raise ValueError("No image path provided.")
        if permutation is None:
            permutation = self._DEFAULT_PERMUTATION

        system_role = "system" if self.system_prompt_available else "user"
        system_message = {"role": system_role, "content": self.system_content}

        if self.image_type == 'base64':
            image_part = {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encode_image(image_path)}"},
            }
            text_part = {
                "type": "text",
                "text": self._indented_question_text(sample, permutation),
            }
            return [system_message, {"role": "user", "content": [image_part, text_part]}]

        if self.image_type == 'path':
            image_part = {"type": "image", "image": "file://" + image_path}
            text_part = {
                "type": "text",
                "text": self._indented_question_text(sample, permutation),
            }
            return [system_message, {"role": "user", "content": [image_part, text_part]}]

        if self.image_type == 'placeholder':
            question_text = "\n".join(self._question_lines(sample, permutation))
            user_text = f"\n{self.PLACE_HOLDER}\n{self._PLACEHOLDER_GAP}\n{question_text}"
            return [system_message, {"role": "user", "content": user_text}]

        # Fail loudly rather than returning None, which would only surface
        # later as a confusing error in whatever consumes the messages.
        raise ValueError(
            f"Unsupported image_type {self.image_type!r}; "
            "expected 'base64', 'path' or 'placeholder'."
        )

# Brief chain-of-thought system prompt: asks for a short discussion followed
# by the final choice wrapped in <answer></answer>, with one teapot example.
# NOTE(review): the example reply says the handle is "on the side" (choice B)
# but tags <answer>C</answer>. Confirm whether this mismatch is intended
# before editing; the text is left untouched because it is a versioned prompt.
prompt_cot_instruction_v0 = """
You are a helpful agent.Here is an image with a multiple choice question about the image content. You should reply the question according to the image faithfully. Please note that the question maybe confusing or the image content might be uncommon, You should thinking briefly first and you **MUST give your final choose with <answer></answer>**.
You should follow the format below STRICTLY
format:  Think first, give your discussion about the question and the image BRIEFLY. Then summarize: The final answer is <answer>[A/B/C/D]</answer>.
Here is an example:
#########
[IMAGE]
Question:Does the Teapot in the picture have a handle? If so, where is it located?
Choices:
A. Not visible / Can't see.
B. Yes, on the side.
C. Yes, arched over the top.
D. The correct answer is not listed.

Your reply:
"From the image I can see the handle on the side clearly, so the answer is <answer>C</answer>."
#########
Now please answer the question following the above format STRICTLY.
"""

# Variant of the brief chain-of-thought prompt whose example reply is a longer
# walkthrough ending in <answer>B</answer>. It asks explicitly for the letter
# (A/B/C/D) inside the answer tags and has no closing "Now please answer" line.
prompt_cot_instruction_v0_long_response = """
You are a helpful agent.Here is an image with a multiple choice question about the image content. You should reply the question according to the image faithfully. Please note that the question maybe confusing or the image content might be uncommon, You should thinking briefly first and you **MUST give your final choose with the letter(A/B/C/D) in <answer></answer>**.
You should follow the format below.
format:  Think first, give your discussion about the question and the image content. Then summarize: The final answer is <answer>[A/B/C/D]</answer>.
Here is an example:
#########
[IMAGE]
Question:Does the Teapot in the picture have a handle? If so, where is it located?
Choices:
A. Not visible / Can't see.
B. Yes, on the side.
C. Yes, arched over the top.
D. The correct answer is not listed.

Your reply:
I'll look at the overall structure - I can see it's a white/ceramic teapot. Next, I need to locate the handle. Looking at the left side of the teapot, I don't see a handle there. Checking the right side... I can see what appears to be a curved handle attached to the side. Based on this systematic examination, the handle is located on the side of the teapot. Based on above analysis, I will stop thinking, and the final answer is <answer>B</answer>.
#########
"""

# Variant of the brief chain-of-thought prompt whose example reply wraps the
# reasoning in ◁think▷ ... ◁/think▷ markers and then states the bare letter.
# NOTE(review): the instructions demand <answer></answer> tags, yet the
# example reply contains none; it also answers C while describing the handle
# as "on the side" (choice B). Confirm both are intended before editing.
prompt_cot_instruction_v0_kimi = """
You are a helpful agent.Here is an image with a multiple choice question about the image content. You should reply the question according to the image faithfully. Please note that the question maybe confusing or the image content might be uncommon, You should thinking briefly first and you **MUST give your final choose with <answer></answer>**.
You should follow the format below STRICTLY
format:  Think first, give your discussion about the question and the image BRIEFLY. Then summarize: The final answer is <answer>[A/B/C/D]</answer>.
Here is an example:
#########
[IMAGE]
Question:Does the Teapot in the picture have a handle? If so, where is it located?
Choices:
A. Not visible / Can't see.
B. Yes, on the side.
C. Yes, arched over the top.
D. The correct answer is not listed.

Your reply:
"◁think▷From the image I can see the handle on the side clearly, so the answer is C◁/think▷C."
#########
Now please answer the question following the above format.
"""

# Structured chain-of-thought prompt: the reply must contain four labelled
# sections (Image Observation, Question Analysis, Step-by-step Reasoning,
# Final Decision) and end with the choice in <answer></answer>. The worked
# teapot example answers B.
# NOTE(review): the "**MUST" bold marker in the first paragraph is never
# closed; left as-is because the text is part of the prompt.
prompt_cot_instruction_v1 = """
You are a helpful agent. Here is an image with a multiple choice question. The image content might be uncommon or the question might be confusing, so you should analyze the image systematically and provide step-by-step reasoning. Moreover, take time to examine details carefully. Finally, you **MUST give your final choose with <answer></answer>.

You should follow the FORMAT below:
Image Observation: [Your observation]
Question Analysis: [Break down the question]  
Step-by-step Reasoning: [Analyze each option A, B, C, D]
Final Decision: The answer is <answer>[A/B/C/D]</answer>

#####################
Example:
[IMAGE of teapot]
Question: Does the Teapot in the picture have a handle? If so, where is it located?
Choices:
A. Not visible / Can't see.
B. Yes, on the side.
C. Yes, arched over the top.
D. The correct answer is not listed.

Your reply:
"Image Observation: I see a white ceramic teapot with a rounded body and spout extending from one side.
Question Analysis: I need to determine if there's a handle and identify its specific location.
Step-by-step Reasoning: 
- Option A: Let me check if I can see a handle... I can clearly see one, so this is incorrect
- Option B: Looking at the sides of the teapot, I see a curved handle attached to the right side
- Option C: Checking the top area, I don't see any handle arching over the top
- Option D: Since I found the handle on the side, one of the listed answers is correct
Final Decision: The handle is clearly visible on the side of the teapot. The answer is <answer>B</answer>."
#####################
"""

# Example-free chain-of-thought prompt: asks for step-by-step reasoning and
# repeats that the final choice letter must be given inside <answer></answer>.
prompt_cot_instruction_v2 = """
You are a helpful agent. Here is an image with a multiple choice question. The image content might be uncommon or the question might be confusing, so you should analyze the image systematically and provide step-by-step reasoning. Moreover, take time to examine details carefully. Finally, you **MUST** give your final choose with <answer></answer>.
Remember that you should think step by step. Take time to examine details carefully. But when you come to the final answer, please provide your choose with the character(A/B/C/D) in <answer></answer>!
Most IMPORTANTLY: finally provide your choice in <answer></answer>! For example: <answer>A</answer> <answer>B</answer> <answer>C</answer> <answer>D</answer>. 
"""

# Step-by-step chain-of-thought prompt with a worked teapot example: the
# instructions stress "THINK STEP BY STEP" and the <answer></answer> tags,
# and the example reply is a longer walkthrough ending in <answer>B</answer>.
prompt_cot_instruction_v3 = """
You are a helpful agent. Here is an image with a multiple choice question. The image content might be uncommon or the question might be confusing, so you should analyze the image systematically and provide step-by-step reasoning. Moreover, take time to examine details carefully. Finally, you **MUST** give your final choose with <answer></answer>.
Remember that you should THINK STEP BY STEP. Take time to examine details carefully. But when you come to the final answer, please provide your choose with the character(A/B/C/D) in <answer></answer>!
Most IMPORTANTLY: finally provide your choice in <answer></answer>! For example: <answer>A</answer> <answer>B</answer> <answer>C</answer> <answer>D</answer>. 
Example:
#########
[IMAGE]
Question:Does the Teapot in the picture have a handle? If so, where is it located?
Choices:
A. Not visible / Can't see.
B. Yes, on the side.
C. Yes, arched over the top.
D. The correct answer is not listed.

Your reply:
I'll look at the overall structure - I can see it's a white/ceramic teapot. Next, I need to locate the handle. Looking at the left side of the teapot, I don't see a handle there. Checking the right side... I can see what appears to be a curved handle attached to the side. Based on this systematic examination, the handle is located on the side of the teapot. Based on above analysis, I will stop thinking, and the final answer is <answer>B</answer>.
#########
Follow the instruction and the example.
"""
# Answer-only prompt: no reasoning and no <answer> tags are requested; the
# model is told to reply with the choice letter alone.
# NOTE(review): the example answers A ("Not visible / Can't see."), whereas
# the teapot examples in prompt_cot_instruction_v1 and _v3 answer B for the
# same question. Confirm that A is intended here.
prompt_cot_instruction_v4 = """
You are a helpful agent.Here is an image with a multiple choice question about the image content. You should reply the question according to the image faithfully. Please note that the question maybe confusing or the image content might be uncommon, you should answer the question ONLY with the correct choice letter.
Here is an example:
#########
[IMAGE]
Question:Does the Teapot in the picture have a handle? If so, where is it located?
Choices:
A. Not visible / Can't see.
B. Yes, on the side.
C. Yes, arched over the top.
D. The correct answer is not listed.

Your answer:A
#########
Now please answer the question following the above format STRICTLY.
"""




def get_multi_modal_inputs(conversation, sample, image_path, permutation, tokenizer):
    """Render one sample into a prompt string plus its opened image.

    Args:
        conversation: Object whose ``format(sample, image_path, permutation)``
            returns the chat messages.
        sample: Question record passed through to ``conversation.format``.
        image_path: Path of the image; also opened with PIL for the result.
        permutation: Option order passed through to ``conversation.format``.
        tokenizer: Object providing ``apply_chat_template``.

    Returns:
        A dict with the rendered ``"prompt"`` text and, under
        ``"multi_modal_data"``, the opened image keyed by ``"image"``.
    """
    messages = conversation.format(sample, image_path, permutation)
    # Ask for the rendered text (not token ids), ending with the generation
    # prompt.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    opened_image = PIL.Image.open(image_path)
    return {
        "prompt": rendered_prompt,
        "multi_modal_data": {"image": opened_image},
    }


