import json
import os
import re

import numpy as np
import openai
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image

# Endpoint and key for the OpenAI-compatible API. Both are read from the
# environment when set, so real credentials do not have to be written into
# this file; the "xxx" placeholders remain the fallback values.
openai.api_base = os.environ.get("OPENAI_API_BASE", "xxx")
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxx")

def sample_frames(video, num_frames):
    """Return ``num_frames`` frames taken at evenly spaced positions of ``video``.

    Args:
        video: Clip-like object exposing ``fps``, ``duration`` and
            ``get_frame(t)``, where ``t`` is a time computed as
            ``frame_index / fps``.
        num_frames: Number of frames to sample.

    Returns:
        A list with the results of ``video.get_frame`` in ascending time order.
    """
    total_frames = int(video.fps * video.duration)
    # A clip shorter than one frame interval yields total_frames == 0; clamp
    # the last index so we sample t=0 instead of negative timestamps.
    last_index = max(total_frames - 1, 0)
    frame_indices = np.linspace(0, last_index, num_frames, dtype=int)
    return [video.get_frame(index / video.fps) for index in frame_indices]


def convert_to_base64(image):
    """Serialize ``image`` as JPEG and return the bytes as a base64 string.

    Args:
        image: Object with a ``save(fp, format=...)`` method that writes the
            encoded image into a binary file object.

    Returns:
        The base64 encoding of the JPEG bytes, decoded as UTF-8 text.
    """
    import base64
    import io

    # Render into an in-memory buffer so nothing touches the disk.
    with io.BytesIO() as jpeg_buffer:
        image.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()

    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode("utf-8")


# IDs of the dataset videos to process; each one is read from
# "./dataset/<id>.mp4" and looked up by "video_id" in output_data.json.
v_list = ["00008816", "00000645", "00000732", "00001024", "00001112", "00001358", "00001395", "00001833",
          "00002001", "00002904", "00003193", "00003809", "00003858", "00003940", "00004209", "00004465",
          "00004772", "00005339", "00005418", "00006360", "00006449", "00006598", "00006675", "00006719",
          "00006798", "00006873", "00006968", "00007104", "00007144", "00007421", "00007553", "00007683",
          "00007744", "00007800", "00007835", "00007900", "00007947", "00008036", "00008196", "00008418",
          "00008558", "00008680"]


def _find_field_by_video_id(video_id, json_data, field):
    """Return ``item[field]`` of the first item whose "video_id" matches, else None."""
    for item in json_data:
        if item["video_id"] == video_id:
            return item[field]
    return None


def find_question_by_video_id(video_id, json_data):
    """Return the "question_content" of the entry for ``video_id``.

    Args:
        video_id: Value compared against each item's "video_id".
        json_data: Iterable of dicts, each with a "video_id" key.

    Returns:
        The matching item's "question_content", or None when no item matches.
    """
    return _find_field_by_video_id(video_id, json_data, "question_content")


def find_answer_by_video_id(video_id, json_data):
    """Return the "anser" field of the entry for ``video_id``.

    Args:
        video_id: Value compared against each item's "video_id".
        json_data: Iterable of dicts, each with a "video_id" key.

    Returns:
        The matching item's "anser" value, or None when no item matches.
    """
    # NOTE(review): the key is spelled "anser"; presumably this matches the
    # schema of output_data.json -- confirm before renaming.
    return _find_field_by_video_id(video_id, json_data, "anser")


def generate_clip_descriptions(clip_path, num_frames=10, question=None):
    """Ask the model the given question about frames sampled from a video.

    Args:
        clip_path: Path of the video file. A falsy value short-circuits and
            returns an empty list without contacting the API.
        num_frames: Number of frames to sample and attach to the request.
        question: Prompt text sent along with the frames; converted with
            ``str()``.

    Returns:
        A list holding the text of the first choice of the model's reply, or
        an empty list when ``clip_path`` is falsy.

    Raises:
        ValueError: If ``clip_path`` is given but ``question`` is None.
    """
    descriptions = []
    if not clip_path:
        return descriptions
    if question is None:
        # Fail before decoding the video or spending an API call on "None".
        raise ValueError("question must be provided")

    clip = VideoFileClip(clip_path)
    try:
        frames = sample_frames(clip, num_frames)
        image_files = [convert_to_base64(Image.fromarray(frame)) for frame in frames]
    finally:
        # Release the underlying reader; many clips are processed in one run.
        clip.close()

    # Plain prompt. Alternative prompts (Standard CoT and V2A-CoT) kept for
    # reference:
    #
    #   question = str(question_info) + "Let's think step by step."
    #
    #   question = str(question_info) + "
    #       To answer the question, please think as following:
    #       1. Please describe the overall scene presented in the video/images. Based on your understanding of the scene, list the common sounds that may be related to the scene from a common sense perspective.
    #       2. What are the main characters or objects in the picture? What actions are these subjects taking? Based on their actions, what sounds might be produced?
    #       3. Observe the background environment in the video/images, and combine environmental and background clues to infer the possible ambient sounds in the scene.
    #       4. If people or objects are colliding or interacting in the picture, please list the possible sources of sound
    #       5. Based on your understanding of the social culture or life experience of the scene, what common sounds usually appear in such scenes?
    #       6. Integrate the above analysis. First, list the main possible sounds, and then add secondary or less certain sounds. Based on the current picture, give a list of the most likely sounds and explain the confidence or priority if necessary.
    #       " + "\nQuestion:" + str(question_info)
    #
    #   question=question_info+"
    #       To answer the question, please think as following:
    #       1.List possible background sounds based on the scene and environment in the picture.
    #       2.Analyze the characters or objects in the picture, their movements, and the main sounds that may be emitted.
    #       3.What obvious interactions or collision actions are noticed, and what kind of sounds may be produced.
    #       4.Combining common sense and cultural background, add some less obvious but common voices.
    #       5.If there are any uncertain details, please list different possibilities separately and indicate the reasons.
    #       6.Finally, please provide an integrated list of sounds, including primary and secondary sounds, and provide explanations for areas where you feel uncertain.
    #       " + "\nQuestion:" + str(question_info)
    prompt = str(question)

    # One text part followed by one image part per sampled frame.
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_file}"},
        }
        for image_file in image_files
    )

    # ``messages`` and ``choices[0].message`` belong to the chat endpoint, so
    # the request must go through ChatCompletion rather than Completion.
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": content
            }
        ],
        max_tokens=1000,
        temperature=0.1
    )
    descriptions.append(response.choices[0].message.content)

    return descriptions


def main():
    """Query the model for every video in ``v_list`` and append to result.txt."""
    # Loaded once; the file does not change between videos.
    with open("output_data.json", "r", encoding="utf-8") as input_file:
        json_data = json.load(input_file)

    output_dir = "clips"
    os.makedirs(output_dir, exist_ok=True)

    for i, video_id in enumerate(v_list):
        audio_video_path = "./dataset/" + video_id + ".mp4"

        question_info = find_question_by_video_id(video_id, json_data)
        anser_info = find_answer_by_video_id(video_id, json_data)
        if question_info is None:
            # Without a question there is nothing meaningful to ask the model.
            print(f"{video_id}: no question found in output_data.json, skipping")
            continue

        clip_descriptions = generate_clip_descriptions(
            audio_video_path, 5, question=question_info
        )

        output = str(clip_descriptions)
        with open("result.txt", "a", encoding="utf-8") as file:
            file.write("\n***********" + str(video_id) + "**********\n")
            # f-string so a non-string answer cannot break the write.
            file.write(f"{question_info}\n&&&&&&&&&&\n{anser_info}")
            file.write("\n%%%%%%%%%%%%%%\n")
            file.write(output)
            file.write("\n===========\n")

        # Progress indicator: index of the video just finished.
        print(i)


if __name__ == "__main__":
    main()
