import os
import json
import time
import argparse
import openai
from pathlib import Path
from tqdm import tqdm 
import copy 
import torch
import numpy as np 
from torchvision.ops import box_convert
import supervision as sv
import cv2
from PIL import Image 
import re 

from dotenv import dotenv_values
# Load key/value settings from a local ".env" file in the current working directory.
config = dotenv_values(".env")

# OpenAI client settings, all taken from the .env file. The keys API_KEY,
# API_BASE, API_TYPE and API_VERSION are looked up by subscript at import
# time, so each of them must be present in .env before this module is imported.
openai.api_key = config['API_KEY']
openai.api_base = config['API_BASE']
openai.api_type = config['API_TYPE']
openai.api_version = config['API_VERSION']


# NOTE(review): `device` and `deployment_name` are not referenced by the
# functions visible in this part of the file (those read args.deployment_name);
# presumably they serve as defaults elsewhere -- confirm.
device = "cuda"
deployment_name = 'gpt-4'
# Maps the value of args.gpt_type to a model name; looked up by
# prompt_to_8frame_layouts_with_entities.
gpt_name = {
    'gpt3.5': 'text-davinci-003',
    'gpt3.5-chat': 'gpt-3.5-turbo',
    'gpt-4': 'gpt-4',
}


def annotate(img: np.ndarray, boxes: torch.Tensor, phrases, in_fmt="cxcywh", rgb_switch=True) -> np.ndarray:
    """Draw labelled bounding boxes on an image.

    Args:
        img: Image array with three dimensions (height, width, channels).
        boxes: Boxes given as fractions of the image size, one row of four
            values per box, expressed in the ``in_fmt`` box format.
        phrases: One label text per box.
        in_fmt: Box format of ``boxes`` as understood by
            ``torchvision.ops.box_convert``.
        rgb_switch: When true the channel order of the result is swapped
            relative to ``img`` (RGB -> BGR). When false the swap is applied
            twice, which leaves the channel order as it was in ``img``.

    Returns:
        The annotated frame produced by the supervision box annotator.
    """
    height, width, _channels = img.shape

    # Scale the fractional coordinates up to pixel units.
    pixel_scale = torch.Tensor([width, height, width, height])
    boxes = boxes * pixel_scale

    corner_boxes = box_convert(boxes=boxes, in_fmt=in_fmt, out_fmt="xyxy")
    detections = sv.Detections(xyxy=corner_boxes.numpy())

    # The label text shows the scaled box in its *input* format, truncated
    # to integers.
    labels = []
    for phrase, box in zip(phrases, boxes):
        labels.append(f"{phrase}: {(box).to(int).tolist()}")

    annotator = sv.BoxAnnotator(text_padding = -12)

    canvas = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    if not rgb_switch:
        # A second channel swap undoes the first one.
        canvas = cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR)

    return annotator.annotate(scene=canvas, detections=detections, labels=labels)


def plot_bbox_from_layout(layout):
    """Render a layout on a white canvas and pad it with a white border.

    Args:
        layout: Sequence of entries whose first element is the label text and
            whose second element is a box of four values (passed to
            ``annotate`` in ``xyxy`` format).

    Returns:
        A 532x532x3 uint8 image: the 512x512 annotated canvas placed at an
        offset of 10 pixels from the top-left corner of a white background.
    """
    phrases = []
    coords = []
    for entry in layout:
        phrases.append(entry[0])
        coords.append(entry[1])
    boxes = torch.Tensor(coords)

    base_size = (512, 512, 3)
    blank = np.ones(base_size, dtype=np.uint8) * 255
    annotated_frame = annotate(blank, boxes, phrases, in_fmt='xyxy')

    # Paste the annotated canvas into a slightly larger white image so the
    # drawing is surrounded by a 10 pixel margin.
    margin = 10
    enlarged = np.ones((532, 532, 3), dtype=np.uint8) * 255
    rows = slice(margin, base_size[0] + margin)
    cols = slice(margin, base_size[1] + margin)
    enlarged[rows, cols] = annotated_frame
    return enlarged


def check_response_8frames(line_list):
    """Check that a model response has the expected 9-frame line structure.

    The expected structure is nine lines ``Frame_1`` ... ``Frame_9``, each
    containing ``caption:``, followed by a line containing ``Reasoning``.
    A single extra trailing line (the ``Score:`` line requested by the layout
    prompt in this file) is accepted as well, so both 10-line and 11-line
    responses can pass.

    Args:
        line_list: The non-empty lines of the response, as a list of strings.

    Returns:
        True when the structure is valid. Otherwise the problems and the
        offending ``line_list`` are printed and False is returned; the
        function does not raise for malformed input.
    """
    n_frames = 9
    problems = []

    if not isinstance(line_list, (list, tuple)):
        problems.append("ERROR: line_list should be a list of strings")
    elif len(line_list) not in (n_frames + 1, n_frames + 2):
        problems.append(
            "ERROR: line_list should be of size 9+1=10 (or 11 with a trailing Score line), "
            f"got {len(line_list)}"
        )
    else:
        for i in range(n_frames):
            line = line_list[i]
            if not isinstance(line, str):
                problems.append(f"ERROR: line_list[{i}] is not a string")
                continue
            if not line.startswith(f"Frame_{i+1}"):
                problems.append(f"ERROR: line_list line {i} does not start with Frame_{i+1}")
            if "caption:" not in line:
                problems.append(f"ERROR: line_list[{i}] does not contain caption")
        # The reasoning line directly follows the nine frame lines.
        reasoning_line = line_list[n_frames]
        if not isinstance(reasoning_line, str) or "Reasoning" not in reasoning_line:
            problems.append("ERROR: no reasoning line after the frame lines of line_list")

    if problems:
        for problem in problems:
            print(problem)
        print(line_list)
        return False
    return True


def parse_layouts(raw_layouts):
    """Extract object boxes and captions from raw frame lines.

    Each raw line is expected to look like
    ``Frame_k: [["name", [l, t, r, b]], ...], caption: some text``.

    Args:
        raw_layouts: Iterable of strings, one per frame.

    Returns:
        A tuple ``(parsed_layouts, parsed_captions)``:

        * ``parsed_layouts`` has one list per input line containing
          ``[name, [float, float, float, float]]`` pairs in order of
          appearance. Lines without any recognisable pair yield ``[]``.
        * ``parsed_captions`` has one string per input line: the text after
          the last ``caption:`` (or the whole line when that marker is
          absent), stripped, lower-cased, with double quotes and periods
          removed.
    """
    # Raw strings keep the regex escapes intact. A number is an unsigned
    # integer with an optional fractional part.
    number = r'\d+(?:\.\d+)?'
    box_pattern = r'\[' + ', '.join([number] * 4) + r'\]'
    box_re = re.compile(box_pattern)
    pair_re = re.compile(r'\["[^"]+", ' + box_pattern + r'\]')

    parsed_layouts = []
    parsed_captions = []
    for raw_line in raw_layouts:
        caption = raw_line.split('caption:')[-1].strip()
        caption = caption.replace("\"", "").lower().replace(".", "")
        parsed_captions.append(caption)

        # Single quotes are normalised to double quotes so that both quoting
        # styles match the pair pattern.
        remaining = raw_line.replace("\'", "\"")
        all_pairs = []
        match = pair_re.search(remaining)
        while match is not None:
            pair_text = match.group(0)
            obj = pair_text.split("\"")[1]
            box_text = box_re.search(pair_text).group(0)
            coords = [float(value) for value in box_text[1:-1].split(',')]
            all_pairs.append([obj, coords])
            # Every occurrence of this exact pair text is removed, so an
            # identical pair repeated within one line is reported once.
            remaining = remaining.replace(pair_text, '')
            match = pair_re.search(remaining)
        parsed_layouts.append(all_pairs)
    return parsed_layouts, parsed_captions


def query2instruction(args, user_prompt = "Provide step-by-step instructions to the task: make butter biscuits"):
    """Ask the chat model to break a task into scene steps and save them as JSON.

    The request consists of a system prompt, one worked example (butter
    biscuits) and the actual query. Every response line that contains the
    markers ``Step ``, ``Prompt: ``, ``Entities: `` and ``Environment: `` is
    parsed into ``{'prompt', 'objects', 'environment'}``; other lines are
    ignored. The resulting list is written to
    ``<args.output_dir>/step1_scene_generation/<user_prompt>.json``.

    Args:
        args: Namespace providing ``deployment_name``, ``gpt_type`` and
            ``output_dir``.
        user_prompt: Task text. It is appended to the fixed prefix
            "Provide step-by-step instructions to the task: ", and is also
            used verbatim as the output file name.

    Returns:
        True after the JSON file has been written. False when the API
        rejects the request with ``InvalidRequestError``; resending the same
        request cannot succeed, so no file is written in that case.
        ``ServiceUnavailableError`` and ``RateLimitError`` are retried every
        5 seconds without an upper bound.
    """
    system_prompt = 'Given a single text prompt, you need to envision a multi-scene video by generating a sequence of stepwise prompts to describe the text prompt. ' \
                    'For each step, you also need to generate the set of entities needed and describe the background scene where the video should occur. ' \
                    'Related steps should maintain similar entities and background scenes. Before you write each stepwise description, you must follow these instructions: ' \
                    '1. Each step prompt must contain only a single motion or action. ' \
                    '2. Each step prompt must include all relevant objects and describe the environment scene. ' \
                    '3. Make sure each step prompt must be easy described by a shot video cilp of 8 seconds at 2fps.' \
                    'You MUST respond with the following format:\n[[step_1: [Prompt:..., Entities:..., Environment:...]; \nstep_2: [Prompt:..., Entities:..., Environment:...]; \n...; \nstep_n: [Prompt:..., Entities:..., Environment:...]]'
    messages = [{"role": "system", "content": system_prompt}]

    # One in-context example showing the expected answer format.
    example_user_prompt = "Provide step-by-step instructions to the task: make butter biscuits"
    example_assistant_prompt = '[[Step 1: [Prompt: an old chef in a white shirt preheats a stainless steel oven with digital controls; Entities: an old chef in a white shirt full-body view, a stainless steel oven with digital controls; Environment: kitchen counter];\n' \
                            'Step 2: [Prompt: an old chef in a white shirt creams together butter and sugar in a large white mixing bowl using a steel hand mixer; Entities: an old chef in a white shirt full-body view, a large white mixing bowl, butter, sugar, steel hand mixer; Environment: kitchen counter];\n' \
                            'Step 3: [Prompt: an old chef in a white shirt beats in eggs into a large white mixing bowl; Entities: an old chef in a white shirt full-body view, eggs, a large white mixing bowl; Environment: kitchen table surface];\n' \
                            'Step 4: [Prompt: an old chef in a white shirt combines flour, baking powder, and salt in a second yellow bowl; Entities: an old chef in a white shirt full-body view, flour, baking powder, salt, yellow bowl; Environment: kitchen table surface];\n' \
                            'Step 5: [Prompt: an old chef in a white shirt gradually adds the dry ingredients from the second yellow bowl into the creamed mixture in the first large white mixing bowl; Entities: an old chef in a white shirt full-body view, dry ingredients, a large white mixing bowl, yellow bowl; Environment: kitchen table surface];\n' \
                            'Step 6: [Prompt: an old chef in a white shirt forms dough into small balls on the kitchen table; Entities: an old chef in a white shirt full-body view, dough balls; Environment: kitchen table surface];\n' \
                            'Step 7: [Prompt: an old chef in a white shirt places the dough on a baking sheet; Entities: an old chef in a white shirt full-body view, dough balls, baking sheet; Environment: kitchen table surface];\n' \
                            'Step 8: [Prompt: an old chef in a white shirt flattens each ball of dough with a fork; Entities: an old chef in a white shirt full-body view, flattened dough, a steel fork; Environment: kitchen table surface];\n' \
                            'Step 9: [Prompt: an old chef in a white shirt bakes the dough in a stainless steel oven with digital controls oven until the edges of the dough are lightly golden; Entities: an old chef in a white shirt full-body view, a stainless steel oven with digital controls, dough with gloden edge; Environment: kitchen counter];\n' \
                            'Step 10: [Prompt: an old chef in a white shirt removes the butter biscuits from a stainless steel oven with digital controls; Entities: an old chef in a white shirt full-body view, butter biscuits, a stainless steel oven with digital controls; Environment: kitchen counter];\n' \
                            'Step 11: [Prompt: an old chef in a white shirt cools the butter biscuits on a wire cookie rack; Entities: an old chef in a white shirt full-body view, butter biscuits, wire cookie rack; Environment: kitchen table surface]]'
    messages.append({"role": "user", "content": example_user_prompt})
    messages.append({"role": "assistant", "content": example_assistant_prompt})

    input_prompt = f"Provide step-by-step instructions to the task: {user_prompt}"
    messages.append({"role": "user", "content": input_prompt})

    while True:
        try:
            response = openai.ChatCompletion.create(
                    engine=args.deployment_name, messages=messages, max_tokens=4000, n=1)
            break
        except openai.error.ServiceUnavailableError:
            print('OpenAI ServiceUnavailableError.\tWill try again in 5 seconds.')
            time.sleep(5)
        except openai.error.RateLimitError:
            print('OpenAI RateLimitError.\tWill try again in 5 seconds.')
            time.sleep(5)
        except openai.error.InvalidRequestError as e:
            # The request itself is rejected; sending the identical request
            # again would fail the same way, so stop instead of looping.
            print(e)
            print('OpenAI InvalidRequestError.\tThe request was rejected and will not be retried.')
            return False

    choice = response.choices[0]
    if args.gpt_type == 'gpt3.5':
        response_text = choice["text"]
    else:
        response_text = choice["message"]["content"]

    required_markers = ("Prompt: ", "Entities: ", "Environment: ", "Step ")
    predicted_object_list = []
    for line in response_text.split('\n'):
        # Only lines carrying all markers describe a step (empty lines fail
        # this test as well).
        if not all(marker in line for marker in required_markers):
            continue
        line = line.replace("<|im_sep|>", "")
        line = line.replace("<|im_end|>", "")
        selector_text = line.split('Prompt: ')[1].split('; Entities')[0]
        objects = (line.split('Entities: ')[1].split('; Environment')[0]).split(', ')
        environment = (line.split('Environment: ')[1].split('];')[0]).replace("]", "").strip()
        predicted_object_list.append(
            {'prompt': selector_text, 'objects': objects, 'environment': environment})

    output_path = os.path.join(args.output_dir, "step1_scene_generation")
    Path(output_path).mkdir(parents=True, exist_ok=True)
    # NOTE(review): the prompt text is used as the file name unchanged; a
    # prompt containing a path separator would point into a sub-directory.
    output_filename = os.path.join(output_path, f'{user_prompt}.json')
    with open(output_filename, 'w') as fout:
        json.dump(predicted_object_list, fout, sort_keys=True, indent=4)
    return True


def prompt_to_8frame_layouts_with_entities(args, prompt, entities, detailed_in_context_examples = True):
    """Ask the chat model for a 9-frame bounding-box layout of a scene.

    The request contains a system prompt, four worked examples and the query
    built from ``entities`` and ``prompt``. The answer is expected to hold
    the frame lines followed by a reasoning line and a final ``Score:`` line.

    Args:
        args: Namespace providing ``gpt_type`` and ``deployment_name``. Its
            ``gpt_name`` attribute is set as a side effect.
        prompt: Scene description.
        entities: Entities to place; inserted into the query via ``str()``.
        detailed_in_context_examples: Accepted for compatibility; not used
            by this function.

    Returns:
        ``(parsed_layouts, parsed_captions, reasoning, confidence)`` on
        success, where ``reasoning`` is the second-to-last non-empty line and
        ``confidence`` is the float after the last ``:`` of the last line.
        ``None`` after five attempts failed with an OpenAI service,
        rate-limit or invalid-request error.

    Raises:
        ValueError, IndexError, AssertionError: when the response text cannot
            be parsed; these are not retried here and reach the caller.
        KeyError: when ``args.gpt_type`` is not a key of ``gpt_name``.
    """
    args.gpt_name = gpt_name[args.gpt_type]
    system_prompt = "Assuming the frame size is normalized to the range 0-1, you need to give a possible 9-frame layout with bounding boxes of the listed entities of a given scene description. " \
                    "Each object in the image is one rectangle or square box in the layout and size of boxes should be as large as possible. " \
                    "You need to generate layouts from the close up camera view of the event. " \
                    "The layout difference between two adjacent frames must be small, considering the small interval. " \
                    "You also need to generate a caption that best describes the image for each frame. After generating all frames, add reasoning to your design." \
                    "Use format: Frame_1: [[object1, [left, top, right, bottom]], [object2, [left, top, right, bottom]], ..., [object_n, [left, top, right, bottom]]], caption:...\nFrame_2: [[object1, [left, top, right, bottom]], [object2, [left, top, right, bottom]], ..., [object_n, [left, top, right, bottom]]], caption:...\n...\nFrame_9: [[object1, [left, top, right, bottom]], [object2, [left, top, right, bottom]], ..., [object_n, [left, top, right, bottom]]], caption:...\nReasoning:...\nScore:...'}"
    messages = [{"role": "system", "content": system_prompt}]
    # in-context example: cat jumping between chair and table
    example_input_prompt = "Please generate bounding box coordinates for the following entities based on the scene description. Entities: ['an orange cat', 'a yellow wood chair', 'a red dining table with round shape']. Scene description: a cat jumps from chair to table"
    example_output_prompt = """Frame_1: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.4, 0.55, 0.6, 0.6]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: A cat is standing on the table, preparing to jump.\nFrame_2: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.45, 0.5, 0.65, 0.55]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat has just left the table and is in mid-air.\nFrame_3: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.5, 0.45, 0.7, 0.5]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat is in the middle of the jump, right between the table and the chair. \nFrame_4: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.55, 0.4, 0.75, 0.45]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat continues her flight, reaching the highest point.\nFrame_5: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.6, 0.45, 0.8, 0.5]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat starts descending towards the chair; she is still in the air.\nFrame_6: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.65, 0.5, 0.85, 0.55]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat is about to land on the chair. \nFrame_7: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.15, 0.6, 0.35, 0.65]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat lands on the chair skilfully. \nFrame_8: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.15, 0.6, 0.35, 0.65]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat sitting on the chair after the jump. 
\nFrame_9: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.15, 0.6, 0.35, 0.65]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat still resting on the chair, with the jump completed successfully.\n\nReasoning: The cat\'s bounding box in Frame 1 overlaps with the bounding box of the table, showing the cat in the take-off position. From Frame 2 to Frame 6, the cat is shown in various stages mid-flight, with the cat\'s bounding box moving closer to the chair with each frame. This demonstrates the cat\'s jump from the table to the chair. This jump is confirmed in Frame 7 as the cat\'s bounding box is now overlapping with the bounding box of the chair, showing that the cat has landed. Frame 8 and 9 show the cat resting on the chair after the jump. The smooth increment in the cat\'s coordinates, moving from the table to the chair, justifies the small intervals between frames. The size of the bounding boxes has been kept as large and consistent as possible for clear visualization. Finally, the spatial control for this prompt is important, since the bounding box layouts give us rich information about how the cat moves from chair to table. Therefore, the importance score for spatial control should be very high.\nScore: 0.3"""
    messages.append({"role": "user", "content": example_input_prompt})
    messages.append({"role": "assistant", "content": example_output_prompt})
    # in-context example: two people sitting next to each other
    example_input_prompt = "Please generate bounding box coordinates for the following entities based on the scene description. Entities: ['sandra smith in a blue dress', 'ed henry in a red tie']. Scene description: sandra smith in a blue dress and ed henry in a red tie sitting next to each other"
    example_output_prompt = """Frame_1: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith in a blue dress and ed henry in a red tie sitting next to each other.\nFrame_2: [["sandra smith in a blue dress, full-body view", [0.1, 0.25, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith and ed henry continue their discussion.\nFrame_3: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: ed henry makes a point while sandra listens. \nFrame_4: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith responds to ed henry's point.\nFrame_5: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: the discussion continues between sandra smith and ed henry.\nFrame_6: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra and ed henry pause to think. \nFrame_7: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: ed makes another point in the discussion. \nFrame_8: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith responds while ed henry listens. 
\nFrame_9: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith and ed henry conclude their discussion next to each other.\n\nReasoning: Since Sandra Smith and Ed Henry are sitting next to each other throughout all the frames, their bounding boxes stay consistent. Close-up camera views are used to focus on both of the individuals, highlighting their reactions and interactions. This situation involves less movement, so the small interval between frames lets us capture the ongoing discussion between Sandra and Ed, each in their respective attires. Bounding boxes for the blue dress and red tie are incorporated within the Sandra's and Ed's bounding boxes. The coordination depicted through bounding boxes suggests that the spatial positioning of Sandra and Ed remains relatively unchanged across the frames. Finally, the spatial control for this prompt is not important, since sandra smith and ed henry are sitting, so their bounding box can just be static without movement. Therefore, the importance score for spatial control should be very low.\nScore: 0.0"""
    messages.append({"role": "user", "content": example_input_prompt})
    messages.append({"role": "assistant", "content": example_output_prompt})
    # in-context example: boy riding a bike
    example_input_prompt = "Please generate bounding box coordinates for the following entities based on the scene description. Entities: ['a yellow bike', 'a young boy']. Scene description: a boy riding the bike"
    example_output_prompt = """Frame_1: [["a young boy, full-body view", [0.1, 0.1, 0.3, 0.3]], ["a yellow bike", [0.1, 0.2, 0.3, 0.5]]], caption: a boy is getting ready to ride the bike.\nFrame_2: [["a young boy, full-body view", [0.15, 0.15, 0.35, 0.35]], ["a yellow bike", [0.15, 0.25, 0.35, 0.65]]], caption: the boy strategically prepares to hop onto the bike.\nFrame_3: [["a young boy, full-body view", [0.2, 0.2, 0.4, 0.4]], ["a yellow bike", [0.2, 0.3, 0.4, 0.7]]], caption: the boy hops onto the bike. \nFrame_4: [["a young boy, full-body view", [0.3, 0.25, 0.5, 0.45]], ["a yellow bike", [0.25, 0.35, 0.45, 0.75]]], caption: the boy settles on the bike.\nFrame_5: [["a young boy, full-body view", [0.35, 0.3, 0.55, 0.5]], ["a yellow bike", [0.3, 0.4, 0.5, 0.8]]], caption: the boy starts to pedal furiously.\nFrame_6: [["a young boy, full-body view", [0.4, 0.35, 0.6, 0.55]], ["a yellow bike", [0.35, 0.45, 0.55, 0.85]]], caption: the boy is seen riding the bike. \nFrame_7: [["a young boy, full-body view", [0.45, 0.4, 0.65, 0.6]], ["a yellow bike", [0.4, 0.5, 0.6, 0.9]]], caption: the boy continues riding and enjoying his ride. \nFrame_8: [["a young boy, full-body view", [0.5, 0.45, 0.7, 0.65]], ["a yellow bike", [0.45, 0.55, 0.65, 0.95]]], caption: the boy is riding the bike with more enthusiasm and speed. \nFrame_9: [["a young boy, full-body view", [0.55, 0.5, 0.75, 0.7]], ["a yellow bike", [0.5, 0.6, 0.7, 1.0]]], caption: the boy riding the bike into the horizon, signaling the end of his journey.\n\nReasoning: The positions of the boy and the bike changes throughout the frames in a logical and sequential manner to depict the event of a boy riding a bike. The boy's position iteratively changes to show him getting on the bike and starting to ride. The bounding boxes for the boy and the bike are overlapped because the boy is riding the bike. 
The size of the bounding boxes of both the boy and the bike is kept relatively large for a clear and efficient understanding of the event. Finally, the importance level of spatial control for this prompt is slightly below average, since the bounding boxes of boy and bike don't need large movement to represent the riding motion. Therefore, the importance score for spatial control should be slightly below average.\nScore: 0.1"""
    messages.append({"role": "user", "content": example_input_prompt})
    messages.append({"role": "assistant", "content": example_output_prompt})
    # in-context example: band on a concert stage
    example_input_prompt = "Please generate bounding box coordinates for the following entities based on the scene description. Entities: ['rock concert stage', 'a man playing drum', 'a woman singing', 'a man playing guitar']. Scene description: a band playing at a rock concert"
    example_output_prompt = """Frame_1: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: guitarist and drummer preparing to play.\nFrame_2: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: guitarist is strumming and drummer begins to hit the drums.\nFrame_3: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the guitarist continues playing while the drummer beats rhythmically. \nFrame_4: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the singer enters the stage.\nFrame_5: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the singer starts singing, the band is now playing together.\nFrame_6: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the band is in full swing playing a song. 
\nFrame_7: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the singer hits a high note. \nFrame_8: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: guitarist plays a groundbreaking solo. \nFrame_9: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the band continues to rock the concert.\n\nReasoning: Each frame represents a different moment in the band's performance. The bounding boxes of the band members overlap with the rock stage because the members are standing on the stage. The bounding boxes provide a close-up perspective on the performing band members, giving the viewer a clear view of the band's performance. The constant location of the bounding boxes for the band members and the stage throughout the frames reflects the unchanging positions of these entities during the concert. Finally, the importance level of spatial control for this prompt is below average, since the bounding boxes of the musicians don't need large movement, and the bounding box of the stage can also be static. Therefore, the importance score for spatial control should be below average.\nScore: 0.1"""
    messages.append({"role": "user", "content": example_input_prompt})
    messages.append({"role": "assistant", "content": example_output_prompt})
    # the actual query
    user_prompt = "Please generate bounding box coordinates for the following entities based on the scene description. Entities: " + str(entities) + ". Scene description: " + prompt
    messages.append({"role": "user", "content": user_prompt})
    f_gpt_create = openai.ChatCompletion.create
    n_trails = 0
    while True:
        try:
            response = f_gpt_create(
                    engine=args.deployment_name, messages=messages, max_tokens=2500, n=1)
            line_list = response.choices[0]["message"]["content"].split('\n')
            line_list = [l for l in line_list if l != '']
            # Called for the diagnostics it prints on a malformed response;
            # its return value is not used here.
            check_response_8frames(line_list)
            # The last two non-empty lines are taken as reasoning and score;
            # everything before them is treated as frame lines.
            reasoning = line_list[-2]
            confidence = float(line_list[-1].split(":")[-1].strip())
            raw_layouts = line_list[:-2]
            parsed_layouts, parsed_captions = parse_layouts(raw_layouts)
            # Sanity check of the parsed structure: [name, [x, x, x, x]].
            for frame in parsed_layouts:
                for pair in frame:
                    assert isinstance(pair, list)
                    assert isinstance(pair[0], str)
                    assert isinstance(pair[1], list)
                    for k in range(4):
                        assert isinstance(pair[1][k], float)
            break
        except openai.error.ServiceUnavailableError:
            print('OpenAI ServiceUnavailableError.\tWill try again in 5 seconds.')
            time.sleep(5)
        except openai.error.RateLimitError:
            print('OpenAI RateLimitError.\tWill try again in 5 seconds.')
            time.sleep(5)
        except openai.error.InvalidRequestError as e:
            print(e)
            print('Input too long. Will shrink the prompting examples.')
            print(args.deployment_name)
        # Reached only after one of the handled API errors above.
        n_trails += 1
        print("another attempt", n_trails)
        if n_trails >= 5:
            print(f"ERROR: cannot generate layouts for prompt {prompt}")
            return
    return parsed_layouts, parsed_captions, reasoning, confidence
    
        
def linear_interpolate_layouts(parsed_layouts, output_frames = 18):
    """Insert linearly interpolated frames between consecutive key frames.

    With ``steps = output_frames // len(parsed_layouts)``, every key frame
    except the last is emitted, followed by ``steps - 1`` in-between frames
    that blend its boxes towards the next key frame. The last key frame is
    only used as an interpolation target and is not emitted, so the result
    holds ``(len(parsed_layouts) - 1) * max(steps, 1)`` frames.

    An object that is missing from the next key frame keeps its current box
    in the in-between frames.

    Args:
        parsed_layouts: List of frames; each frame is a list of
            ``[name, [v0, v1, v2, v3]]`` entries.
        output_frames: Frame budget used to derive the number of steps per
            key-frame interval.

    Returns:
        The expanded list of frames. Emitted key frames are the same list
        objects as in ``parsed_layouts`` (not copies). An empty input yields
        an empty list.
    """
    input_frames = len(parsed_layouts)
    if input_frames == 0:
        # Nothing to interpolate; also avoids dividing by zero below.
        return []

    frames_to_interpolate = output_frames // input_frames
    expanded_layouts = []
    for current, following in zip(parsed_layouts, parsed_layouts[1:]):
        expanded_layouts.append(current)
        # Box lookup for the next key frame (a repeated name keeps its last box).
        next_boxes = {entry[0]: entry[1] for entry in following}
        for j in range(1, frames_to_interpolate):
            weight_curr = (frames_to_interpolate - j) / frames_to_interpolate
            weight_next = j / frames_to_interpolate
            curr_layouts = []
            for entry in current:
                name, box = entry[0], entry[1]
                if name in next_boxes:
                    blended = weight_curr * np.array(box) + weight_next * np.array(next_boxes[name])
                    curr_layouts.append([name, blended.tolist()])
                else:
                    curr_layouts.append([name, box])
            expanded_layouts.append(curr_layouts)
    return expanded_layouts

        
def _generate_step_layouts(args, step_prompt, step_entities, background, max_attempts=3):
    """Query the LLM for one step's keyframes and expand them to 16 frames.

    Each attempt calls ``prompt_to_8frame_layouts_with_entities``, strips digits
    and underscores from the object names, interpolates the keyframes and
    appends a full-frame background entry to every frame. Any exception
    (including the unpacking error raised when the query helper returns
    ``None``) counts as a failed attempt.

    Returns:
        ``(expanded_layouts, expanded_captions, reasoning, confidence)``.

    Raises:
        RuntimeError: if no attempt produced exactly 16 frames.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            parsed_layouts, parsed_captions, reasoning, confidence = \
                prompt_to_8frame_layouts_with_entities(args, step_prompt.replace(".", ""), step_entities)
            if args.output_frames <= len(parsed_layouts):
                raise ValueError("args.output_frames must exceed the number of parsed keyframes")
            # Normalise object names: drop digits, turn underscores into spaces.
            for frame in parsed_layouts:
                for entry in frame:
                    name = ''.join(ch for ch in entry[0] if not ch.isdigit())
                    entry[0] = name.replace("_", " ")
            # Interpolate once, after all names are normalised.
            expanded_layouts = linear_interpolate_layouts(parsed_layouts)
            # Every caption is repeated twice.
            expanded_captions = [caption for caption in parsed_captions for _ in range(2)]
            # The background covers the whole frame in every expanded layout.
            for frame in expanded_layouts:
                frame.append([background, [0.0, 0.0, 1.0, 1.0]])
            if len(expanded_layouts) != 16:
                raise ValueError("len(expanded_layouts) not equal to 16")
            return expanded_layouts, expanded_captions, reasoning, confidence
        except Exception as err:
            print(f"layout generation attempt {attempt}/{max_attempts} failed: {err!r}")
    raise RuntimeError(f"ERROR: cannot generate layouts for step prompt {step_prompt}")


def _save_step_outputs(out_dir, step_prompt, img_list, layouts, captions, reasoning, score):
    """Write ``<step_prompt>.gif`` (if any frame was rendered) and ``<step_prompt>.json``."""
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    frames = [Image.fromarray(img) for img in img_list]
    if frames:
        # append_images must exclude the first frame, otherwise it is shown twice.
        frames[0].save(os.path.join(out_dir, f"{step_prompt}.gif"), format="GIF",
                       append_images=frames[1:], save_all=True, duration=200, loop=0)
    else:
        print(f"WARNING: no frames rendered for step prompt {step_prompt}; skipping GIF")
    dict_ = {
        'layouts': layouts,
        'captions': captions,
        'reasoning': reasoning,
        'score': score,
    }
    json_object = json.dumps(dict_, indent=4, sort_keys=True)
    with open(os.path.join(out_dir, f'{step_prompt}.json'), "w") as outfile:
        outfile.write(json_object)


def main_query_multi_scene(args, video_id, prompt, json_data, out_file_name=None):
    """Generate, render and save frame layouts for every step in ``json_data``.

    Args:
        args: parsed CLI namespace; ``deployment_name``, ``output_frames`` and
            ``output_dir`` are read here.
        video_id: accepted for interface compatibility; not used in this function.
        prompt: overall prompt, used as a directory name when ``out_file_name``
            is ``None``.
        json_data: int-indexable sequence of step dicts with the keys
            ``'environment'``, ``'objects'`` and ``'prompt'``.
        out_file_name: optional sub-directory of ``args.output_dir``. When given,
            all steps are written there; otherwise each step goes to
            ``step2_layout_generation/<prompt>/<step index>``.

    Raises:
        RuntimeError: if layouts for a step cannot be generated. Nothing is
            written in that case, because saving only starts after every step
            has been generated.
    """
    print("deployment_name", args.deployment_name)
    step_results = []
    n_steps = len(json_data)
    for s in tqdm(range(n_steps)):
        step = json_data[s]
        background = step['environment']
        step_entities = step['objects']
        step_prompt = step['prompt']
        expanded_layouts, expanded_captions, reasoning, confidence = _generate_step_layouts(
            args, step_prompt, step_entities, background)
        img_list = []
        for frame_idx, layout in enumerate(expanded_layouts):
            try:
                img_list.append(plot_bbox_from_layout(layout))
            except Exception as err:
                # Rendering is only a visual aid; keep going without this frame.
                print(f"WARNING: could not plot layout frame {frame_idx}: {err!r}")
        step_results.append((step_prompt, img_list, expanded_layouts, expanded_captions, reasoning, confidence))
    for s, (step_prompt, img_list, expanded_layouts, expanded_captions, reasoning, score) in enumerate(step_results):
        if out_file_name is not None:
            prompt_out_path = os.path.join(args.output_dir, out_file_name)
        else:
            prompt_out_path = os.path.join(args.output_dir, "step2_layout_generation", prompt, str(s))
        _save_step_outputs(prompt_out_path, step_prompt, img_list,
                           expanded_layouts, expanded_captions, reasoning, score)




#%%
if __name__ == '__main__':
    # Script entry point: for each prompt in --input_json that has no step-1
    # JSON yet, run scene generation (query2instruction) and then layout
    # generation (main_query_multi_scene), retrying up to 5 times per prompt.

    parser = argparse.ArgumentParser(prog='1st scene descriptions, entity/background generation step')
    parser.add_argument("--input_json", type=str, help="a json file for single-scene prompts")
    parser.add_argument('--output_dir', type=str, help='output directory for gpt layouts')
    parser.add_argument('--output_frames', type=int, default=16)
    parser.add_argument('--gpt_type', type=str, default='gpt-4', choices=list(gpt_name.keys()))
    parser.add_argument('--range_start', type=int, default=0)
    parser.add_argument('--range_end', type=int, default=-1)
    parser.add_argument('--num_repeats', type=int, default=1)
    parser.add_argument("--existing_dir", type=str,  default=None)
    parser.add_argument("--deployment_name", type=str,  default='gpt-4')
    args = parser.parse_args()

    # Fail with a readable message instead of a TypeError further down.
    if args.input_json is None:
        parser.error("--input_json is required")

    args.timestr = time.strftime("%Y-%m-%d_%H-%M-%S")

    if args.existing_dir is None:
        if args.output_dir is None:
            parser.error("--output_dir is required unless --existing_dir is given")
        # A fresh run gets its own timestamped directory.
        args.output_dir += f'_{args.timestr}'
    else:
        # Resume into a previous run's directory.
        args.output_dir = args.existing_dir

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    with open(args.input_json) as f:
        json_data = json.load(f)

    # -1 means "process through the last prompt".
    if args.range_end == -1:
        args.range_end = len(json_data)

    GT_video_prompts = [j['prompt'] for j in json_data]
    step1_path = os.path.join(args.output_dir, "step1_scene_generation")
    for i, prompt in enumerate(tqdm(GT_video_prompts)):
        if not (args.range_start <= i < args.range_end):
            continue
        step1_json_filepath = os.path.join(step1_path, f'{prompt}.json')
        # Prompts whose step-1 output already exists are skipped.
        if os.path.exists(step1_json_filepath):
            continue
        print(prompt)
        r = 0
        bad_count = 0
        # NOTE(review): the break after a success means each prompt is processed
        # at most once regardless of --num_repeats; confirm whether that is intended.
        while r < args.num_repeats and bad_count < 5:
            try:
                query2instruction(args, user_prompt=prompt)
                with open(step1_json_filepath) as step1_file:
                    step1_data = json.load(step1_file)
                main_query_multi_scene(args, prompt, prompt, step1_data, out_file_name=None)
                r += 1
                break
            except Exception as err:
                # Report the failure and retry; Ctrl-C is no longer swallowed.
                print(f"attempt {bad_count + 1}/5 failed for prompt {prompt}: {err!r}")
            bad_count += 1




    

