
import os
import re
import json
import time
import argparse
import openai
from tqdm import tqdm 
import copy 
import json 
import torch
import numpy as np 
from torchvision.ops import box_convert
import supervision as sv
import cv2
from pathlib import Path
from PIL import Image 

from dotenv import dotenv_values
# Load key/value settings from a local ".env" file; the OpenAI client below is
# configured from it at import time.
config = dotenv_values(".env")

# The ".env" file must define API_KEY, API_BASE, API_TYPE and API_VERSION;
# a missing key raises KeyError here, when the module is imported.
openai.api_key = config['API_KEY']
openai.api_base = config['API_BASE']
openai.api_type = config['API_TYPE']
openai.api_version = config['API_VERSION']

# NOTE(review): `device` is not referenced anywhere in the visible part of this file.
device = "cuda"
# Passed as `engine=` to the chat-completion request in prompt_to_8frame_layouts.
deployment_name = 'gpt-4'
# Maps the --gpt_type command-line choices to model names.
gpt_name = {
    'gpt3.5': 'text-davinci-003',
    'gpt3.5-chat': 'gpt-3.5-turbo',
    'gpt-4': 'gpt-4',
}

def annotate(img: np.ndarray, boxes: torch.Tensor, phrases, in_fmt="cxcywh", rgb_switch=True) -> np.ndarray:
    """Draw labelled bounding boxes on a colour-converted copy of ``img``.

    Args:
        img: Image array with three dimensions (height, width, channels).
        boxes: One row of four coordinates per box, expressed in ``in_fmt``.
            Each row is multiplied by ``[w, h, w, h]`` before drawing, so the
            values are presumably normalised to 0-1 (confirm against callers).
        phrases: One label per box, paired with ``boxes`` by position.
        in_fmt: Box format understood by ``torchvision.ops.box_convert``.
        rgb_switch: When True the channel order of ``img`` is swapped once.
            When False the swap is applied twice, restoring the input order.

    Returns:
        The annotated frame produced by ``sv.BoxAnnotator.annotate``.
    """
    height, width, _channels = img.shape
    scale = torch.Tensor([width, height, width, height])
    pixel_boxes = boxes * scale
    corners = box_convert(boxes=pixel_boxes, in_fmt=in_fmt, out_fmt="xyxy").numpy()
    detections = sv.Detections(xyxy=corners)

    # Labels show the scaled coordinates in the *input* format, not in xyxy.
    labels = []
    for phrase, box in zip(phrases, pixel_boxes):
        labels.append(f"{phrase}: {(box).to(int).tolist()}")

    annotator = sv.BoxAnnotator(text_padding=-12)
    canvas = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    if not rgb_switch:
        # A second swap undoes the first one, keeping the caller's channel order.
        canvas = cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR)
    return annotator.annotate(scene=canvas, detections=detections, labels=labels)


def plot_bbox_from_layout(layout):
    """Render one frame's layout as boxes on a white canvas with a white border.

    Args:
        layout: Sequence of ``[phrase, [x0, y0, x1, y1]]`` entries; the
            coordinates are handed to ``annotate`` in ``xyxy`` format.

    Returns:
        A ``532 x 532 x 3`` uint8 array: the annotated ``512 x 512`` canvas
        placed 10 pixels in from the top-left corner of a white background.
    """
    canvas_shape = (512, 512, 3)
    padded_shape = (532, 532, 3)
    margin = 10

    coords = torch.Tensor([entry[1] for entry in layout])
    names = [entry[0] for entry in layout]

    blank = np.full(canvas_shape, 255, dtype=np.uint8)
    drawn = annotate(blank, coords, names, in_fmt='xyxy')

    framed = np.full(padded_shape, 255, dtype=np.uint8)
    framed[margin:canvas_shape[0] + margin, margin:canvas_shape[1] + margin] = drawn
    return framed


def check_response_8frames(line_list):
    """Check that a model response has the expected 9-frame structure.

    The response must consist of exactly 11 non-empty lines: nine lines that
    start with ``Frame_1`` .. ``Frame_9`` and contain ``caption:``, followed
    by a line containing ``Reasoning`` and a final line containing ``Score``.

    Args:
        line_list: The response split into non-empty lines (list of str).

    Returns:
        True when every check passes. False otherwise, in which case the
        first problem found and the offending ``line_list`` are printed.
    """
    n_frames = 9

    def first_problem():
        # Report only the first violation, in the same order the checks run.
        if len(line_list) != n_frames + 2:
            return "ERROR: line_list should be of size 9+2=11"
        for i in range(n_frames):
            if not line_list[i].startswith(f"Frame_{i+1}"):
                return f"ERROR: line_list line {i} does not start with Frame_{i+1}"
            if "caption:" not in line_list[i]:
                return f"ERROR: line_list[{i}] does not contain caption"
        if "Reasoning" not in line_list[-2]:
            return "ERROR: no reasoning in second last line of line_list"
        if "Score" not in line_list[-1]:
            return "ERROR: no importance score in last line of line_list"
        return None

    problem = first_problem()
    if problem is None:
        return True
    print(problem)
    print(line_list)
    return False


# One coordinate: an integer part with an optional fractional part.
_LAYOUT_NUMBER = r"(\d+(?:\.\d+)?)"
# One `["object name", [n, n, n, n]]` pair; group 1 is the name, 2-5 the numbers.
_LAYOUT_PAIR_RE = re.compile(
    r'\["([^"]+)", \[' + ", ".join([_LAYOUT_NUMBER] * 4) + r"\]\]"
)


def parse_layouts(raw_layouts):
    """Extract object boxes and captions from raw ``Frame_i: ...`` lines.

    Args:
        raw_layouts: List of strings, one per frame, each holding
            ``["name", [a, b, c, d]]`` pairs and a trailing ``caption:`` part.

    Returns:
        A tuple ``(parsed_layouts, parsed_captions)``. ``parsed_layouts`` has
        one list per frame containing ``[name, [float, float, float, float]]``
        entries in order of first appearance. ``parsed_captions`` has one
        string per frame: the text after the last ``caption:``, stripped,
        lower-cased, with double quotes and periods removed.

    Note:
        Coordinate ordering (left < right, top < bottom) is not validated.
    """
    parsed_layouts = []
    parsed_captions = []
    for raw in raw_layouts:
        caption = raw.split('caption:')[-1].strip()
        parsed_captions.append(caption.replace("\"", "").lower().replace(".", ""))

        # Normalise single quotes so names quoted either way match the pattern.
        remaining = raw.replace("'", "\"")
        all_pairs = []
        while True:
            match = _LAYOUT_PAIR_RE.search(remaining)
            if match is None:
                break
            name = match.group(1)
            box = [float(value) for value in match.groups()[1:]]
            all_pairs.append([name, box])
            # Drop every occurrence of the matched text so exact duplicates
            # within a frame are reported only once.
            remaining = remaining.replace(match.group(0), '')
        parsed_layouts.append(all_pairs)
    return parsed_layouts, parsed_captions


def prompt_to_8frame_layouts(args, prompt, detailed_in_context_examples = True):
    """Ask the chat model for a 9-keyframe bounding-box layout of ``prompt``.

    A system prompt plus three in-context examples are sent together with the
    user's prompt. The reply is split into non-empty lines; the last line is
    read as the score, the second-to-last as the reasoning, and the remaining
    lines are parsed with ``parse_layouts``.

    Args:
        args: Namespace with a ``gpt_type`` attribute; ``args.gpt_name`` is
            set from the module-level ``gpt_name`` table as a side effect.
        prompt: Scene description to generate layouts for.
        detailed_in_context_examples: Accepted for compatibility; not used.

    Returns:
        ``(parsed_layouts, parsed_captions, reasoning, confidence)`` on
        success, or ``None`` when the request failed 10 times with a
        service-unavailable, rate-limit or invalid-request error.

    Raises:
        ValueError, IndexError, AssertionError: If the reply cannot be parsed
            (for example a non-numeric score line); these are not retried.
    """
    args.gpt_name = gpt_name[args.gpt_type]
    system_prompt = (
        "Assuming the frame size is normalized to the range 0-1, you need to give a possible 9-frame layout with bounding boxes of the listed entities of a given scene description. "
        "Each object in the image is one rectangle or square box in the layout and size of boxes should be as large as possible. "
        "You need to generate layouts from the close up camera view of the event. "
        "The layout difference between two adjacent frames must be small, considering the small interval. "
        "You also need to generate a caption that best describes the image for each frame. After generating all frames, add reasoning to your design."
        "Use format: Frame_1: [[object1, [left, top, right, bottom]], [object2, [left, top, right, bottom]], ..., [object_n, [left, top, right, bottom]]], caption:...\nFrame_2: [[object1, [left, top, right, bottom]], [object2, [left, top, right, bottom]], ..., [object_n, [left, top, right, bottom]]], caption:...\n...\nFrame_9: [[object1, [left, top, right, bottom]], [object2, [left, top, right, bottom]], ..., [object_n, [left, top, right, bottom]]], caption:...\nReasoning:...\nScore:...'}"
    )

    # In-context examples as (user message, assistant reply) pairs. The reply
    # strings are sent verbatim, including their embedded line breaks.
    few_shot_examples = [
        (
            "Provide bounding box coordinates for the prompt: a cat jumps from chair to table",
            """Frame_1: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.4, 0.55, 0.6, 0.6]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: A cat is standing on the table, preparing to jump.\nFrame_2: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.45, 0.5, 0.65, 0.55]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat has just left the table and is in mid-air.\nFrame_3: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.5, 0.45, 0.7, 0.5]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat is in the middle of the jump, right between the table and the chair. \nFrame_4: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.55, 0.4, 0.75, 0.45]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat continues her flight, reaching the highest point.\nFrame_5: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.6, 0.45, 0.8, 0.5]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat starts descending towards the chair; she is still in the air.\nFrame_6: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.65, 0.5, 0.85, 0.55]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat is about to land on the chair. \nFrame_7: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.15, 0.6, 0.35, 0.65]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat lands on the chair skilfully. \nFrame_8: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.15, 0.6, 0.35, 0.65]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat sitting on the chair after the jump. 
\nFrame_9: [["a red dining table with round shape", [0.1, 0.6, 0.9, 1.0]], ["an orange cat", [0.15, 0.6, 0.35, 0.65]], ["a yellow wood chair", [0.05, 0.6, 0.4, 1.0]]], caption: The cat still resting on the chair, with the jump completed successfully.\n\nReasoning: The cat\'s bounding box in Frame 1 overlaps with the bounding box of the table, showing the cat in the take-off position. From Frame 2 to Frame 6, the cat is shown in various stages mid-flight, with the cat\'s bounding box moving closer to the chair with each frame. This demonstrates the cat\'s jump from the table to the chair. This jump is confirmed in Frame 7 as the cat\'s bounding box is now overlapping with the bounding box of the chair, showing that the cat has landed. Frame 8 and 9 show the cat resting on the chair after the jump. The smooth increment in the cat\'s coordinates, moving from the table to the chair, justifies the small intervals between frames. The size of the bounding boxes has been kept as large and consistent as possible for clear visualization. Finally, the spatial control for this prompt is important, since the bounding box layouts give us rich information about how the cat moves from chair to table. Therefore, the importance score for spatial control should be very high.\nScore: 0.3""",
        ),
        (
            "Provide bounding box coordinates for the prompt: sandra smith in a blue dress and ed henry in a red tie sitting next to each other",
            """Frame_1: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith in a blue dress and ed henry in a red tie sitting next to each other.\nFrame_2: [["sandra smith in a blue dress, full-body view", [0.1, 0.25, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith and ed henry continue their discussion.\nFrame_3: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: ed henry makes a point while sandra listens. \nFrame_4: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith responds to ed henry's point.\nFrame_5: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: the discussion continues between sandra smith and ed henry.\nFrame_6: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra and ed henry pause to think. \nFrame_7: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: ed makes another point in the discussion. \nFrame_8: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith responds while ed henry listens. 
\nFrame_9: [["sandra smith in a blue dress, full-body view", [0.1, 0.2, 0.45, 0.9]], ["ed henry in a red tie, full-body view", [0.55, 0.2, 0.9, 0.9]]], caption: sandra smith and ed henry conclude their discussion next to each other.\n\nReasoning: Since Sandra Smith and Ed Henry are sitting next to each other throughout all the frames, their bounding boxes stay consistent. Close-up camera views are used to focus on both of the individuals, highlighting their reactions and interactions. This situation involves less movement, so the small interval between frames lets us capture the ongoing discussion between Sandra and Ed, each in their respective attires. Bounding boxes for the blue dress and red tie are incorporated within the Sandra's and Ed's bounding boxes. The coordination depicted through bounding boxes suggests that the spatial positioning of Sandra and Ed remains relatively unchanged across the frames. Finally, the spatial control for this prompt is not important, since sandra smith and ed henry are sitting, so their bounding box can just be static without movement. Therefore, the importance score for spatial control should be very low.\nScore: 0.0""",
        ),
        (
            "Provide bounding box coordinates for the prompt: a band playing at a rock concert",
            """Frame_1: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: guitarist and drummer preparing to play.\nFrame_2: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: guitarist is strumming and drummer begins to hit the drums.\nFrame_3: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the guitarist continues playing while the drummer beats rhythmically. \nFrame_4: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the singer enters the stage.\nFrame_5: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the singer starts singing, the band is now playing together.\nFrame_6: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the band is in full swing playing a song. 
\nFrame_7: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the singer hits a high note. \nFrame_8: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: guitarist plays a groundbreaking solo. \nFrame_9: [["rock concert stage", [0.0, 0.3, 1.0, 1.0]], ["a man playing drum, full-body view", [0.35, 0.4, 0.65, 0.9]], ["a woman singing, full-body view", [0.7, 0.45, 1.0, 0.95]], ["a man playing guitar, full-body view", [0.05, 0.4, 0.3, 0.9]]], caption: the band continues to rock the concert.\n\nReasoning: Each frame represents a different moment in the band's performance. The bounding boxes of the band members overlap with the rock stage because the members are standing on the stage. The bounding boxes provide a close-up perspective on the performing band members, giving the viewer a clear view of the band's performance. The constant location of the bounding boxes for the band members and the stage throughout the frames reflects the unchanging positions of these entities during the concert. Finally, the importance level of spatial control for this prompt is below average, since the bounding boxes of the musicians don't need large movement, and the bounding box of the stage can also be static. Therefore, the importance score for spatial control should be below average.\nScore: 0.1""",
        ),
    ]

    messages = [{"role": "system", "content": system_prompt}]
    for example_input_prompt, example_output_prompt in few_shot_examples:
        messages.append({"role": "user", "content": example_input_prompt})
        messages.append({"role": "assistant", "content": example_output_prompt})
    user_prompt = "Provide bounding box coordinates for the prompt: " + prompt
    messages.append({"role": "user", "content": user_prompt})

    # Only the API call is retried; parsing errors below propagate to the caller.
    max_attempts = 10
    for n_trials in range(1, max_attempts + 1):
        try:
            response = openai.ChatCompletion.create(engine=deployment_name, messages=messages, max_tokens=2400, n=1)
        except openai.error.ServiceUnavailableError:
            print('OpenAI ServiceUnavailableError.\tWill try again in 5 seconds.')
            time.sleep(5)
        except openai.error.RateLimitError:
            print('OpenAI RateLimitError.\tWill try again in 5 seconds.')
            time.sleep(5)
        except openai.error.InvalidRequestError as e:
            # NOTE: despite the message, the same request is re-sent unchanged.
            print(e)
            print('Input too long. Will shrink the prompting examples.')
            print(deployment_name)
        else:
            break
        print("another attempt", n_trials)
    else:
        print(f"ERROR: cannot generate layouts for prompt {prompt}")
        return

    content = response.choices[0]["message"]["content"]
    line_list = [line for line in content.split('\n') if line != '']
    # Diagnostic only: structural problems are printed, not enforced here.
    check_response_8frames(line_list)
    reasoning = line_list[-2]
    confidence = float(line_list[-1].split(":")[-1].strip())
    parsed_layouts, parsed_captions = parse_layouts(line_list[:-2])

    # Sanity-check the parsed structure: [name, [float, float, float, float]].
    for frame in parsed_layouts:
        for entry in frame:
            assert isinstance(entry, list)
            assert isinstance(entry[0], str)
            assert isinstance(entry[1], list)
            assert len(entry[1]) >= 4
            assert all(isinstance(value, float) for value in entry[1][:4])
    return parsed_layouts, parsed_captions, reasoning, confidence
    
        
def linear_interpolate_layouts(parsed_layouts, output_frames = 18):
    """Insert linearly interpolated frames between consecutive keyframes.

    For every keyframe except the last, the keyframe itself is emitted
    followed by ``output_frames // len(parsed_layouts) - 1`` in-between
    frames. The last keyframe is used only as an interpolation target and is
    not emitted, so 9 keyframes with the default ``output_frames`` yield 16
    frames.

    Args:
        parsed_layouts: List of frames; each frame is a list of
            ``[name, [x0, y0, x1, y1]]`` entries.
        output_frames: Frame budget used to derive the per-interval count.

    Returns:
        List of frames in the same ``[name, box]`` format. Keyframes are the
        original objects (not copies). An object missing from the next
        keyframe keeps its current box in the in-between frames. An empty
        input returns an empty list.
    """
    input_frames = len(parsed_layouts)
    if input_frames == 0:
        # Nothing to interpolate; also avoids dividing by zero below.
        return []
    frames_to_interpolate = output_frames // input_frames

    expanded_layouts = []
    for curr_frame, next_frame in zip(parsed_layouts, parsed_layouts[1:]):
        expanded_layouts.append(curr_frame)
        next_boxes = {entry[0]: entry[1] for entry in next_frame}
        for j in range(1, frames_to_interpolate):
            weight_curr = (frames_to_interpolate - j) / frames_to_interpolate
            weight_next = j / frames_to_interpolate
            curr_layouts = []
            for entry in curr_frame:
                name, box = entry[0], entry[1]
                if name in next_boxes:
                    box = (weight_curr * np.array(box)
                           + weight_next * np.array(next_boxes[name])).tolist()
                curr_layouts.append([name, box])
            expanded_layouts.append(curr_layouts)
    return expanded_layouts


def main_query(args, video_id, prompt, out_file_name=None):
    """Generate layouts for ``prompt`` and save them as a GIF and a JSON file.

    Args:
        args: Namespace providing ``output_frames``, ``output_dir`` and the
            attributes ``prompt_to_8frame_layouts`` needs.
        video_id: Used as the base name of the ``.gif`` and ``.json`` outputs.
        prompt: Scene description sent to the model.
        out_file_name: Sub-directory of ``args.output_dir`` to write into;
            when None the prompt text itself is used as the directory name.

    Raises:
        RuntimeError: If no layout could be generated, or no frame rendered.
        AssertionError: If the expanded layout does not have 16 frames.
    """
    result = prompt_to_8frame_layouts(args, prompt)
    if result is None:
        # The generator gives up with None after repeated API failures.
        raise RuntimeError(f"layout generation failed for prompt: {prompt}")
    parsed_layouts, parsed_captions, reasoning, confidence = result

    if args.output_frames > len(parsed_layouts):
        # Strip digits and underscores from object names so the same object
        # is matched by name across keyframes during interpolation.
        for frame in parsed_layouts:
            for entry in frame:
                name = ''.join(ch for ch in entry[0] if not ch.isdigit())
                entry[0] = name.replace("_", " ")
        expanded_layouts = linear_interpolate_layouts(parsed_layouts)
        # NOTE(review): every caption is repeated twice, so 9 keyframes give
        # 18 captions for 16 layouts; kept as is, confirm against consumers.
        expanded_captions = [caption for caption in parsed_captions for _ in range(2)]
    else:
        expanded_layouts = parsed_layouts
        expanded_captions = parsed_captions
    if len(expanded_layouts) != 16:
        raise AssertionError("len(expanded_layouts) not equal to 16")

    dir_name = out_file_name if out_file_name is not None else prompt
    prompt_out_path = os.path.join(args.output_dir, dir_name)
    Path(prompt_out_path).mkdir(parents=True, exist_ok=True)

    # Rendering is best-effort per frame: a frame that cannot be drawn is
    # reported and left out of the preview GIF.
    img_list = []
    for frame_idx, layout in enumerate(expanded_layouts):
        try:
            img_list.append(plot_bbox_from_layout(layout))
        except Exception as err:
            print(f"WARNING: could not render frame {frame_idx}: {err}")
    if not img_list:
        raise RuntimeError(f"no layout frame could be rendered for prompt: {prompt}")

    frames = [Image.fromarray(img) for img in img_list]
    # The first frame is the base image, so only the rest are appended.
    frames[0].save(
        os.path.join(prompt_out_path, f"{video_id}.gif"),
        format="GIF",
        append_images=frames[1:],
        save_all=True,
        duration=200,
        loop=0,
    )

    dict_ = {
        'layouts': expanded_layouts,
        'captions': expanded_captions,
        'reasoning': reasoning,
        'score': confidence,
    }
    with open(os.path.join(prompt_out_path, f'{video_id}.json'), "w") as outfile:
        json.dump(dict_, outfile, indent=4, sort_keys=True)







#%%
if __name__ == '__main__':
    # Command-line entry point: read prompts from a JSON file and generate
    # one layout (GIF + JSON) per prompt in the selected index range.

    parser = argparse.ArgumentParser(prog='2nd layout generation step')
    parser.add_argument('--output_frames', type=int, default=16)
    parser.add_argument('--gpt_type', type=str, default='gpt-4', choices=list(gpt_name.keys()))
    parser.add_argument('--range_start', type=int, default=0)
    parser.add_argument('--range_end', type=int, default=-1)
    parser.add_argument('--num_repeats', type=int, default=5)
    # Both paths are used unconditionally below, so fail early with a usage
    # message instead of a TypeError when one is missing.
    parser.add_argument("--input_json", type=str, required=True, help="a json file for single-scene prompts")
    parser.add_argument('--output_dir', type=str, required=True, help='output directory for gpt layouts')
    args = parser.parse_args()

    with open(args.input_json) as f:
        json_data = json.load(f)

    if args.range_end == -1:
        args.range_end = len(json_data)

    os.makedirs(args.output_dir, exist_ok=True)

    failed_prompt_list = []
    prompts = list(json_data.values())
    for i, prompt in tqdm(enumerate(prompts), total=len(prompts)):
        if not (args.range_start <= i < args.range_end):
            continue
        r = 0
        bad_count = 0
        # Stops after the first successful generation, or after 5 failures.
        while r < args.num_repeats and bad_count < 5:
            try:
                main_query(args, r, prompt, out_file_name=None)
                r += 1
                break
            except Exception as err:
                # Keep going on any generation error, but let Ctrl-C through.
                print(f"attempt {bad_count + 1} failed for prompt {prompt!r}: {err}")
            bad_count += 1
        if r == 0 and bad_count > 0:
            failed_prompt_list.append(prompt)

    if failed_prompt_list:
        print(f"{len(failed_prompt_list)} prompt(s) could not be processed:")
        for failed_prompt in failed_prompt_list:
            print(f"  {failed_prompt}")
            
