import os
import torch
from diffusers import StableDiffusionGLIGENPipeline
from diffusers.utils import load_image
import argparse
import pandas as pd
import json
from tqdm import tqdm



def _resolve_device(cuda_number):
    """Return the torch device string for the requested CUDA index.

    ``-1`` forces the CPU. Any other value selects ``cuda:<index>`` when CUDA
    is available, then falls back to ``mps`` if that backend is available,
    and finally to the CPU.
    """
    if cuda_number == -1:
        return "cpu"
    if torch.cuda.is_available():
        return f"cuda:{cuda_number}"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def _direction_suffix(direction):
    """Return the phrase suffix describing which way an object is facing.

    Returns an empty string when ``direction`` is ``None`` or contains none of
    the recognised keywords. Keywords are checked in a fixed priority order
    (front/forward, back, left, right), so the first match wins.
    """
    if direction is None:
        return ""
    if "front" in direction or "forward" in direction:
        return " that is facing toward"
    if "back" in direction:
        return " that is facing backward"
    if "left" in direction:
        return " that is facing to the left"
    if "right" in direction:
        return " that is facing to the right"
    return ""


def _parse_layouts(layouts):
    """Convert layout suggestions into parallel GLIGEN phrase and box lists.

    Each layout is indexed as ``[label, box, ..., direction]``: the first item
    is the object label, the second a 4-element box, and the last the facing
    direction (or ``None``). Layouts whose box does not have exactly four
    values are skipped. A falsy ``layouts`` yields two empty lists.

    Returns:
        A ``(phrases, boxes)`` tuple where every box is
        ``[xmin, ymin, xmax, ymax]``.
    """
    phrases = []
    boxes = []
    for layout in layouts or ():
        box = layout[1]
        if len(box) != 4:
            continue

        # The label's last whitespace-separated token is dropped
        # (presumably an instance index such as "dog 1" - TODO confirm).
        phrase = " ".join(layout[0].split()[:-1]) + _direction_suffix(layout[-1])

        # The box arrives as [x, y, width, height]; the far corner is capped
        # at 1.0 (coordinates are presumably normalised to [0, 1]).
        xmin, ymin, width, height = box
        boxes.append([xmin, ymin, min(xmin + width, 1.0), min(ymin + height, 1.0)])
        phrases.append(phrase)
    return phrases, boxes


def main(arg):
    """Generate one GLIGEN image per dataset entry and save it as ``<id>.png``.

    Args:
        arg: Parsed command-line namespace. This function reads ``arg.cuda``
            (device index, ``-1`` for CPU), ``arg.info_dir`` (path to a JSON
            file whose ``"data"`` list holds the entries) and
            ``arg.output_dir`` (directory that receives the PNG files).

    Each entry must provide ``"prompt"`` and ``"id"``; its
    ``"llm_layout_suggestions"`` may be missing or empty, in which case the
    image is generated with a single empty phrase and an all-zero box.
    """
    from accelerate.utils import set_seed
    set_seed(0)

    cur_device = _resolve_device(arg.cuda)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(arg.output_dir, exist_ok=True)

    # Half precision is only used on accelerators: diffusers warns that
    # float16 pipelines are not supported on the CPU, so fall back to float32
    # there (the fp16 weight files are simply upcast on load).
    dtype = torch.float32 if cur_device == "cpu" else torch.float16

    # Generate an image described by the prompt and
    # insert objects described by text at the region defined by bounding boxes
    pipe = StableDiffusionGLIGENPipeline.from_pretrained(
        "masterful/gligen-1-4-generation-text-box",
        variant="fp16",
        torch_dtype=dtype,
        safety_checker=None,
    )
    pipe = pipe.to(cur_device)
    pipe.set_progress_bar_config(disable=True)

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(arg.info_dir, "r", encoding="utf-8") as file:
        entries = json.load(file)["data"]

    # NOTE(review): arg.direction is not consulted here; facing-direction
    # hints are always appended when a layout carries one - confirm intended.
    for entry in tqdm(entries):
        phrases, boxes = _parse_layouts(entry.get("llm_layout_suggestions"))

        images = pipe(
            prompt=entry["prompt"],
            # Placeholder phrase/box when no usable layout is available.
            gligen_phrases=phrases if phrases else [""],
            gligen_boxes=boxes if boxes else [[0, 0, 0, 0]],
            output_type="pil",
            gligen_scheduled_sampling_beta=0.3,
            num_inference_steps=50,
        ).images

        images[0].save(os.path.join(arg.output_dir, f"{entry['id']}.png"))
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean such as ``True``/``false``/``1``/``no``.

        ``type=bool`` would treat every non-empty string (including
        ``"False"``) as ``True``, so the text is interpreted explicitly.
        """
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y", "on"):
            return True
        if text in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", type=int, default=7,
                        help="CUDA device index; -1 forces the CPU")
    parser.add_argument("--info_dir", type=str, default="FILL YOUR DATASET HERE",
                        help="path to the JSON dataset file")
    parser.add_argument("--output_dir", type=str, default="FILL YOUR OUTPUT DIRECTORY HERE",
                        help="directory that receives the generated PNG files")

    # The options below are accepted on the command line but are not read by
    # main() in this file.
    parser.add_argument("--few_shot_layout", type=int, default=4)
    # Accepts "--direction True/False" as before, and a bare "--direction".
    parser.add_argument("--direction", type=_str2bool, nargs="?", const=True, default=False)
    parser.add_argument("--llama_size", type=str, default="8B")
    parser.add_argument("--num_repeat", type=int, default=1)

    arg = parser.parse_args()
    main(arg)
