import argparse
import multiprocessing as mp
import os
import time
import cv2
import tqdm
import sys

from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger

sys.path.insert(0, 'models/grit_src/third_party/CenterNet2/projects/CenterNet2/')
from centernet.config import add_centernet_config
from models.grit_src.grit.config import add_grit_config

from models.grit_src.grit.predictor import VisualizationDemo
import json
from utils.util import resize_long_edge_cv2


# constants
WINDOW_NAME = "GRiT"


def dense_pred_to_caption(predictions):
    boxes = predictions["instances"].pred_boxes if predictions["instances"].has("pred_boxes") else None
    object_description = predictions["instances"].pred_object_descriptions.data
    new_caption = ""
    for i in range(len(object_description)):
        new_caption += (object_description[i] + ": " + str([int(a) for a in boxes[i].tensor.cpu().detach().numpy()[0]])) + "; "
    return new_caption

def dense_pred_to_caption_with_norm(predictions, image_width, image_height):
    boxes = predictions["instances"].pred_boxes if predictions["instances"].has("pred_boxes") else None
    object_description = predictions["instances"].pred_object_descriptions.data
    confidence = predictions["instances"].scores.data
    new_caption = ""
    for i in range(len(object_description)):
        normalized_coordinates = [float(coord) / dim for coord, dim in zip(boxes[i].tensor.cpu().detach().numpy()[0], [image_width, image_height, image_width, image_height])]
        # Format the coordinates to two decimal places
        formatted_coordinates = [format(coord, '.2f') for coord in normalized_coordinates]
        # Format the confidence to two decimal places
        formatted_confidence = format(confidence[i].item(), '.2f')
        new_caption += (object_description[i] + ": " + str(formatted_coordinates)) + ", confidence: " + str(formatted_confidence) + "; "
    # print(new_caption)
    return new_caption



def setup_cfg(args, device_id):
    cfg = get_cfg()
    if args["cpu"]:
        cfg.MODEL.DEVICE="cpu"
    else:
        cfg.MODEL.DEVICE = f"cuda:{device_id}"  # Set the device as a string in the format "cuda:<device_id>"
    add_centernet_config(cfg)
    add_grit_config(cfg)
    cfg.merge_from_file(args["config_file"])
    cfg.merge_from_list(args["opts"])
    # Set score_threshold for builtin models
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args["confidence_threshold"]
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args["confidence_threshold"]
    if args["test_task"]:
        cfg.MODEL.TEST_TASK = args["test_task"]
    cfg.MODEL.BEAM_SIZE = 1
    cfg.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False
    cfg.USE_ACT_CHECKPOINT = False
    cfg.freeze()
    return cfg


def get_parser(device, pretrained_model_path="pretrained_models/grit_b_densecap_objectdet.pth"):
    arg_dict = {'config_file': "models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml", 'cpu': False, 'confidence_threshold': 0.5, 'test_task': 'DenseCap', 'opts': ["MODEL.WEIGHTS", pretrained_model_path]}
    if device == "cpu":
        arg_dict["cpu"] = True
    return arg_dict

def image_caption_api(image_src, device):
    args2 = get_parser(device)
    print(args2)
    cfg = setup_cfg(args2)
    demo = VisualizationDemo(cfg)
    if image_src:
        img = read_image(image_src, format="BGR")
        img = resize_long_edge_cv2(img, 384)
        predictions, visualized_output = demo.run_on_image(img)
        new_caption = dense_pred_to_caption(predictions)
    return new_caption



