import argparse
import torch

from LLaVA.llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from LLaVA.llava.conversation import conv_templates, SeparatorStyle
from LLaVA.llava.model.builder import load_pretrained_model
from LLaVA.llava.utils import disable_torch_init
from LLaVA.llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path

import requests
from PIL import Image
from io import BytesIO
from transformers import TextStreamer
import json, os
from tqdm import tqdm
import time

from detector import Detector
from retriever import ClipRetriever

def load_image(image_file, timeout=30):
    """Load an image from a local path or an HTTP(S) URL and return it in RGB mode.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the remote server; only used for URLs.

    Returns:
        The decoded image converted to RGB.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if image_file.startswith(('http://', 'https://')):
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(image_file, timeout=timeout)
        # Surface 4xx/5xx here instead of passing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file

    # convert() yields a fully loaded, independent image, so the underlying
    # file handle can be released as soon as the conversion is done.
    with Image.open(source) as image:
        return image.convert('RGB')


def _format_question(question, options):
    """Render a multiple-choice question and its options as prompt text."""
    option_lines = "".join(f"{option}. {content}\n" for option, content in options.items())
    return (
        "Question: " + question + "\nOptions:\n"
        + option_lines
        + 'Please give a single character from the options above. \n'
    )


def _gather_concept_references(concept, database):
    """Load the reference image stored for *concept* and describe it for the prompt.

    Returns:
        A tuple ``(images, image_sizes, extra_info, concept_name)`` where
        ``extra_info`` is the numbered ``<image>`` / name / info text and
        ``concept_name`` is the name recorded in the database for the
        (last) reference image.
    """
    reference_paths = [database["concept_dict"][concept]["image"]]

    images = []
    image_sizes = []
    descriptions = []
    concept_name = None
    for position, ref_path in enumerate(reference_paths, start=1):
        img = load_image(ref_path)
        images.append(img)
        image_sizes.append(img.size)

        tag = database["path_to_concept"][ref_path]
        entry = database["concept_dict"][tag]
        concept_name = entry["name"]
        info = entry["info"]
        descriptions.append(f"{position}.<image>\n Name: <{concept_name}>, Info: {info}\n")

    return images, image_sizes, "".join(descriptions), concept_name


def main(args):
    """Run the text-only multiple-choice evaluation and save the answers as JSON.

    The evaluation file maps a concept name to ``{question_id: sample}``, where
    each sample provides ``"question"``, ``"option"`` (a mapping of option label
    to text) and ``"correct_answer"``. The placeholder ``sks`` in a question is
    replaced with ``<name>``. With ``args.retrieval`` the concept's reference
    image and description from the datastore are prepended to the prompt.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``model_base``, ``load_8bit``, ``load_4bit``, ``device``,
            ``conv_mode``, ``eval_file``, ``retrieval``, ``temperature``,
            ``max_new_tokens`` and ``debug``. ``conv_mode`` is filled in with
            the inferred mode when it was not given.

    Raises:
        KeyError: If the conversation mode is unknown, or (with retrieval) a
            concept is missing from the datastore.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, _context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name,
        args.load_8bit, args.load_4bit, device=args.device,
    )

    lowered_name = model_name.lower()
    if "phi" in lowered_name:
        conv_mode = "phi3_instruct"
    elif "v1" in lowered_name:
        conv_mode = "llava_v1"
    else:
        conv_mode = "llava_v0"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    # Looked up once, before any data is read, so an unknown mode fails early.
    conv_template = conv_templates[args.conv_mode]

    with open(args.eval_file, "r", encoding="utf-8") as f:
        test_set = json.load(f)

    database = None
    if args.retrieval:
        # NOTE(review): the detector is never used below; it is kept only in
        # case its construction has side effects - confirm and remove.
        detector = Detector()
        with open("benchmark_data/yollava-data/datastore/database.json", "r", encoding="utf-8") as f:
            database = json.load(f)

    # Create the output directory up front so a finished run cannot be lost
    # to a missing folder when the results are written.
    output_dir = "results/QA/text"
    os.makedirs(output_dir, exist_ok=True)

    results = []

    for name, samples in tqdm(test_set.items()):
        for Qid, sample in samples.items():
            conv = conv_template.copy()

            images = []
            image_sizes = []
            extra_info = ""
            # Name substituted for the "sks" placeholder. Kept separate from
            # the loop variable `name` so the datastore lookup for the next
            # sample of this concept still uses the evaluation-file key.
            display_name = name
            if args.retrieval:
                images, image_sizes, extra_info, display_name = _gather_concept_references(
                    f"<{name}>", database
                )

            image_tensor = None
            if images:
                image_tensor = process_images(images, image_processor, model.config)
                if isinstance(image_tensor, list):
                    image_tensor = [t.to(model.device, dtype=torch.float16) for t in image_tensor]
                else:
                    image_tensor = image_tensor.to(model.device, dtype=torch.float16)

            question = sample["question"].replace("sks", f"<{display_name}>")
            q = _format_question(question, sample["option"])

            # There is no query image in this text-only task; the only images
            # are the retrieved references described by extra_info.
            query = f"{extra_info}{q}" if args.retrieval else q

            conv.append_message(conv.roles[0], query)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
            streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor,
                    image_sizes=image_sizes,
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    max_new_tokens=args.max_new_tokens,
                    streamer=streamer,
                    use_cache=True)

            outputs = tokenizer.decode(output_ids[0]).replace("<s>", "").replace("</s>", "").strip()

            results.append({
                "img_path": Qid,
                "question": q,
                "option": sample["option"],
                "answer": outputs,
                "correct_answer": sample["correct_answer"],
            })

            if args.debug:
                print("\n", {"prompt": prompt, "outputs": outputs}, "\n")

    save_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())

    with open(f"results/QA/text/results-{save_time}.json", "w") as f:
        json.dump(results, f, indent = 4)

if __name__ == "__main__":
    # Command-line options, listed in the order they are registered.
    cli_options = (
        ("--model-path", {"type": str, "default": "liuhaotian/llava-v1.5-7b"}),
        ("--model-base", {"type": str, "default": None}),
        ("--device", {"type": str, "default": "cuda"}),
        ("--conv-mode", {"type": str, "default": None}),
        ("--temperature", {"type": float, "default": 0.2}),
        ("--max-new-tokens", {"type": int, "default": 512}),
        ("--load-8bit", {"action": "store_true"}),
        ("--load-4bit", {"action": "store_true"}),
        ("--debug", {"action": "store_true"}),
        ("--eval-file", {"type": str, "required": True}),
        ("--retrieval", {"action": "store_true"}),
        ("--index-path", {"type": str, "default": None}),
    )

    parser = argparse.ArgumentParser()
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)

    # Parse the command line and hand the namespace to the evaluation entry point.
    args = parser.parse_args()
    main(args)