import json
import openai
from tqdm import tqdm
from argparse import ArgumentParser
import time
import os
from get_vlm_res import Gemini, GPT, Qwen, Claude3_Opus, Claude3_Sonnet
from utils import modify_options

def retry(attempts=3):
    """Build a decorator that re-invokes a function when it raises.

    The decorated function is called up to ``attempts`` times. The first
    successful return value is passed straight through. Each failure is
    printed; between failed attempts the wrapper sleeps for one second.

    Args:
        attempts: Maximum number of calls to make before giving up.

    Returns:
        A decorator. The wrapped function returns the original result on
        success, or ``None`` when every attempt raised (or when
        ``attempts`` is not positive, in which case the function is never
        called).
    """
    # Function-scope import so the block does not depend on a file-level import.
    import functools

    def decorator(func):
        # Preserve the wrapped function's name/docstring for logs and debugging.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for i in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # Best-effort by design: report the failure and keep going.
                    print(f"Attempt {i+1}/{attempts} failed: {e}")
                    if i == attempts - 1:
                        # Out of attempts; skip the pointless final sleep.
                        return None
                    time.sleep(1)
            # Only reached when attempts <= 0.
            return None
        return wrapper
    return decorator

@retry(attempts=3)
def get_response(prompt, video_path, model_name, setting):
    """Send one prompt about a video to the selected vision-language model.

    Args:
        prompt: Full text prompt passed to the model backend.
        video_path: Path of the video, forwarded unchanged to the backend.
        model_name: One of "GPT-4V", "Gemini-Pro", "Claude3-Opus" or
            "Qwen-VL-Max"; any other name produces ``None``.
        setting: Keyframe setting, must be 'Random' or 'Extracted'.

    Returns:
        Whatever the chosen backend returns, or ``None`` for an unknown
        model name. Because the function is wrapped by ``retry``, any
        exception (including a failed ``setting`` assertion) is retried and
        finally turned into ``None`` as well.
    """
    assert setting in ['Random', 'Extracted']

    # Map each supported model name to the backend callable that serves it.
    backends = {
        "GPT-4V": GPT,
        "Gemini-Pro": Gemini,
        "Claude3-Opus": Claude3_Opus,
        "Qwen-VL-Max": Qwen,
    }
    backend = backends.get(model_name)
    if backend is None:
        return None
    return backend(prompt, video_path, setting)


# Scenario preamble placed at the start of every prompt. The keys are the
# values accepted by the --setting command-line argument in main(), which
# looks the preamble up as prompt_dict[args.setting].
prompt_dict = {
    "XR": """You are an AI visual assistant. Here are sequential images of Mixed-Reality combining GUI interface and real world, which are selected from a GUI video.""",
    "software": """You are an AI visual assistant. Here are sequential GUI interface images of a specific software, which are selected from a GUI video.""",
    "website": """You are an AI visual assistant. Here are sequential GUI interface images of a desktop website, which are selected from a GUI video.""",
    "mobile": """You are an AI visual assistant. Here are sequential GUI mobile interface images, which are selected from a GUI video.""",
    "multi":  """You are an AI visual assistant. Here are sequential GUI interface images of interaction among multiple softwares and websites, which are selected from a GUI video.""",
    "IOS": """You are an AI visual assistant. Here are sequential GUI IOS interface images, which are selected from a GUI video.""",
}


# Per-task instruction inserted right after the scenario preamble. The keys
# are the task names that main() builds for every item and uses to index
# this dict.
# NOTE(review): some strings contain typos ("base on", "be provide"). They
# are sent to the model verbatim, so correcting them would change the prompts
# and may affect comparability with earlier runs -- confirm before editing.
task_prompt = {
    "Sequential-QA": "This is a question about sequential information in sequential images.",
    "Prediction": "This is a question about predicting the next action base on the previous actions in the sequential images.",
    "Reasoning": "This is a multiple choice question with only one correct answer. This question may need multiple steps of reasoning according to the vision information in sequential images.",
    "Description1": "Please give me a detail description of these sequential images.",
    "Description2": "Offer a thorough analysis of these sequential images",
    "Caption": "Please give me a concise caption of these sequential images.",
    "static QA": "This is a question about static information such as text, icon, layout in these sequential images.",
    "MCQA": "This is a multiple choice question with only one correct answer. This question may require sequential analysis ability to the vision information in these sequential images.",
    "Conversation1": "Act as an assistant to answer the user's question in these sequential images.",
    "Conversation2": "This is a multi-turn conversation task. You will be provide the first round conversation and act as an assistant to answer the user's question in the second round according to these sequential images."
}

# Output-format instruction appended to every prompt in main(): asks the model
# to describe the images, analyse the question, and reply as JSON with the
# keys 'Description', 'Analysis' and 'Answer'.
Notice = """You can first provide an overall description of these sequential images, and then analyze the user's question according to the sequential images and description. Finally, give an answer based on this description and the image information. 
Please format your output in a Json format, with key 'Description' for the description of these sequential images, key 'Analysis' for your analysis on the user's question and key 'Answer' for your answer to the User's question."""
def _question_with_options(section):
    """Return a multiple-choice question followed by its first four options."""
    options = section['Options']
    if isinstance(options, dict):
        # Dict-shaped options are converted by the project helper first.
        options = modify_options(options)
    return ' '.join([section['Question']] + [options[i] for i in range(4)])


def _build_tasks(item):
    """Map every task name to its question ('q') and reference answer ('a')."""
    description_q = "Please provide a detailed description of what occurs throughout these sequential GUI images, focusing on the changes in the GUI elements or scenes rather than static aspects of a single frame."
    caption_q = "Please give me a concise caption of these sequential GUI images, focusing on the changes in the GUI elements or scenes rather than static aspects of a single frame."
    conversation = item['Conversation']
    first_turn = "User : " + conversation['User 1'] + ' Assistant: '
    return {
        'Sequential-QA': {
            'q': item['Sequential-QA']['Question'],
            'a': item['Sequential-QA']['Answer'],
        },
        'Prediction': {
            'q': item['Prediction']['Question'],
            'a': item['Prediction']['Answer'],
        },
        'Reasoning': {
            'q': _question_with_options(item['Reasoning']),
            'a': item['Reasoning']['Correct Answer'],
        },
        'Description1': {'q': description_q, 'a': item['Description1']},
        'Description2': {'q': description_q, 'a': item['Description2']},
        'Caption': {'q': caption_q, 'a': item['Caption']},
        'static QA': {
            'q': item['static QA']['Question'],
            'a': item['static QA']['Answer'],
        },
        'MCQA': {
            # Uses the MCQA section's own options (previously the Reasoning
            # options were appended to the MCQA question by mistake).
            'q': _question_with_options(item['MCQA']),
            'a': item['MCQA']['Correct Answer'],
        },
        'Conversation1': {
            'q': first_turn,
            'a': conversation['Assistant 1'],
        },
        'Conversation2': {
            'q': first_turn + conversation['Assistant 1'] + " User: " + conversation['User 2'] + ' Assistant: ',
            'a': conversation['Assistant 2'],
        },
    }


def _build_prompt(setting, task, question):
    """Assemble the scenario preamble, task instruction, notice and question."""
    prefix = prompt_dict[setting] + ' ' + task_prompt[task] + Notice
    if task in ('Conversation1', 'Conversation2'):
        # Conversation questions already carry their own "User :" framing.
        return prefix + f" \n {question}\n"
    return prefix + f" \n Here is the question: {question}\n"


def main():
    """Query a vision-language model for every task of each input item.

    Reads a JSON-lines file (``--input``), takes the items in the half-open
    range ``[--start, --end)``, asks the selected model (``--model``) one
    prompt per task, and appends one JSON line per item to ``--output``.
    Passing ``--output auto`` derives the file name from the other
    arguments. Tasks for which no response was obtained are written without
    an ``MLLM_result`` entry.
    """
    parser = ArgumentParser()
    # Arguments are validated by argparse (not `assert`) so the checks still
    # run under `python -O` and a bad invocation fails before any model call.
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--start", type=int, default=0)
    parser.add_argument("--end", type=int, default=1000)
    # NOTE: 'VideoChat2' and 'ChatUnivi' are accepted for compatibility, but
    # get_response in this file has no branch for them and returns None.
    parser.add_argument("--model", type=str, required=True,
                        choices=["GPT-4V", "Gemini-Pro", "Claude3-Opus", "Qwen-VL-Max", 'VideoChat2', 'ChatUnivi'])
    parser.add_argument("--setting", type=str, required=True,
                        choices=["XR", "software", "website", "mobile", "multi", "IOS"])
    # NOTE: get_response only accepts 'Random' and 'Extracted'; the other two
    # values pass validation here but will not produce any model response.
    parser.add_argument("--keyframe", type=str, required=True,
                        choices=["Description", "Caption", "Random", "Extracted"])
    args = parser.parse_args()

    # Load data from the JSON-lines file, skipping blank lines.
    with open(args.input, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file if line.strip()]

    # Clamp the end index first so the auto-generated name shows the real range.
    args.end = min(args.end, len(data))
    if args.output == 'auto':
        args.output = f"output_{args.model}_{args.setting}_{args.keyframe}_{args.start}_{args.end}.jsonl"
    print(args.output)

    for item in tqdm(data[args.start:args.end], desc="Processing data"):
        tasks = _build_tasks(item)
        video_path = item['video_path']

        for key, value in tasks.items():
            prompt = _build_prompt(args.setting, key, value['q'])
            response = get_response(prompt, video_path, args.model, args.keyframe)
            if response:
                value['MLLM_result'] = response

        write_dict = {
            'id': item.get('id', video_path),
            'video_path': video_path,
            'result': tasks
        }
        # Append after every item so finished work survives a later crash.
        with open(args.output, 'a') as file:
            file.write(json.dumps(write_dict) + '\n')

    print("Done! Answers saved!")


# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
