# ---------------------------------------------------------------------
# Generate the evaluation dataset based on the nuScenes image dataset
# ---------------------------------------------------------------------


import os
import random

import cv2
from tqdm import tqdm

from common.prompt import TRAIN_SYSTEM_PROMPT
from common.request import VLMAgent, NPImageEncode
from common.utils import convert_text_to_json, wrap_json, save_jsonl

# Candidate navigation instructions. One entry is picked at random
# (random.choice) for each query sent to the agent in
# generate_evaluation_dataset and embedded in the per-image text prompt.
NAVIGATION = [
    'Stay in the current lane and straight ahead.',
    'Turn left at the next intersection.',
    'Turn right at the next intersection.',
    'Ready to stop at the next intersection.',
    'Keep speed to 40 mph if possible.'
]

# Instruction prompt added as the first text prompt of every agent query in
# generate_evaluation_dataset. It lists the five allowed actions (Turn-left,
# IDLE, Turn-right, Acceleration, Deceleration), asks for a JSON reply with
# the keys "meta", "description", "reasoning" and "decision", and gives three
# worked examples. The literal starts with a newline because the text begins
# on the line after the opening triple quote.
SYSTEM_PROMPT = """
You are GPT-4V(ision), a comprehensive multi-modal model developed by OpenAI, now functioning as an autonomous driving agent. Your role is to provide detailed, informative, and actionable descriptions for human drivers navigating complex scenarios. You will receive images from the vehicle's onboard camera and are tasked with interpreting these images for decision-making. Based on the visual data, you will convert the scenes into natural language descriptions, reason about the current driving situation, and decide on the appropriate maneuver from the following options:

Action Descriptions:
- Turn-left: Move to the left lane or make a left turn
- IDLE: Maintain current lane and speed
- Turn-right: Move to the right lane or make a right turn
- Acceleration: Increase the vehicle's speed
- Deceleration: Reduce the vehicle's speed

In making your decision, you must consider additional metadata provided by the driver, such as the vehicle's current speed and navigation directives. Your output should consist of a structured response in JSON format containing the metadata, a description of the scene, your reasoning for the suggested action, and the final decision. Here is the structure you should follow:

{
    "meta": "metadata provided by the driver",
    "description": "Your description of the image from the front-view camera",
    "reasoning": "Your reasoning for the action plan",
    "decision": "Your final action choice (e.g., Turn-left, IDLE, Turn-right, Acceleration, Deceleration)"
}

Example-1:
{
    "meta": "The current speed of the ego vehicle is 15 mph, with navigation instructions: Stay in the current lane and straight ahead.",
    "description": "The image captures an urban road scene from the front camera of a vehicle. It shows a two-lane street bordered by orange barriers and dense foliage on the left. The traffic light ahead is red, signaling the vehicle to stop. The road surface is dry, and visibility is good. There are no pedestrians or other obstructions visible in the crosswalk. Another vehicle is seen ahead, positioned correctly within the driving lane.",
    "reasoning": "Given the red traffic light and the directive to continue straight in the current lane, the vehicle must come to a stop to comply with traffic laws and ensure safety. There is no immediate need to change lanes or adjust the path as the road ahead is clear, except for the traffic signal.",
    "decision": "Deceleration"
}

Example-2:
{
    "meta": "The current speed of the ego vehicle is 21 mph, with navigation instructions: Turn right at the next intersection.",
    "description": "The front camera view shows a clear, sunny day on a wide urban road. The road is marked with a pedestrian crossing in the foreground and a central median further ahead. The road features two lanes in the current direction, with the ego vehicle positioned in the left lane. There are several parked cars on both sides of the street, but the right lane appears clear of moving traffic, making it suitable for a lane change. No pedestrians or other immediate obstacles are visible in the vehicle’s path.",
    "reasoning": "With the instruction to turn right and considering the vehicle's speed, it is essential to prepare for a safe lane change to the right. The absence of traffic in the right lane and clear road conditions ahead make this maneuver feasible and safe to execute at the current speed, ensuring that the vehicle aligns correctly for the upcoming right turn.",
    "decision": "Turn-right"
}

Example-3:
{
    "meta": "The current speed of the ego vehicle is 30 mph, with navigation instructions: Turn left at the next intersection.",
    "description": "The image displays a rainy day in an urban setting, taken from the vehicle’s front camera. The road is slick with rain, causing reflections of traffic lights and vehicles, which suggests slippery driving conditions. The street is busy with various vehicles, including buses and cars, indicating moderate traffic flow. The roadway features multiple lanes and a distinct bike lane on the right. Traffic signals are visible and currently green. The environment includes urban buildings and signage for local destinations, under a heavily overcast sky.",
    "reasoning": "Considering the high speed of the ego vehicle and the wet road conditions, it is crucial to reduce speed and increase following distance to prepare safely for a left turn at the next intersection. The presence of other vehicles and the green traffic light allow movement, but caution is advised due to the inclement weather, which can affect visibility and road grip. Anticipating the left turn, the vehicle should position itself in the appropriate lane well ahead of the intersection.",
    "decision": "Deceleration"
}

Your responses should be diverse, informative, and precise, considering both safety and efficiency in your decision-making process."""

# Template for the input side of a sample, with the named placeholders
# {meta} and {description}. It is handed to wrap_json in
# generate_evaluation_dataset; presumably wrap_json fills it from the parsed
# answer via str.format -- confirm against common.utils.
INPUT_FORMAT = """Meta Information: {meta}\nDescription: {description}"""

# Template for the output side of a sample, with positional placeholders
# {0} (reasoning) and {1} (decision). The doubled braces are str.format
# escapes that render as literal "{" and "}" so the result is a JSON object.
# Also handed to wrap_json; how it is filled is not visible in this file.
OUTPUT_FORMAT = """
{{
    "reasoning": "{0}",
    "decision": "{1}"
}}
"""


def _query_agent_for_json(agent, img, max_retries):
    """Query the agent about ``img`` until its reply parses as JSON.

    Each attempt sends the system prompt, a randomly drawn speed
    (5-30 mph) and navigation instruction, the encoded image and a start
    cue, then passes the reply through ``convert_text_to_json``.

    Args:
        agent: Object exposing ``addTextPrompt``, ``addImageBase64`` and
            ``convert_image_to_language`` (a ``VLMAgent`` in this script).
        img: Image array as returned by ``cv2.imread``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The value returned by ``convert_text_to_json`` on the first
        successful attempt, or ``None`` if every attempt failed.
    """
    # The encoded image does not change between attempts, so encode it once.
    encoded_img = NPImageEncode(img)

    for _attempt in range(max_retries):
        agent.addTextPrompt(SYSTEM_PROMPT)
        agent.addTextPrompt(
            f'The current speed of ego vehicle is {random.randint(5, 30)} mph. The navigation information is {random.choice(NAVIGATION)}\n')
        agent.addTextPrompt('The image is captured by the front cameras at current time step.')
        agent.addImageBase64(encoded_img)
        agent.addTextPrompt('Now, begin.')

        # The agent returns (answer, prompt_tokens, completion_tokens,
        # total_tokens, timecost); only the answer text is used here.
        raw_answer = agent.convert_image_to_language()[0]

        # A missing answer counts as a failed attempt so that a persistently
        # failing agent cannot keep this loop running forever.
        if raw_answer is None:
            print("The agent returned no answer. Try again.")
            continue

        parsed = convert_text_to_json(raw_answer)
        if parsed is not None:
            return parsed
        # wrong output format
        print("Failed to convert the text to JSON format. Try again.")

    return None


def generate_evaluation_dataset(agent, root, filelist, max_retries=10,
                                output_path='nusc/nusc_defense_dataset.jsonl'):
    """Generate the evaluation dataset based on the nuScenes image dataset.

    Reads image file names from ``filelist``, takes a fixed slice of them,
    asks the agent for a JSON-formatted answer for every image and writes
    the wrapped answers to ``output_path`` with ``save_jsonl``.

    Images that cannot be read, or for which no valid JSON answer is
    obtained within ``max_retries`` attempts, are reported and skipped.

    Args:
        agent: Vision-language agent used to query each image.
        root: Directory that the names in ``filelist`` are relative to.
        filelist: Path to a text file with one image file name per line.
        max_retries: Maximum number of agent queries per image.
        output_path: Destination passed to ``save_jsonl``.
    """
    with open(filelist, 'r') as f:
        filenames = [line.strip() for line in f]

    # TODO: hard code the number of data and split train and val
    # filenames = filenames[: 50]   # benign eval data
    # filenames = filenames[-100:]  # benign training data
    filenames = filenames[150:200]  # base dataset for word trigger
    eval_dataset = []

    for filename in tqdm(filenames):
        img_path = os.path.join(root, filename)
        img = cv2.imread(img_path)
        # cv2.imread signals an unreadable/missing file by returning None
        # instead of raising, so check before handing the image on.
        if img is None:
            print(f"Failed to read the image {img_path}. Skip this image.")
            continue

        ans = _query_agent_for_json(agent, img, max_retries)
        if ans is None:
            # max retry
            print(f"Failed to get a valid JSON answer after {max_retries} attempts. Skip this image.")
            continue

        eval_dataset.append({
            'messages': wrap_json(ans, TRAIN_SYSTEM_PROMPT, INPUT_FORMAT, OUTPUT_FORMAT)
        })

    save_jsonl(eval_dataset, output_path)


if __name__ == '__main__':
    # Script entry point: build the agent with a 4096-token limit and run
    # the generation over the nuScenes images named in the file list.
    generate_evaluation_dataset(
        agent=VLMAgent(max_tokens=4096),
        root='./data/nuscenes',
        filelist='./nusc/nusc_filelist.txt',
    )
