import pathlib
import google.generativeai as genai
import json
import tiktoken
import textwrap
import openai
import os
from PIL import Image
import base64
import time
from dotenv import load_dotenv


# Pull variables from a local .env file (if present) into the process
# environment, then hand the OpenAI key to the SDK. Fail fast when the key
# is missing or empty so that no request is attempted without credentials.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
if openai.api_key is None or openai.api_key == "":
    raise ValueError("No OpenAI API key found. Please set the OPENAI_API_KEY.")

def get_descriptions_from_jsonl(file_path, include_trajectory=False):
    """Collect failure reasons from a JSONL file into one string.

    Every non-blank line is parsed as JSON. Only lines that decode to a
    dict containing a ``failure_reason`` key contribute an entry; all other
    records are ignored. Blank and whitespace-only lines are skipped rather
    than handed to the JSON parser.

    Args:
        file_path: Path of the JSONL file to read (decoded as UTF-8).
        include_trajectory: When True, a record that also has a
            ``trajectory`` key is rendered as
            ``"Trajectory description: <trajectory> Failure reason: <reason>"``.
            Records without a ``trajectory`` key fall back to the bare
            failure reason.

    Returns:
        The selected entries joined with newline characters, in file order.
        An empty string when no record matches.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    descriptions = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. trailing or separator newlines).
                continue
            data = json.loads(line)
            if not isinstance(data, dict) or 'failure_reason' not in data:
                continue
            if include_trajectory and 'trajectory' in data:
                descriptions.append(f"Trajectory description: {data['trajectory']} Failure reason: {data['failure_reason']}")
            else:
                descriptions.append(data['failure_reason'])
    return "\n".join(descriptions)


# Input data set. Swap which of the two assignments is commented out to
# switch between the driving and the waypoint-navigation inputs.
# jsonl_file_path = "../results/clustering/driving/failure_description_resoning_gemini25pro_combined.jsonl"
jsonl_file_path = "../results/clustering/waypointnav/failure_description_resoning_gemini25pro_combined.jsonl"

# Model to query and the number of prompt variants to send to it.
model_name = "o4-mini"
n_prompts = 4

# Results are stored next to the input file, in a sub-directory named after
# the model; the directory is created on first use.
jsonl_file_dir = os.path.split(jsonl_file_path)[0]
output_path = os.path.join(jsonl_file_dir, f"clusters_prompt_ensemble_{model_name}")
os.makedirs(output_path, exist_ok=True)

# Newline-joined failure reasons from the input file. Pass
# include_trajectory=True to prepend each record's trajectory description.
descriptions = get_descriptions_from_jsonl(
    jsonl_file_path,
    include_trajectory=False,
)


# Prompt ensemble: maps a category name to a list of differently worded
# clustering prompts. Every prompt ends with the newline-joined failure
# descriptions, so this literal must be evaluated after `descriptions` has
# been loaded. The strings are sent to the model verbatim.
prompts = {
    # Prompts for failure reasons of an indoor robot that collides.
    'waypoint': [
        # Variant 0: free-form instruction, no output format imposed.
        # NOTE(review): "occurance" is misspelled in the prompt text; it is
        # left untouched here because editing it changes what the model sees.
        "These are semantic failure reasons of a robot navigating indoors based on images that fails due to collision. Generate cluster centers based on the types of visual semantic failures present so that these reasons can be assigned to those clusters. Return the cluster names and the list of characteristics, keywords which belong to each cluster. Make sure to include long tail/rare clusters. Report the occurance frequency of each cluster.\n\n" + descriptions,
        # Variant 1: numbered task list, JSON array output.
        # NOTE(review): step 4 asks for example descriptions, but the JSON
        # field list in step 5 does not include them — confirm the intent.
        "You are an expert in robotic vision failure analysis. Below is a list of semantic failure reasons for an indoor robot navigation system that leads to collisions. "
        "Your tasks:\n"
        "  1. Identify distinct cluster centers representing each type of visual semantic failure.\n"
        "  2. Assign each failure reason to the appropriate cluster.\n"
        "  3. Include long‑tail/rare clusters as separate entries.\n"
        "  4. For each cluster, report:\n"
        "     • cluster_name\n"
        "     • defining keywords or traits\n"
        "     • occurrence_frequency\n"
        "     • example descriptions (up to 3)\n"
        "  5. Present the output as a JSON array of objects with fields "
        "'cluster_name', 'keywords', and 'frequency'.\n\n"
        + descriptions,
        # Variant 2: "taxonomy engineer" persona, JSON output keyed by
        # 'name', 'terms' and 'count'.
        # NOTE(review): "failure modes" is requested per cluster but is not
        # among the JSON keys named in the last bullet.
        "Act as a taxonomy engineer analyzing semantic failure reasons of an indoor vision‑based robot that collides. Given the following descriptions, "
        "perform these steps:\n"
        "  • Group reasons into clusters based on shared semantic features.\n"
        "  • Capture both common patterns and rare/long‑tail failure types.\n"
        "  • For each cluster, provide:\n"
        "      – name (a concise label)\n"
        "      – terms (list of characteristic keywords)\n"
        "      – count (number of examples in that cluster)\n"
        "      – failure modes\n"
        "  • Output the final result as valid JSON: an array of objects with keys 'name', 'terms', and 'count'.\n\n"
        + descriptions,
        # Variant 3: "domain expert" persona, JSON array output.
        # NOTE(review): "failure modes" is requested in step 2 but is not
        # among the JSON fields named in step 4.
        "You are a domain expert in robotic vision failure analysis. Given a list of semantic failure reasons for an indoor navigation robot that lead to collisions, perform the following steps:\n"
        "1. Identify and define distinct clusters of semantic failure types, including both common and long‑tail/rare cases.\n"
        "2. For each cluster, provide:\n"
        "   • cluster_name: a concise, descriptive label\n"
        "   • keywords: list of characteristic terms or phrases\n"
        "   • frequency: count or percentage of occurrences in the input\n"
        "   • failure modes: list of specific failure modes or examples\n"
        "3. Assign each failure description to its appropriate cluster.\n"
        "4. Output the result as a JSON array of objects with fields 'cluster_name', 'keywords', and 'frequency'.\n\n"
        + descriptions
    ],
    # Prompts for failure reasons of car trajectories.
    'driving': [
        # Variant 0: free-form instruction, no output format imposed.
        # NOTE(review): "occurance" is misspelled in the prompt text; it is
        # left untouched here because editing it changes what the model sees.
        "These are semantic failure reasons for different trajectories of a car. Your job is to analyze all of them and come up with clusters of different semantic failure reasons. Generate cluster centers based on the types of visual semantic failures present so that these reasons can be assigned to those clusters. Return the cluster names and the list of characteristics, keywords which belong to each cluster. Make sure to include long tail/rare clusters. Report the occurance frequency of each cluster.\n\n" + descriptions,
        # Variant 1: "domain expert" persona, JSON array output.
        # NOTE(review): "failure modes" is requested in step 2 but is not
        # among the JSON keys named in step 4.
        "You are a domain expert in automotive collision analysis. Given a list of semantic failure reasons for car trajectories that resulted in crashes, perform the following steps:\n"
        "1. Identify and define distinct clusters of semantic failure types, covering both common incidents and long‑tail/rare scenarios.\n"
        "2. For each cluster, provide:\n"
        "   • cluster_name: a concise, descriptive label\n"
        "   • keywords: a list of characteristic terms or phrases\n"
        "   • frequency: the count or percentage of occurrences in the input\n"
        "   • failure modes: a list of specific failure modes or examples\n"
        "3. Assign each failure reason to its corresponding cluster.\n"
        "4. Output the final result as a JSON array of objects with keys 'cluster_name', 'keywords', and 'frequency'.\n\n"
        + descriptions,
        # Variant 2: single JSON object holding both the clusters and a
        # reason-to-cluster assignment mapping.
        # NOTE(review): "failure modes" is requested in step 2 but is not
        # among the cluster object keys named in step 4.
        "You are an expert in automotive semantic failure classification. Given a list of trajectory failure reasons that resulted in car crashes, perform the following:\n"
        "1. Identify distinct clusters of semantic failure types, including both frequent and long‑tail/rare cases.\n"
        "2. For each cluster, define:\n"
        "   • cluster_name: concise label\n"
        "   • keywords: list of representative terms\n"
        "   • count: number of occurrences\n"
        "   • failure modes: specific examples\n"
        "3. Assign each failure reason to one of the clusters.\n"
        "4. Output a single JSON object with two keys:\n"
        "   • clusters: an array of cluster objects ({'cluster_name','keywords','count'})\n"
        "   • assignments: a mapping from each input reason to its cluster_name\n\n"
        + descriptions,
        # Variant 3: the only prompt that asks for a Markdown table instead
        # of JSON or free text.
        "You are an AI‑driven taxonomy engineer for car collision analysis. Given semantic failure descriptions of trajectories that ended in crashes:\n"
        "- Group descriptions into semantically coherent clusters (include rare edge‑cases).\n"
        "- For each cluster, provide:\n"
        "  • Name (short label)\n"
        "  • Key characteristics (list of keywords)\n"
        "  • Example descriptions (up to 3 representative samples)\n"
        "  • Frequency (%) of total\n\n"
        "Present your results as a Markdown table with columns: Cluster Name | Keywords | Examples | Frequency\n\n"
        + descriptions
    ]
}


# Select prompt category based on the file path.
# NOTE(review): the bare substring "way" would also match a driving path that
# happens to contain e.g. "highway" — confirm input paths never do.
category = 'waypoint' if "way" in jsonl_file_path else 'driving'

# Send each prompt variant to the model and store every answer in its own
# text file. Slicing (rather than indexing with range(n_prompts)) keeps the
# loop valid when n_prompts is larger than the number of variants defined
# for the selected category.
for i, prompt in enumerate(prompts[category][:n_prompts]):
    output_filepath = os.path.join(output_path, f"gpt{model_name}_clusters_longtail_freq_{i}.txt")

    # One user message carrying a single text part.
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
    # interface — confirm the pinned openai package version still provides it.
    response = openai.ChatCompletion.create(
        model=model_name,
        # reasoning={"effort": "high"},
        messages=messages,
        # max_tokens is deliberately not set: DONT SPECIFY FOR O-SERIES MODELS
    )
    output = response['choices'][0]['message']['content']

    # Save the output to a text file. UTF-8 is requested explicitly because
    # the answer may contain characters that the platform default encoding
    # cannot represent (the prompts themselves contain U+2011 hyphens).
    with open(output_filepath, 'w', encoding='utf-8') as f:
        f.write(output)

    # Print to console for immediate feedback
    print(f"Output saved to {output_filepath}")
    print(output)