import pathlib
import google.generativeai as genai
import tiktoken
import textwrap
import openai
import os
from PIL import Image
import base64
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List

# Load environment variables via python-dotenv, then hand the key to the
# OpenAI module so the request further down can authenticate.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
if openai.api_key is None or openai.api_key == "":
    # Abort before any file or network work when no credential is configured.
    raise ValueError("No OpenAI API key found. Please set the OPENAI_API_KEY.")

# Aggregated cluster text to convert. Swap which assignment is commented out
# to process the other dataset.
# input_txt_path = "../results/clustering/driving/clusters_prompt_ensemble_o4-mini/aggregated_clusters_text.txt"
input_txt_path = "../results/clustering/waypointnav/clusters_prompt_ensemble_o4-mini/aggregated_clusters_text.txt"

# Replace only the final extension. A plain str.replace(".txt", ".jsonl")
# would rewrite every ".txt" occurring anywhere in the path and, when the
# input has no ".txt" at all, would leave the path unchanged so the output
# would overwrite the input file.
output_filepath = os.path.splitext(input_txt_path)[0] + ".jsonl"

# Decode explicitly instead of relying on the platform's locale encoding.
# NOTE(review): assumes the aggregated text was saved as UTF-8 — confirm if
# it is produced by a tool that writes a different encoding.
with open(input_txt_path, "r", encoding="utf-8") as f:
    file_content = f.read()

# Instruction prompt sent as the single user message of the request.
# `{{` / `}}` are f-string escapes that render as literal braces around the
# example object; `{file_content}` is the only interpolated value.
# NOTE(review): the text asks for JSONL output, while the request built later
# in this file forces a `parse_clusters` function call whose schema is one
# object holding a "clusters" list — confirm the wording is intentional.
prompt = f"""
Convert the following text into a proper JSON format following this exact structure for each cluster:
{{
  "cluster_name": "Name of the cluster",
  "occurrence": "Percentage or frequency information if available",
  "keywords": [
    "keyword1",
    "keyword2",
    "additional keywords..."
  ],
  "notes": "Any additional information or descriptions about the cluster"
}}

Make each cluster a separate JSON object on a new line (JSONL format).
Parse the text to identify cluster names, keywords, occurrence information, and any descriptive notes.

Here's the text to convert:

{file_content}
"""

# 1) Define Pydantic models
class Cluster(BaseModel):
    # One cluster record extracted from the aggregated text. All four fields
    # are required; the descriptions are emitted into the JSON schema that is
    # sent to the model as the function's parameters.
    # A '#' comment is used instead of a docstring on purpose: pydantic copies
    # a class docstring into the generated schema, which would change the
    # request payload.
    cluster_name: str = Field(description="Name of the cluster")
    occurrence: str = Field(
        description="Percentage or frequency information if available",
    )
    keywords: List[str] = Field(
        description="A list of keywords associated with the cluster",
    )
    notes: str = Field(
        description="Any additional information or descriptions about the cluster",
    )

class ClustersResponse(BaseModel):
    # Top-level payload: a single object wrapping the list of clusters. Its
    # JSON schema is what `fn_def` passes as the function's "parameters", and
    # the function-call arguments are validated against it.
    # NOTE(review): kept as a comment rather than a docstring — pydantic v2 is
    # documented to copy a class docstring into the schema's "description",
    # which would alter the request payload.
    clusters: List[Cluster]

# 2) Describe the callable the model must "invoke": its parameter schema is
#    generated from the ClustersResponse pydantic model (V2 API).
fn_def = dict(
    name="parse_clusters",
    description="Extract all clusters as a JSON array",
    parameters=ClustersResponse.model_json_schema(),
)

# 3) Send the request. `function_call` names parse_clusters explicitly, asking
#    the model to reply with that call's arguments rather than free text.
request_kwargs = {
    "model": "o4-mini",
    "messages": [{"role": "user", "content": prompt}],
    "functions": [fn_def],
    "function_call": {"name": "parse_clusters"},
}
response = openai.ChatCompletion.create(**request_kwargs)

# 4) Pull the function-call arguments out of the first choice and validate
#    them against the pydantic schema (V2 API).
message = response.choices[0].message
function_call = message["function_call"]
# The arguments arrive as JSON text, so they are validated straight from the
# string rather than being decoded first.
args_json = function_call["arguments"]
clusters_resp = ClustersResponse.model_validate_json(args_json)
clusters = clusters_resp.clusters  # List[Cluster]

# 5) Write out JSONL: one serialized cluster object per line.
# The encoding is pinned to UTF-8 because model_dump_json() emits non-ASCII
# characters unescaped; with the platform's locale codec the write could raise
# UnicodeEncodeError or produce a file that JSONL readers cannot decode.
with open(output_filepath, "w", encoding="utf-8") as f:
    for cluster in clusters:
        # exclude_none drops keys whose value is None; every Cluster field is
        # currently required, so it only matters if a field becomes optional.
        f.write(cluster.model_dump_json(exclude_none=True) + "\n")  # V2 replacement

print(f"Output saved to {output_filepath}")