import os
import json
import pandas as pd
from openai import OpenAI
from prompts import CHALLENGE_PROMPTS

# Chat-completions model requested for every generation call.
MODEL_NAME = "gpt-4o"

# Sampling temperature passed along with each request.
TEMPERATURE = 1

# Shared OpenAI client; the key and organization come from the environment.
model = OpenAI(
    organization=os.getenv("OPENAI_ORGANIZATION"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Length buckets for generated conversations. Each value is an instruction
# sentence that is embedded verbatim into the generation prompt.
# The ranges are disjoint (<=5, 6-10, 11-20) so that a conversation of a given
# size belongs to exactly one bucket.
CONV_LEN_CATEGORIES = {
    "short": "Generate examples where the number of utterances (by USER or AGENT) is up to 5",
    "medium": "Generate examples where the number of utterances (by USER or AGENT) is minimum 6, up to 10",
    "long": "Generate examples where the number of utterances (by USER or AGENT) is minimum 11, up to 20",
}

# Conversation subjects; the driver loop indexes into this list round-robin.
TOPICS = ["cooking", "programming", "flights", "restaurants", "health"]

def generate_prompt(conv_len: str, topic: str) -> str:
    '''
    Build the prompt that asks the LLM for conversations on a given topic.

    conv_len: key of CONV_LEN_CATEGORIES; selects the length instruction
        embedded in the prompt. An unknown key raises KeyError.
    topic: subject the conversation should be about.

    Returns the complete prompt text, which lists every entry of
    CHALLENGE_PROMPTS and requests a single JSON object keyed by challenge name.
    '''
    length_guidance = CONV_LEN_CATEGORIES[conv_len]

    # One "- name: description" bullet per challenge type.
    bullets = []
    for name, description in CHALLENGE_PROMPTS.items():
        bullets.append(f"- {name}: {description}")
    challenge_block = "\n".join(bullets)

    # Sections are concatenated as-is; each one carries its own trailing newlines.
    sections = [
        f"Generate a conversation between a USER and an AGENT on the topic: {topic}.\n\n",
        "The USER begins with a task-oriented query. The AGENT only asks clarifying or follow-up questions to understand the USER's intent and constraints. It must not solve the task.\n\n",
        f"The conversation should be {conv_len} turns ({length_guidance}), stay on-topic, and be coherent.\n\n",
        "Each conversation must end with a USER statement (not a question), and no utterance should include unrelated or off-topic remarks.\n\n",
        "The challenge types are:\n",
        f"{challenge_block}\n\n",
        "Output a single JSON object with challenge names as keys and conversations as values.\n",
        "Each conversation is a list of strings starting with 'USER:' or 'AGENT:'.\n",
    ]
    return "".join(sections)

def query_llm(prompt: str):
    '''
    Send the prompt to the chat-completions endpoint and return the reply text.

    The request uses MODEL_NAME and TEMPERATURE, asks for a JSON-object
    response format, and prepends a system message demanding valid JSON.

    Returns the message content of the first choice in the response.
    '''
    system_message = {
        "role": "system",
        "content": "Output strictly in valid JSON format. Ensure no errors whatsoever.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = model.chat.completions.create(
        model=MODEL_NAME,
        messages=[system_message, user_message],
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def generate_data_for_conv_len(conv_len: str, topic: str):
    '''
    Generate synthetic conversations for one (length category, topic) pair.

    Builds the prompt, queries the LLM, and parses the reply as JSON. The
    parsed object is iterated as a mapping of challenge name -> conversation.

    Returns a list of dicts with keys "conv_len", "topic", "challenge" and
    "conversation", one per entry in the parsed reply.
    '''
    raw_reply = query_llm(generate_prompt(conv_len, topic))
    conversations_by_challenge = json.loads(raw_reply)
    return [
        {
            "conv_len": conv_len,
            "topic": topic,
            "challenge": challenge_name,
            "conversation": conversation,
        }
        for challenge_name, conversation in conversations_by_challenge.items()
    ]

if __name__ == "__main__":
    # Number of generation requests issued for each length category.
    num_examples_per_len = 10
    all_records = []

    for conv_len in CONV_LEN_CATEGORIES:
        for i in range(num_examples_per_len):
            # Walk through TOPICS round-robin.
            topic = TOPICS[i % len(TOPICS)]
            print(f"Generating [{conv_len}] example {i+1}/{num_examples_per_len} | topic: {topic}")

            for record in generate_data_for_conv_len(conv_len, topic):
                # Ids are sequential across the whole run: the next id always
                # equals the number of records collected so far.
                record["conv_id"] = len(all_records)
                all_records.append(record)

    # Persist everything as a JSON array with one object per conversation.
    df = pd.DataFrame(all_records)
    df.to_json("synthetic_conversations.json", orient="records", indent=2)
    print(f"\nSaved {len(df)} conversations to synthetic_conversations.json")