import json
import os
import re

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# === API Clients ===
# Credentials are read from the environment (OPENAI_API_KEY / OPENROUTER_API_KEY) so that
# real keys never have to be committed to source control. The redacted placeholders are
# kept as fallbacks so the module still imports when the variables are unset; requests
# made with a placeholder key will be rejected by the remote API.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or "sk-...REDACTED...")

# OpenRouter exposes an OpenAI-compatible endpoint, so the same SDK class is reused with a
# different base URL.
openRouterClient = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY") or "sk-...REDACTED...",
)

# === Prompt and Config Templates ===
# System message sent with every request by openRouterChat. It frames the model as a
# patent-comprehension expert and describes three Bloom's-taxonomy levels (Remembering,
# Understanding, Applying); only the first two have user-prompt templates in this file.
# NOTE(review): the final sentence reads "ensure all questions to test knowledge", which
# looks like a grammar slip. It is left untouched here because the text is sent to the
# model verbatim and rewording it could change the generated output.
system_prompt = """
You are an expert in patent comprehension. Your task is to generate structured questions that assess a reader’s background knowledge necessary to understand a given patent abstract. The questions should focus on foundational concepts, principles, and applications relevant to the patent’s domain without explicitly referencing the patent itself. Ensure that the questions are framed generally and test for domain knowledge rather than details specific to the patent.

The questions should follow Bloom's Taxonomy and be categorized into three levels:

1. Remembering: Questions that assess the reader’s ability to recall key technical concepts, definitions, and fundamental principles relevant to the patent’s domain.
2. Understanding: Questions that assess the reader’s ability to explain how different elements of similar technologies function and interact.
3. Applying: Questions that evaluate the reader’s ability to apply their knowledge by solving problems, making predictions, or considering real-world applications.

Do not directly reference the patent abstract in any question. Instead, ensure all questions to test knowledge that would be useful for understanding the patent without explicitly addressing its content.
"""

# The two user prompts share the same patent header and the same JSON output-format
# footer and differ only in the instruction sentence, so they are assembled from common
# pieces. Placeholders filled later via str.format: {patent_title}, {patent_summary} and
# {qnum}; the doubled braces in the footer render as literal JSON braces.
_PROMPT_PREAMBLE = """
You are given the following patent:

Patent Title: {patent_title}

Patent Abstract: {patent_summary}

"""

_PROMPT_OUTPUT_FORMAT = """

### **Output Format (JSON):**
{{
   "1": "question1",
   "2": "question2",
   "3": "question3"
}}
"""

# Recall-oriented questions (the "Remembering" level of the system prompt).
promptQ1 = (
    _PROMPT_PREAMBLE
    + "Generate a set of {qnum} questions that test a reader’s ability to recall fundamental concepts, key terms, and basic components necessary to understand the given patent abstract."
    + _PROMPT_OUTPUT_FORMAT
)

# Explanation-oriented questions (the "Understanding" level of the system prompt).
promptQ2 = (
    _PROMPT_PREAMBLE
    + "Generate a set of {qnum} questions that assess a reader’s comprehension of the given patent abstract by requiring them to explain how different elements of the invention work together."
    + _PROMPT_OUTPUT_FORMAT
)

# === Load Sample Data ===
# Read at import time; the placeholder path must be replaced before the script can run.
# The rows are later consumed by generate_and_save_questions_v2, which looks up the
# columns named in `patent_config` — presumably one patent per row; confirm against the CSV.
abstract_df_sample = pd.read_csv("/path/to/...") #path to the actual patent pair data set

# Maps the logical field names used by this script to the CSV column holding each field.
# The mapping is currently the identity; edit the values if the dataset names its columns
# differently.
patent_config = {
    field: field
    for field in ("patent_id", "patent_abstract", "patent_title")
}

def _extract_first_json_object(text):
    """Return the first JSON object embedded in *text*, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value starting at `start` and ignores
            # anything after it, so nested objects and braces inside string values are
            # handled correctly (a non-greedy regex would cut them short).
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = text.find("{", start + 1)
    return None


def openRouterChat(prompt, client, model="meta-llama/llama-3.1-8b-instruct", system_message=None):
    """Send *prompt* to a chat model and return the JSON object found in its reply.

    Args:
        prompt: User message content.
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Model identifier passed to the API; replace to use another model.
        system_message: System message content. Defaults to the module-level
            ``system_prompt`` when omitted.

    Returns:
        The first JSON object in the reply as a dict, or ``None`` when the request
        fails, the reply has no text, or no JSON object can be parsed. Failures are
        printed rather than raised so that callers can simply retry.
    """
    if system_message is None:
        system_message = system_prompt

    try:
        completion = client.chat.completions.create(
            extra_headers={"X-Title": "llm_judgement_votes"},
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
            max_completion_tokens=5000,
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # Deliberately broad: this is the network/SDK boundary and the caller retries.
        print(f"[ERROR] OpenRouter API call failed: {e}")
        return None

    if content is None:
        print("[ERROR] Model returned no text content.")
        return None

    response_text = content.strip()
    parsed = _extract_first_json_object(response_text)
    if parsed is None:
        print(f"[ERROR] No valid JSON found in response:\n{response_text}")
    return parsed

def generate_and_save_questions_v2(
    cs_sampled,
    prompt_template,
    client,
    output_csv="GeneratedQuestions.csv",
    num_questions=3,
    max_attempts=10,
):
    """Generate questions for every patent row and write them to a CSV file.

    Each input row is repeated once per generated question, with two extra columns:
    ``QuestionId`` (the key returned by the model) and ``Question`` (its text).
    Patents for which no valid answer is obtained are reported and left out.

    Args:
        cs_sampled: DataFrame containing the columns named in ``patent_config``.
        prompt_template: ``str.format`` template using ``{patent_title}``,
            ``{patent_summary}`` and ``{qnum}``.
        client: Client forwarded to ``openRouterChat``.
        output_csv: Destination path of the CSV file.
        num_questions: Number of questions requested from, and required of, the model.
        max_attempts: Maximum number of requests per patent before giving up.

    Returns:
        ``output_csv``, unchanged.

    Raises:
        KeyError: If a configured column is missing or the template references an
            unknown placeholder.
        ValueError: If the template is malformed. Formatting happens once per patent,
            outside the retry loop, so a broken template fails immediately instead of
            being retried.
    """
    id_column = patent_config["patent_id"]
    abstract_column = patent_config["patent_abstract"]
    title_column = patent_config["patent_title"]

    # to_dict(orient="records") keeps the original column names; itertuples() renames
    # columns that are not valid identifiers or start with an underscore, which would
    # break the lookups below and alter the output header.
    records = cs_sampled.to_dict(orient="records")

    new_rows = []
    # Iterating through tqdm advances the bar exactly once per patent, regardless of how
    # many attempts that patent needed.
    for patent_data in tqdm(records, desc="Generating Research Questions"):
        prompt = prompt_template.format(
            patent_summary=patent_data[abstract_column],
            patent_title=patent_data[title_column],
            qnum=num_questions,
        )

        questions = None
        for _ in range(max_attempts):
            candidate = openRouterChat(prompt, client)
            # openRouterChat returns None on any failure, which is retried as well.
            if isinstance(candidate, dict) and len(candidate) == num_questions:
                questions = candidate
                break

        if questions is None:
            print(
                f"Failed to process patent {patent_data[id_column]} after {max_attempts} attempts. "
                f"Error: Expected exactly {num_questions} questions in dictionary format."
            )
            continue

        for question_id, question in questions.items():
            new_rows.append({**patent_data, "QuestionId": question_id, "Question": question})

    df = pd.DataFrame(new_rows)
    df.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"Updated dataset with questions saved successfully to {output_csv}")
    return output_csv

# === Main Run ===
# Runs at import time: for each prompt template, generate questions for the whole sample
# and write them to "<FOLDER>Generated_Questions_<template name>.csv".
FOLDER = "/path/to/save/results/"
prompts = dict(promptQ1=promptQ1, promptQ2=promptQ2)
for key, prompt in prompts.items():
    generate_and_save_questions_v2(
        abstract_df_sample,
        prompt,
        openRouterClient,
        FOLDER + "Generated_Questions_" + key + ".csv",
    )