import openai
import os
import PyPDF2
import json
import re
import requests
from io import BytesIO
import time

# Shared OpenAI client used by the API helpers below; the API key is read from
# the OPENAI_API_KEY environment variable (os.getenv returns None if it is unset).
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def load_json(file_path):
    """Read ``file_path`` and return the parsed JSON value.

    The file is decoded as UTF-8 (the encoding JSON text is expected to use)
    instead of whatever the platform's locale default happens to be.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(data, file_path):
    """Serialize ``data`` as 4-space-indented JSON and write it to ``file_path``.

    Missing parent directories are created first, so callers can write into
    an output folder that does not exist yet.

    Raises:
        OSError: if the directory or file cannot be created.
        TypeError: if ``data`` contains values that are not JSON-serializable.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

def create_file(client, file_path):
    """Upload a document to the OpenAI Files API and return the new file id.

    Args:
        client: Object exposing ``files.create(file=..., purpose=...)``.
        file_path: Either an ``http(s)://`` URL, whose content is downloaded
            first, or a path to a local file.

    Returns:
        The ``id`` attribute of the object returned by ``client.files.create``
        (also printed to stdout).

    Raises:
        requests.HTTPError: if the URL answers with an error status.
        requests.RequestException: on network failure or timeout.
        OSError: if the local file cannot be opened.
    """
    if file_path.startswith(("http://", "https://")):
        # Local import keeps this block self-contained without touching the
        # file-level import list.
        from urllib.parse import urlparse

        # A timeout prevents an unresponsive server from hanging the script.
        response = requests.get(file_path, timeout=60)
        # Fail loudly instead of uploading an HTML error page as the document.
        response.raise_for_status()
        # Use only the URL *path* so query strings/fragments do not end up in
        # the uploaded file name; fall back to a fixed name when the path has
        # no final component.
        file_name = os.path.basename(urlparse(file_path).path) or "downloaded_file"
        result = client.files.create(
            file=(file_name, BytesIO(response.content)),
            purpose="assistants",
        )
    else:
        # Local file: stream it straight from disk.
        with open(file_path, "rb") as file_content:
            result = client.files.create(
                file=file_content,
                purpose="assistants",
            )
    print(result.id)
    return result.id

def store_a_file(file_path):
    """Create a vector store containing one file and wait until it is indexed.

    A new vector store named ``"knowledge_base"`` is created, the document at
    ``file_path`` (local path or URL) is uploaded via ``create_file`` and
    attached to it, and the store's file listing is polled once per second
    until the file reports ``"completed"``.

    Returns:
        The id of the newly created vector store.

    Raises:
        RuntimeError: if the file's ingestion ends as ``"failed"`` or
            ``"cancelled"`` (previously this case polled forever).
    """
    vector_store = client.vector_stores.create(name="knowledge_base")
    file_id = create_file(client, file_path)
    client.vector_stores.files.create(
        vector_store_id=vector_store.id,
        file_id=file_id,
    )

    while True:
        listing = client.vector_stores.files.list(vector_store_id=vector_store.id)
        # The store was just created with a single file, so the first entry is
        # ours; an empty listing is treated as "not ready yet" rather than
        # raising IndexError.
        status = listing.data[0].status if listing.data else "in_progress"
        if status == "completed":
            return vector_store.id
        if status in ("failed", "cancelled"):
            # Terminal states never turn into "completed"; stop polling.
            raise RuntimeError(
                f"Vector store ingestion of {file_path!r} ended with status {status!r}"
            )
        time.sleep(1)

def process_json(json_string):
    """Best-effort conversion of a model reply into a parsed JSON value.

    Steps:
      1. If the reply contains a ```json fenced block, keep only the text
         inside the first such block. Otherwise, if the reply starts with a
         bare ``` fence, keep the text inside that fence.
      2. If what remains is wrapped in double quotes, drop them.
      3. Parse with ``json.loads``.

    Returns:
        The parsed value on success. If parsing fails, the error is printed
        and the (fence/quote-stripped) string is returned unchanged, so
        callers must be prepared to receive a ``str``.
    """
    fence = "```"
    if "```json" in json_string:
        json_string = json_string.split("```json")[1].split(fence)[0]
    elif json_string.strip().startswith(fence):
        # Replies are sometimes fenced without the "json" language tag.
        json_string = json_string.strip()[len(fence):].split(fence)[0]
    if json_string.startswith('"') and json_string.endswith('"'):
        json_string = json_string[1:-1]
    try:
        return json.loads(json_string)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(e)
        return json_string

def identify_opinions(task_description, vector_id):
    """Ask the model to extract stakeholders and their opinions from one paper.

    The prompt embeds ``task_description`` and is sent as a single system
    message to ``gpt-4o`` through the Responses API, with the ``file_search``
    tool pointed at the vector store ``vector_id``.

    Returns:
        The raw ``output_text`` of the response. The prompt asks for a fenced
        JSON dict keyed by stakeholder name, an empty list, or nothing at all,
        so the caller still has to parse it.
    """
    instructions = f"""
You need to identify or construct a diverse and comprehensive set of stakeholders, their characteristics, and their perspectives or opinions for the following evaluation task:
***{task_description}***

**Guidelines**
- For this given paper, read one paragraph at a time. Ignore the related work section and references list.
Step 1 - Identify *ALL* mentioned name entities, excluding the authors and their institutions, as well as non-human entities.
Step 2 - For each name entity (i.e., stakeholder) you identified, generate the descriptive characteristics for this stakeholder. Then extract their perspectives or opinions that are **relevant to the aforementioned evaluation task**. Each entry should be directly derived from the texts with supporting evidence. 

** Important Reminders**
- If in the provided paper, no relevant information is mentioned about the evaluation task, output nothing.
- In generation, prioritize capturing a wide range of stakeholders and their perspectives, including those that might emerge from different roles, backgrounds, and needs.
- The stakeholder's perspectives or opinions should be relevant to the aforementioned evaluation task.
- The final generated stakeholder entry should each clearly include:
     1. The stakeholder name (e.g., role or representative group),
     2. The stakeholder’s characteristics,
     3. The stakeholder’s perspectives or opinions regarding the aforementioned evaluation task,
     4. The supporting evidence from the provided papers.

**Output Format**
- If the provided paper contains relevant information about the evaluation task, present the output as a structured JSON dict, with each item formatted as an object containing the following fields:
```json
{{
    "stakeholder name": {{
        "characteristics": "use one sentence to describe the stakeholder’s characteristics",
        "perspectives": [
        {{
            "perspective": "use a sentence to describe the stakeholder’s perspectives or opinions regarding the aforementioned evaluation task",
            "evidence": "supporting evidence from the provided paper"
        }},
        {{
            "perspective": "use a sentence to describe the stakeholder’s perspectives or opinions regarding the aforementioned evaluation task",
            "evidence": "supporting evidence from the provided paper"
        }},
            ...
        ]
    }},
  "stakeholder name": {{
        "characteristics": "use one sentence to describe the stakeholder’s characteristics",
        "perspectives": [
        {{
            "perspective": "use a sentence to describe the stakeholder’s perspectives or opinions regarding the aforementioned evaluation task",
            "evidence": "supporting evidence from the provided paper"
        }},
        {{
            "perspective": "use a sentence to describe the stakeholder’s perspectives or opinions regarding the aforementioned evaluation task",
            "evidence": "supporting evidence from the provided paper"
        }},
            ...
        ]
    }},
  ...
}}
```

- Otherwise, output an empty list:
```json
[]
```
"""
    # Assemble the request pieces separately so the call itself stays short.
    system_message = {"role": "system", "content": instructions}
    file_search_tool = {"type": "file_search", "vector_store_ids": [vector_id]}
    plain_text_output = {"format": {"type": "text"}}

    reply = client.responses.create(
        model="gpt-4o",
        input=[system_message],
        tools=[file_search_tool],
        text=plain_text_output,
        reasoning={},
        temperature=1,
        top_p=1,
        max_output_tokens=2048,
    )
    return reply.output_text

def merge_same_opinions(task_description, all_identified_opinions):
    """Ask the model to consolidate duplicate or similar stakeholder entries.

    ``all_identified_opinions`` is interpolated verbatim into the prompt
    together with ``task_description``; the prompt is sent as one system
    message to ``gpt-4o`` via Chat Completions with JSON-object output
    requested.

    Returns:
        The message content of the first choice (a JSON string, not yet
        parsed).
    """
    merge_instructions = f"""
You need to merge the same stakeholder perspectives for the following evaluation task:
***{task_description}***

**Stakeholder Opinion List**
{all_identified_opinions}

**Guidelines**
- Read through the provided stakeholder perspective list. For each stakeholder, read both their listed characteristics and all associated perspectives or opinions.
- If a stakeholder appears more than once in the list, treat them as the same individual or group and merge all entries accordingly.
- If multiple stakeholders belong to similar occupation, background, or role, add all their perspectives into one perspective list. Do not remove any perspective.
- When merging multiple entries for the same stakeholder, combine all their listed characteristics into a single, unified description.
- If the stakeholder has identical perspectives across different entries, merge them into one unified perspective.
- As long as there exists notable differences in the stakeholder's perspectives, keep them as separate perspectives.

**Output Format**
Use a dict to include all stakeholders and their perspectives.
Present the output as ONE structured JSON dict, with each stakeholder item formatted as an object containing the following fields:
```json
{{
    "stakeholder name": {{
        "characteristics": "Description of the stakeholder’s characteristics",
        "perspectives": [
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            ...
        ]
    }},
    "stakeholder name": {{
        "characteristics": "Description of the stakeholder’s characteristics",
        "perspectives": [
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            ...
        ]
    }},
    ...
}}  
```
"""
    system_message = {"role": "system", "content": merge_instructions}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message],
        response_format={"type": "json_object"},
        temperature=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def create_persona_from_opinions(task_description, opinions):
    """Ask the model to generate one persona per stakeholder perspective.

    Args:
        task_description: Evaluation task text embedded in the prompt.
        opinions: Stakeholder information, either already a string or a
            JSON-compatible object (e.g. the parsed merged-opinions dict).
            Non-string input is serialized with ``json.dumps`` so the prompt
            contains real JSON rather than a Python ``repr`` (single quotes,
            ``True``/``None``).

    Returns:
        The message content of the first choice returned by ``gpt-4o`` via
        Chat Completions with JSON-object output requested (a JSON string,
        not yet parsed).

    Raises:
        TypeError: if ``opinions`` is neither a string nor JSON-serializable.
    """
    if not isinstance(opinions, str):
        # ensure_ascii=False keeps non-ASCII text readable in the prompt.
        opinions = json.dumps(opinions, indent=4, ensure_ascii=False)

    prompt = f"""
You need to create stakeholder personas for the following evaluation task:
***{task_description}***

**Guidelines**
- For the provided stakeholder information list, process one stakeholder at a time.
- For each mentioned perspective of the stakeholder, generate a distinct persona that embodies the corresponding perspective.
- For each generated persona, you must include the following attributes following these steps:
1. Based on the name and characteristic of this stakeholder type, as well as the corresponding perspective and evidence, generate the persona’s demographic information. The generated demographic information should include name, age, education, career, and personality. The generated demographic information should be diverse and realistic.
2. Based on the exact perspective, rephrase the perspective to be suitable for this persona.
3. Based on the stakeholder type and the generated demographic information, generate a specialty for this persona. The specialty could be this persona’s skill, expertise, or proficiency in real-life related to the aforementioned task.
4. Based on the stakeholder type and the generated demographic information, generate the persona’s psychological traits, such as the characteristics related to personality, emotions, interests, and cognitive tendencies.
5. Based on the stakeholder type and the generated demographic information, generate the social relationships for this persona. The social relationships could be the nature and dynamics of interactions with others, including roles, connections, and communication styles, such as parenting styles, interactions with players. The generated relationship must be within the provided stakeholder types.

**Important Reminders**:
- The generated personas should be diverse, realistic, and grounded in the provided characteristics, perspectives and evidence.
- Do not omit any perspective. Each perspective should have a unique corresponding persona.

**Stakeholder Information List**
{opinions}

**Output Format**
Present the output as a structured JSON list, where each key corresponds to a stakeholder name (as provided), and the value is a list of persona objects. Each persona should include the following fields:
```json
{{
  "Stakeholder Name (use the exact name from the provided stakeholder information list)": [
    {{
      "Name": "Full name of the persona",
      "Demographic Information": "One to two sentences describing the persona’s demographic profile.",
      "Perspective": "One to two sentences outlining the persona’s perspective.",
      "Specialty": "One to two sentences describing the persona’s skill, expertise, or proficiency in certain fields.",
      "Psychological Traits": "One to two sentences describing the persona’s characteristics related to personality, emotions, interests, and cognitive tendencies.",
      "Social Relationships": "One to two sentences describing the persona’s connection or interaction with others (the related people should be within the provided stakeholder types)."
    }},
    {{
      "Name": "Another persona name",
      ...
    }},
    ...
  ],
  "Another Stakeholder Name": [
    ...
  ]
  ...
}}
```
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
        ],
        temperature=1,
        response_format={"type": "json_object"}
    )
    return response.choices[0].message.content

def _merge_stakeholder_entries(merged, new_entries):
    """Fold one batch of ``{stakeholder name: info}`` entries into ``merged``."""
    for stakeholder_name, stakeholder_info in new_entries.items():
        if not isinstance(stakeholder_info, dict):
            print(f"Ignoring malformed entry for stakeholder {stakeholder_name!r}")
            continue
        perspectives = list(stakeholder_info.get("perspectives", []))
        if stakeholder_name not in merged:
            # Store a copy so later merges never mutate the per-paper data.
            merged[stakeholder_name] = {**stakeholder_info, "perspectives": perspectives}
            continue
        existing = merged[stakeholder_name]
        existing["perspectives"] = existing.get("perspectives", []) + perspectives
        extra = stakeholder_info.get("characteristics", "")
        if extra:
            # Separate the descriptions with a space instead of gluing the
            # sentences together.
            existing["characteristics"] = (
                f'{existing.get("characteristics", "")} {extra}'.strip()
            )


def _as_stakeholder_dict(parsed):
    """Return ``parsed`` as a stakeholder dict, or ``None`` if it is unusable."""
    if isinstance(parsed, dict):
        return parsed
    if isinstance(parsed, list):
        # The prompt allows an empty list for "nothing relevant"; a list of
        # dicts is flattened into a single dict.
        combined = {}
        for item in parsed:
            if isinstance(item, dict):
                _merge_stakeholder_entries(combined, item)
        return combined
    if isinstance(parsed, str) and not parsed.strip():
        # The prompt also allows "output nothing" when nothing is relevant.
        return {}
    return None


def create_personas_from_paper(task_description, vector_id_dict):
    """Run the opinion -> merged opinion -> persona pipeline with disk caching.

    For every ``paper_name -> vector_id`` pair, stakeholder opinions are
    loaded from ``./NewOpinions2Personas/IdentifiedOpinions/<paper_name>.json``
    when that file exists, otherwise extracted with ``identify_opinions`` and
    cached there. All papers' entries are combined per stakeholder name, then
    merged by the model (cached in ``MergedIdentifiedOpinions.json``) and
    turned into personas (cached in ``CreatedPersonas.json``).

    A reply that cannot be parsed into stakeholder entries is reported and
    skipped (and not cached) instead of crashing the run; an empty reply or
    empty list is treated as "no stakeholders".

    Returns:
        The personas as parsed from the model reply or loaded from the cache.
    """
    output_dir = "./NewOpinions2Personas"
    opinions_dir = f"{output_dir}/IdentifiedOpinions"
    # Make sure the cache folders exist before anything is written to them.
    os.makedirs(opinions_dir, exist_ok=True)

    all_identified_opinions = {}
    for paper_name, vector_id in vector_id_dict.items():
        opinions_path = f"{opinions_dir}/{paper_name}.json"
        if os.path.exists(opinions_path):
            identified_opinions = _as_stakeholder_dict(load_json(opinions_path))
        else:
            raw_reply = identify_opinions(task_description, vector_id)
            identified_opinions = _as_stakeholder_dict(process_json(raw_reply))
            if identified_opinions is not None:
                save_json(identified_opinions, opinions_path)
        if identified_opinions is None:
            print(f"Skipping {paper_name}: could not parse stakeholder opinions")
            continue
        _merge_stakeholder_entries(all_identified_opinions, identified_opinions)

    all_identified_opinions = json.dumps(all_identified_opinions, indent=4)
    print(all_identified_opinions)

    # Merge same opinions
    merged_path = f"{output_dir}/MergedIdentifiedOpinions.json"
    if os.path.exists(merged_path):
        merged_identified_opinions = load_json(merged_path)
    else:
        merged_identified_opinions = merge_same_opinions(task_description, all_identified_opinions)
        print(merged_identified_opinions)
        merged_identified_opinions = process_json(merged_identified_opinions)
        save_json(merged_identified_opinions, merged_path)

    # Create personas
    personas_path = f"{output_dir}/CreatedPersonas.json"
    if os.path.exists(personas_path):
        personas = load_json(personas_path)
    else:
        personas = create_persona_from_opinions(task_description, merged_identified_opinions)
        personas = process_json(personas)
        save_json(personas, personas_path)
    return personas

def main():
    """Index every PDF in ``./ChildEduPapers`` and build personas from them.

    Each ``*.pdf`` file is uploaded into its own vector store via
    ``store_a_file``; the resulting ``paper name -> vector store id`` mapping
    is passed to ``create_personas_from_paper`` and the personas are printed.
    """
    task_description = "Evaluate the quality of AI-generated question-answer pairs from the storybook content. These AI-generated question-answer pairs are designed for the interactive storybook reading activity between parents and children aged 3 to 6, and should be grammatically correct and fluent in English. Parents expect to ask questions that are grounded in the storybook content, but introduce real-world common knowledge beyond the story content."
    document_path = "./ChildEduPapers"
    vector_id_dict = {}
    # Sort for a reproducible processing (and therefore merging) order;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(document_path)):
        if not file_name.endswith(".pdf"):
            continue
        # splitext drops only the final extension, so "a.v2.pdf" and "a.pdf"
        # no longer collapse onto the same key.
        paper_name = os.path.splitext(file_name)[0]
        vector_id_dict[paper_name] = store_a_file(f"{document_path}/{file_name}")
    print(vector_id_dict)

    personas = create_personas_from_paper(task_description, vector_id_dict)
    print(personas)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


