import atexit
import json
import os
import time
from io import BytesIO

import openai
import PyPDF2
import requests
import wandb

# Model identifier: passed to the sglang server via --model-path and sent as
# the `model` field of every chat completion request in this script.
MODEL_NAME = "Qwen/Qwen3-235B-A22B"

from sglang.utils import launch_server_cmd
from sglang.utils import wait_for_server, print_highlight, terminate_process

# Start a local sglang server for MODEL_NAME; the call yields the server
# process handle and the port it was launched on.
server_process, port = launch_server_cmd(
    f"uv run -m sglang.launch_server --model-path {MODEL_NAME} --tp 8 --attention-backend fa3"
)
# Register cleanup immediately so the server process is terminated when the
# script exits, including when startup or a later pipeline step raises.
atexit.register(terminate_process, server_process)


wait_for_server(f"http://localhost:{port}")
print(f"Server started on http://localhost:{port}")

# Synchronous client: the request helpers in this file call
# client.chat.completions.create(...) without awaiting and read
# response.choices directly, which only works with the blocking client.
client = openai.OpenAI(base_url=f"http://localhost:{port}/v1", api_key="None")

def load_json(file_path):
    """Read the file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by four spaces.

    The data is serialized to a string *before* the target file is opened.
    If ``data`` cannot be serialized, the exception propagates and any
    existing file at ``file_path`` is left untouched instead of being
    truncated or half-written. This matters because the pipeline treats the
    mere existence of an output file as a valid cached result.

    Args:
        data: JSON-serializable object.
        file_path: Destination path; overwritten if it already exists.

    Raises:
        TypeError: If ``data`` contains an object ``json`` cannot encode.
        ValueError: If ``data`` contains a circular reference.
    """
    serialized = json.dumps(data, indent=4)
    with open(file_path, "w") as f:
        f.write(serialized)

def process_json(json_string):
    """Best-effort conversion of model output text into a Python object.

    Takes the content of the first json-tagged Markdown fence if one is
    present, drops one pair of wrapping double quotes, then tries to parse
    the remainder as JSON. When parsing fails the error is printed and the
    (possibly trimmed) text is returned unchanged.
    """
    fence_open = "```json"
    if fence_open in json_string:
        after_open = json_string.split(fence_open)[1]
        json_string = after_open.split("```", 1)[0]

    wrapped_in_quotes = json_string.startswith('"') and json_string.endswith('"')
    if wrapped_in_quotes:
        json_string = json_string[1:-1]

    try:
        return json.loads(json_string)
    except Exception as e:
        print(e)
        return json_string

def read_pdf_content(file_path):
    """Read the text of a PDF given a local path or an http(s) URL.

    Args:
        file_path: Local filesystem path, or a URL starting with
            ``http://`` or ``https://``.

    Returns:
        The extracted text of all pages, each page followed by a newline,
        or ``None`` if the document could not be fetched, opened or parsed
        (the error is printed).
    """
    try:
        if file_path.startswith(("http://", "https://")):
            # Download the document into memory. The timeout keeps a stalled
            # connection from hanging the pipeline, and raise_for_status
            # reports HTTP errors instead of feeding an error page to the
            # PDF parser.
            response = requests.get(file_path, timeout=60)
            response.raise_for_status()
            pdf_file = BytesIO(response.content)
        else:
            pdf_file = open(file_path, "rb")
        # Both BytesIO and file objects are context managers, so the handle
        # is closed even when parsing raises.
        with pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` treats a page that yields no text (falsy result) as
            # empty rather than aborting the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
    except Exception as e:
        # Deliberate best-effort boundary: callers skip papers that return None.
        print(f"Error reading PDF {file_path}: {e}")
        return None

def identify_opinions(task_description, paper_content):
    """Ask the model to extract stakeholders and their perspectives from a paper.

    Args:
        task_description: Description of the text evaluation task, inserted
            into the prompt.
        paper_content: Text of the paper to analyze, inserted into the prompt.

    Returns:
        The message content of the first choice in the completion response.
    """
    prompt = f"""You need to identify or construct a diverse and comprehensive set of stakeholders, their characteristics, and their perspectives for the following text evaluation task:
***{task_description}***

Here is the paper content to analyze:
{paper_content}

**Guidelines**
- For this given paper, read one paragraph at a time. Ignore the related work section and references list.
Step 1 - Identify *ALL* mentioned name entities, excluding the authors and their institutions, as well as non-human entities.
Step 2 - For each name entity (i.e., stakeholder) you identified, generate the descriptive characteristics for this stakeholder. Then extract the stakeholder’s evaluative perspectives or opinions (e.g., criteria, dimensions, metrics, values, concerns) that are explicitly tied to the aforementioned text evaluation task. 
- The perspective focus should be the evaluation content mentioned in the task description.
- All extracted information must be directly supported by the source text and accompanied by clear textual evidence.

** Important Reminders**
- If in the provided paper, no relevant information is mentioned about the evaluation task, output nothing.
- In generation, prioritize capturing a wide range of stakeholders and their perspectives, including those that might emerge from different roles, backgrounds, and needs.
- Each stakeholder is expected to have multiple perspectives or opinions. Do not include the stakeholder if there's only one perspective or opinion.
- The final generated stakeholder entry should each clearly include:
     1. The stakeholder name (e.g., representative role or group, should be *distinct* from other stakeholders),
     2. The stakeholder's characteristics,
     3. The stakeholder's perspectives that are specifically related to the aforementioned evaluation task,
     4. The supporting evidence from the provided papers.
    
**Output Format**
- If the provided paper contains relevant information about the evaluation task, present the output as a structured JSON dict, with each item formatted as an object containing the following fields:
```json
{{
    "stakeholder name (representative *HUMAN* role or group)": {{
        "characteristics": "use one sentence to describe the stakeholder's characteristics",
        "perspectives": [
            {{
                "perspective": "use a sentence to describe the stakeholder's perspectives or opinions that are relevant to the aforementioned evaluation task",
                "evidence": "supporting evidence from the provided paper"
            }},
            {{
                "perspective": "use a sentence to describe the stakeholder's perspectives or opinions that are relevant to the aforementioned evaluation task",
                "evidence": "supporting evidence from the provided paper"
            }},
            ...
        ]
    }},
    "another stakeholder name (representative *HUMAN* role or group)": {{
        "characteristics": "use one sentence to describe the stakeholder's characteristics",
        "perspectives": [
            {{
                "perspective": "use a sentence to describe the stakeholder's perspectives or opinions that are relevant to the aforementioned evaluation task",
                "evidence": "supporting evidence from the provided paper"
            }},
            {{
                "perspective": "use a sentence to describe the stakeholder's perspectives or opinions that are relevant to the aforementioned evaluation task",
                "evidence": "supporting evidence from the provided paper"
            }},
            ...
        ]
    }},
    "another stakeholder name (representative *HUMAN* role or group)": {{
        ...
    }},
    ...
}}
```

- Otherwise, output an empty list:
```json
[]
```

- You must strictly follow the provided format and output the JSON object directly. 
- You must include valid content in the JSON object, instead of duplicating the provided output format.
"""
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=1,
        response_format={"type": "json_object"},
        extra_body={
            "chat_template_kwargs": {"enable_thinking": True},
        },
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def merge_same_opinions(task_description, all_identified_opinions):
    """Ask the model to merge entries belonging to the same stakeholder.

    Args:
        task_description: Description of the evaluation task, inserted into
            the prompt.
        all_identified_opinions: Stakeholder opinion list, inserted verbatim
            into the prompt.

    Returns:
        The message content of the first choice in the completion response.
    """
    prompt = f"""You need to merge the same stakeholder perspectives for the following evaluation task:
***{task_description}***

**Stakeholder Opinion List**
{all_identified_opinions}

**Guidelines**
- Read through the provided stakeholder perspective list. For each stakeholder, read both their listed characteristics and all associated perspectives.
- If multiple stakeholders belong to the same or closely related category (e.g., similar occupations, backgrounds, or roles), treat them as the same stakeholder and *merge their perspective entries that are relevant to the aforementioned evaluation task accordingly*.
- Each stakeholder is expected to have multiple perspectives or opinions. Do not include the stakeholder if there's only one perspective or opinion.
- When merging multiple entries for the same stakeholder, combine all their listed characteristics into a single, unified description.

**Output Format**
Use a dict to include all stakeholders and their perspectives.
Present the output as ONE structured JSON dict, with each stakeholder item formatted as an object containing the following fields:
```json
{{
    "stakeholder name": {{
        "characteristics": "Description of the stakeholder's characteristics",
        "perspectives": [
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            ...
        ]
    }},
    "stakeholder name": {{
        "characteristics": "Description of the stakeholder's characteristics",
        "perspectives": [
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            {{
                "perspective": "...",
                "evidence": "..."
            }},
            ...
        ]
    }},
    ...
}}  
```
"""
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=1,
        response_format={"type": "json_object"},
        extra_body={
            "chat_template_kwargs": {"enable_thinking": True},
        },
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def create_persona_from_opinions(task_description, opinions):
    """Ask the model to create one persona per stakeholder perspective.

    Args:
        task_description: Description of the evaluation task, inserted into
            the prompt.
        opinions: Stakeholder perspective list. A string is inserted into
            the prompt as-is; any other object (e.g. a parsed dict) is first
            serialized as indented JSON.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        TypeError: If a non-string ``opinions`` is not JSON-serializable.
    """
    # The caller passes the parsed (dict) form of the merged opinions.
    # Interpolating that directly would put a Python repr (single quotes,
    # None/True) into the prompt; serialize it as JSON instead, consistent
    # with the JSON text the caller hands to merge_same_opinions.
    if isinstance(opinions, str):
        opinions_text = opinions
    else:
        opinions_text = json.dumps(opinions, indent=4, ensure_ascii=False)
    messages = [
        {"role": "user", "content": f"""You need to create stakeholder personas for the following evaluation task:
***{task_description}***

IMPORTANT: 
- You must immediately generate the complete JSON output following the structure below.
- Do not ask any questions, do not confirm any details, and do not provide any explanation. 
- Begin your response directly with the JSON object. 
- Do not use any natural language before or after the output.

**Guidelines**
- For EACH mentioned perspective of the stakeholder, generate a distinct persona that embodies the corresponding perspective.
- For EACH generated persona, you must include the following attributes following these steps:
1. Based on the name and characteristic of this stakeholder type, as well as the corresponding perspective and evidence, generate the persona's demographic information. The generated demographic information should include name, age, education, career, and personality. The generated demographic information should be diverse and realistic.
2. Based on the exact perspective, rephrase the perspective to be suitable for this persona and the aforementioned text evaluation task. Do not add unnecessary information or details.
3. Based on the stakeholder type and the generated demographic information, generate a specialty for this persona. The specialty could be this persona's skill, expertise, or proficiency in real-life related to the aforementioned task.
4. Based on the stakeholder type and the generated demographic information, generate the persona's psychological traits, such as the characteristics related to personality, emotions, interests, and cognitive tendencies.
5. Based on the stakeholder type and the generated demographic information, generate the social relationships for this persona. The social relationships could be the nature and dynamics of interactions with others, including roles, connections, and communication styles, such as parenting styles, interactions with players. The generated relationship must be within the provided stakeholder types.

**Important Reminders**:
- The generated personas should be diverse, realistic, and grounded in the provided characteristics, perspectives and evidence.
- *Do not omit any perspective*. You must create a corresponding persona for each perspective.
- You should not merge the perspectives of different stakeholders into one persona. For each stakeholder, the total number of personas should be equal to the total number of perspectives. For instance, if one stakeholder has six perspectives, you should create six personas under this stakeholder.

**Stakeholder Perspective List**
{opinions_text}

IMPORTANT: You must process **ALL** stakeholders in the list, not just the first few.

**Output Format**
Directly output a structured JSON list including personas for *ALL* stakeholders in the provided list. Each key corresponds to a stakeholder name (as provided), and the value is a list of persona objects.
```json
{{
  "Stakeholder Name (use the exact name from the provided stakeholder information list)": [
      {{
            "Name": "Full name of the persona",
            "Demographic Information": "One to two sentences describing the persona's demographic profile.",
            "Perspective": "One to two sentences outlining the persona's perspective.",
            "Specialty": "One to two sentences describing the persona's skill, expertise, or proficiency in certain fields.",
            "Psychological Traits": "One to two sentences describing the persona's characteristics related to personality, emotions, interests, and cognitive tendencies.",
            "Social Relationships": "One to two sentences describing the persona's connection or interaction with others (the related people should be within the provided stakeholder types)."
        }},
        {{
            "Name": "Another persona name",
            ...
        }},
        {{
            "Name": "Another persona name",
            ...
        }},
        ... (Do not omit any persona. The number of personas should be equal to the total number of perspectives)
      ],
  "Stakeholder Name (use the exact name from the provided stakeholder information list)": [
      {{
            "Name": "Full name of the persona",
            "Demographic Information": "One to two sentences describing the persona's demographic profile.",
            "Perspective": "One to two sentences outlining the persona's perspective.",
            "Specialty": "One to two sentences describing the persona's skill, expertise, or proficiency in certain fields.",
            "Psychological Traits": "One to two sentences describing the persona's characteristics related to personality, emotions, interests, and cognitive tendencies.",
            "Social Relationships": "One to two sentences describing the persona's connection or interaction with others (the related people should be within the provided stakeholder types)."
        }},
        {{
            "Name": "Another persona name",
            ...
        }},
        {{
            "Name": "Another persona name",
            ...
        }},
        ... (Do not omit any persona. The number of personas should be equal to the total number of perspectives)
      ],
  "Stakeholder Name": [
      ... (Do not omit any persona. The number of personas should be equal to the total number of perspectives)
      ],
  ... (Include all stakeholders personas)
}}
```

Final Notes:
- Do not skip any stakeholder group.
- Every stakeholder must appear as a top-level key.
- Each persona in the list must match exactly one unique perspective.
- You must directly output a complete JSON dictionary with ALL stakeholder personas included. No question, placeholder or "more would follow" comments allowed.
"""}
    ]
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=1,
        response_format={"type": "json_object"},
        extra_body={
            "chat_template_kwargs": {"enable_thinking": True},
        },
    )
    return response.choices[0].message.content

def _merge_stakeholder_opinions(merged, paper_opinions):
    """Fold one paper's stakeholder entries into the running ``merged`` dict (in place)."""
    for stakeholder_name, stakeholder_info in paper_opinions.items():
        if not isinstance(stakeholder_info, dict):
            print(f"Skipping malformed entry for stakeholder {stakeholder_name!r}")
            continue
        # Model output is not guaranteed to contain both fields with the
        # expected types; fall back to empty values instead of raising.
        perspectives = stakeholder_info.get("perspectives")
        if not isinstance(perspectives, list):
            perspectives = []
        characteristics = stakeholder_info.get("characteristics")
        if not isinstance(characteristics, str):
            characteristics = ""

        existing = merged.get(stakeholder_name)
        if existing is None:
            # Store a copy so later merges never mutate the per-paper data.
            merged[stakeholder_name] = {
                **stakeholder_info,
                "characteristics": characteristics,
                "perspectives": list(perspectives),
            }
        else:
            existing["perspectives"].extend(perspectives)
            # Separate the descriptions with a space so sentences from
            # different papers are not glued together.
            existing["characteristics"] = " ".join(
                part for part in (existing["characteristics"], characteristics) if part
            )


def create_personas_from_paper(task_description, document_path, output_dir="./Qwen235Opinions2Personas"):
    """Run the opinion -> merged opinion -> persona pipeline over a folder of PDFs.

    Each stage caches its result as JSON under ``output_dir`` and is skipped
    when its output file already exists.

    Args:
        task_description: Description of the evaluation task, forwarded to
            every prompt.
        document_path: Directory whose ``*.pdf`` files are processed.
        output_dir: Directory for cached outputs. Per-paper opinions go to
            ``<output_dir>/IdentifiedOpinions/<paper>.json``; merged opinions
            and personas go to ``MergedIdentifiedOpinions.json`` and
            ``CreatedPersonas.json``.

    Returns:
        The personas as loaded from cache or as returned by ``process_json``
        (which is the raw model text if it could not be parsed as JSON).
    """
    opinions_dir = os.path.join(output_dir, "IdentifiedOpinions")
    os.makedirs(opinions_dir, exist_ok=True)

    all_identified_opinions = {}
    # Sorted so the merge order, and therefore the merge prompt, is reproducible.
    for file_name in sorted(os.listdir(document_path)):
        if not file_name.endswith(".pdf"):
            continue
        # splitext only drops the final extension, so "a.v1.pdf" and
        # "a.v2.pdf" get distinct cache files instead of colliding on "a".
        paper_name = os.path.splitext(file_name)[0]
        cache_path = os.path.join(opinions_dir, f"{paper_name}.json")

        if os.path.exists(cache_path):
            identified_opinions = load_json(cache_path)
        else:
            paper_content = read_pdf_content(os.path.join(document_path, file_name))
            if not paper_content:
                continue
            identified_opinions = process_json(
                identify_opinions(task_description, paper_content)
            )
            # Cache only parsed JSON. Unparseable text (returned as a str)
            # is not cached, so the paper is retried on the next run.
            if isinstance(identified_opinions, (dict, list)):
                save_json(identified_opinions, cache_path)

        # The prompt allows an empty list for irrelevant papers, and a parse
        # failure yields a string; neither has stakeholder entries to merge.
        if not isinstance(identified_opinions, dict):
            print(f"No stakeholder dict identified for {file_name}; skipping")
            continue
        _merge_stakeholder_opinions(all_identified_opinions, identified_opinions)

    all_identified_opinions = json.dumps(all_identified_opinions, indent=4)
    print(all_identified_opinions)

    # Merge same opinions
    merged_path = os.path.join(output_dir, "MergedIdentifiedOpinions.json")
    if os.path.exists(merged_path):
        merged_identified_opinions = load_json(merged_path)
    else:
        merged_identified_opinions = merge_same_opinions(task_description, all_identified_opinions)
        print(merged_identified_opinions)
        merged_identified_opinions = process_json(merged_identified_opinions)
        save_json(merged_identified_opinions, merged_path)

    # Create personas
    personas_path = os.path.join(output_dir, "CreatedPersonas.json")
    if os.path.exists(personas_path):
        personas = load_json(personas_path)
    else:
        personas = create_persona_from_opinions(task_description, merged_identified_opinions)
        personas = process_json(personas)
        save_json(personas, personas_path)
    return personas

def main():
    """Create personas from the papers in ./ChildEduPapers and upload the JSON outputs to wandb."""
    task_description = "Evaluate the quality of AI-generated question-answer pairs from the storybook content. These AI-generated question-answer pairs are designed for the interactive storybook reading activity between parents and children aged 3 to 6, and should be grammatically correct and fluent in English. Parents expect to ask questions that are grounded in the storybook content, but introduce real-world common knowledge beyond the story content."
    document_path = "./ChildEduPapers"

    personas = create_personas_from_paper(task_description, document_path)
    print(personas)

    # Upload every JSON file under the output directory (subdirectories
    # included) as a single wandb artifact.
    wandb.init(project="persona_creation_qwen", name="test-upload")
    artifact = wandb.Artifact('Qwen235Opinions2Personas_jsons', type='output')
    for dir_path, _subdirs, file_names in os.walk("./Qwen235Opinions2Personas"):
        json_names = [entry for entry in file_names if entry.endswith('.json')]
        for json_name in json_names:
            artifact.add_file(os.path.join(dir_path, json_name))
    wandb.log_artifact(artifact)
    wandb.finish()


if __name__ == "__main__":
    main()

