def get_description_prompt(args, data):
    """Build the prompt asking the model to describe a diagram's scene graph.

    The prompt is made of a fixed system turn, a user turn whose
    instructions and example sentences are chosen from ``data["tag"]``,
    and the opening marker of the assistant turn.

    Args:
        args: Run configuration. ``args.use_wiki_text`` is read here (the
            Wikipedia background is included only when it equals
            ``"wiki"``); ``args`` is also passed to ``get_special_tokens``.
        data: Sample record. An optional ``"tag"`` entry selects the
            instruction set; the record is forwarded to ``wiki_text_info``
            when Wikipedia background is enabled.

    Returns:
        str: The fully assembled prompt.
    """
    tokens = get_special_tokens(args)

    # Tags that get chart/map-oriented instructions.
    chart_tags = ("Bar Chart", "Line Graph", "Pie Chart", "Map")
    # Tags that get the instructions for scientific/academic diagrams.
    science_tags = (
        "Astronomy",
        "Biology",
        "Chemistry",
        "Computer Science",
        "History",
        "Mathematics",
        "Music",
        "Physics",
    )

    system_turn = (
        f'{tokens["bosys"]}'
        'You are a scene graph construction assistant. '
        'A scene graph is a structured representation of a diagram that captures its elements as nodes and their relationships as edges. '
        'The term "diagram" here refers to any complicated image, including charts, infographics, '
        'illustrations, academic diagrams, and other visually complex representations. \n'
        'Your task is to generate a detailed language-based description of a scene graph for a provided diagram. \n'
        f'{tokens["eot"]}\n'
    )

    # A missing tag behaves like an unrecognised one: generic instructions.
    tag = data["tag"] if "tag" in data else None
    if tag in chart_tags:
        guidance = (
            'Instructions: \n'
            '1. Identify key elements such as axes, labels, legends, colors, and numerical values. \n'
            '2. Describe trends, patterns, or outliers in the data, including peaks, drops, or correlations. \n'
            '3. Explain relationships between different variables if applicable. \n'
            '4. Describe geographical features, directional arrows, and colored regions if applicable. \n'
            '5. Use clear and structured language. No need to explicitly mention the instructions above. \n'
            'Example Sentences: \n'
            'The bar representing Q3 in 2019 is the tallest among all quarters. \n'
            'The blue line in the graph shows a steady increase from 2010 to 2018. \n'
            'The dark green segment in the pie chart represents 45.9 TWh of diesel consumption. \n'
            'The shaded region in the map highlights areas with the highest population density. \n'
            'The thick arrow marks the strongest southeastern wind current towards the country. \n'
        )
    elif tag in science_tags:
        guidance = (
            'Instructions: \n'
            '1. Identify key objects, such as text, arrows, nodes, or data points. \n'
            '2. Identify attributes, such as size, color, shape, orientation, position, and numerical values. \n'
            '3. Explain how objects interact or relate to one another. \n'
            '4. Describe its overall hierarchy, structure or flow clearly if applicable. \n'
            '5. Use clear and structured language. No need to explicitly mention the instructions above. \n'
            'Example Sentences: \n'
            'The newly discovered moon is connected to its elliptical orbit around Neptune. \n'
            'The blue alpha-helices are connected to beta-sheets through loop regions. \n'
            'The amine group (-NH₂) is added to the benzene ring at a new position after the reaction. \n'
            'Each yellow triangular face is attached to three metallic rods at its edges via spherical joints. \n'
            'The E-flat note is positioned directly below the B-flat note on the staff. \n'
        )
    else:
        guidance = (
            'Instructions: \n'
            '1. Identify Key Objects: List out important objects in the image, such as axes, labels, shapes, arrows, text, nodes, or data points. \n'
            '2. Describe Attributes: Include relevant details like size, color, shape, orientation, position, numerical values, and labels. \n'
            '3. Define Relationships: Explain how objects interact or relate to one another. \n'
            '4. Explain Structural or Hierarchical Information: If the diagram represents a flow, tree, or network, describe its structure clearly. \n'
            '5. Highlight Data Trends (if applicable): If the image contains data (e.g., a chart or graph), explain trends, outliers, or meaningful patterns. \n'
            '6. Use Clear and Structured Language. No need to explicitly mention the instructions above. \n'
            'Example Sentences: \n'
            'The bar chart displays quarterly revenue for "Company A", "Company B", and "Company C". The Y-axis ranges from $0 to $50 million, with "Company A" having the highest revenue. \n'
            'A subway map highlights the "Red Line", "Blue Line", and "Green Line". Major stations such as "Central Station" and "Union Square" are marked with black dots. \n'
            'The molecular diagram represents magnesium phosphate, with two "Mg²⁺" ions interacting with phosphate groups containing oxygen and phosphorus atoms. \n'
            'A flowchart visualizes an e-commerce order process, beginning with "User Adds Item to Cart" and ending at "Order Shipped", with a decision node for "Payment Successful?". \n'
            'A hierarchical tree diagram shows a company structure where "CEO" branches into three departments: "Engineering", "Marketing", and "Finance", each with their respective managers. \n'
        )

    # Every variant shares the same wrapper: user header, image placeholder,
    # optional Wikipedia background, then the end-of-turn marker.
    background = wiki_text_info(data) if args.use_wiki_text == "wiki" else ""
    user_turn = (
        f'{tokens["bouser"]}'
        f'{tokens["img"]}'
        f'{background}'
        f'{guidance}'
        f'{tokens["eot"]}\n'
    )

    return tokens["botext"] + system_turn + user_turn + tokens["boasst"]


def get_triple_prompt(args, data):
    """Build the prompt asking the model to extract triples from a description.

    The prompt consists of a system turn, a user turn that embeds
    ``data["description"]`` followed by the extraction instructions and the
    required JSON layout, and the opening marker of the assistant turn.

    Args:
        args: Run configuration, passed to ``get_special_tokens``.
        data: Sample record; ``data["description"]`` is read directly, so
            the entry must be present.

    Returns:
        str: The fully assembled prompt.
    """
    tokens = get_special_tokens(args)
    description = data["description"]

    system_turn = (
        f'{tokens["bosys"]}'
        'You are an expert information extraction assistant specializing in scene graph construction. '
        'Your task is to analyze a given diagram description and extract meaningful, structured relationships between key elements. \n'
        f'{tokens["eot"]}\n'
    )

    # The JSON skeleton lines are plain (non-f) literals so their braces
    # are emitted verbatim.
    user_turn = (
        f'{tokens["bouser"]}'
        f'{tokens["img"]}'
        'The description of the diagram is provided below: \n'
        f'{description}\n\n'
        'Instructions: \n'
        '1. Identify important relationships between key elements from the description.\n'
        '2. Structure these relationships in the form of triples with three components:\n'
        '   - **Source**: The primary element (subject) in the relationship.\n'
        '   - **Relationship**: The type of connection between the source and target.\n'
        '   - **Target**: The secondary element (object) in the relationship.\n'
        '3. Ensure that:\n'
        '   - Each triple represents a meaningful connection between elements.\n'
        '   - The relationships are concise yet descriptive (e.g., "connects to", "is labeled as", "is part of").\n'
        '   - There are no duplicate, redundant, or meaningless triples.\n'
        '4. The final output must strictly follow the JSON format below:\n'
        '{\n'
        '  "1": {"Source": "Triple 1", "Relationship": "Triple 1", "Target": "Triple 1"}, \n'
        '  "2": {"Source": "Triple 2", "Relationship": "Triple 2", "Target": "Triple 2"}, \n'
        '  ... \n'
        '  "N": {"Source": "Triple N", "Relationship": "Triple N", "Target": "Triple N"}\n'
        '}\n'
        '5. Ensure JSON validity. No explanations, extra text, or additional formatting beyond the JSON structure.\n'
        f'{tokens["eot"]}\n'
    )

    return tokens["botext"] + system_turn + user_turn + tokens["boasst"]


def get_special_tokens(args):
    """Return the chat-template marker strings for the configured model.

    Args:
        args: Run configuration; only ``args.lvlm_path`` is read.

    Returns:
        dict: Maps ``"botext"``, ``"bosys"``, ``"bouser"``, ``"boasst"``,
        ``"eot"`` and ``"img"`` to marker strings. Every value is an empty
        string unless ``args.lvlm_path`` is exactly
        ``"meta-llama/Llama-3.2-11B-Vision"``.
    """
    if args.lvlm_path != "meta-llama/Llama-3.2-11B-Vision":
        # Any other model gets no markers, so prompts are plain text.
        return dict.fromkeys(
            ("botext", "bosys", "bouser", "boasst", "eot", "img"), ''
        )

    def header(role):
        # Role header line that opens a turn.
        return f'<|start_header_id|>{role}<|end_header_id|>\n'

    return {
        "botext": '<|begin_of_text|>\n\n',
        "bosys": header('system'),
        "bouser": header('user'),
        "boasst": header('assistant'),
        "eot": '<|eot_id|>\n',
        "img": 'Diagram: <|image|>\n',
    }


def wiki_text_info(data):
    """Format the Wikipedia background section inserted into a user turn.

    Missing or ``None`` fields fall back to placeholder text instead of
    raising, so a sample with incomplete metadata still yields a prompt.

    Args:
        data: Sample record. The first entry of ``data['webpages']``
            supplies ``page_title`` and ``clean_page_description``;
            ``data['image_ref_desc']`` supplies the diagram description.

    Returns:
        str: Background text listing the page title, page description and
        diagram description, followed by guidance to prefer what is
        visible in the diagram. A diagram description longer than 500
        characters is replaced by ``'Not provided.'``.
    """
    # Tolerate an absent, None or empty page list (and a None first entry)
    # rather than raising KeyError/IndexError/AttributeError.
    webpages = data.get('webpages') or [{}]
    page_data = webpages[0] or {}

    page_title = page_data.get('page_title')
    if page_title is None:
        page_title = 'Unknown Title'

    page_desc = page_data.get('clean_page_description')
    if page_desc is None:
        page_desc = 'Not provided.'

    image_ref_desc = data.get('image_ref_desc')
    # A None value would break len(); over-long descriptions are dropped.
    if image_ref_desc is None or len(image_ref_desc) > 500:
        image_ref_desc = 'Not provided.'

    prompt = (
        'The diagram is sourced from Wikipedia, and here is some background information: \n'
        f'- Page Title: {page_title}\n'
        f'- Page Description: {page_desc}\n'
        f'- Diagram Description: {image_ref_desc}\n'
        'Use the Wikipedia information above only if the diagram alone does not provide enough clarity or context. '
        'Always give priority to the information directly visible in the diagram for your analysis. '
        '\n\n'
    )
    return prompt

