{
  "root": {
    "name": "Slide 10: Replace URL and Update Breakdown",
    "description": "Evaluates whether the agent replaced the URL example on slide 10 with 'https://university.edu/courses/networking.html' and updated the URL breakdown labels to match, without making extraneous or unrelated modifications.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "Correct URL Replacement",
        "description": "Checks that the old URL example on slide 10 has been replaced with 'https://university.edu/courses/networking.html' and no other extraneous/incorrect URLs have been added.",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # We'll use a VLM to check for the presence of the required URL and any extraneous URLs.\n    # Find slide 10 screenshot\n    images = [s.image_path for s in modified_ppt_screenshots if s.slide_number == 10]\n    if not images:\n        return (\"No screenshot available for slide 10.\", 0.0)\n\n    required_url = 'https://university.edu/courses/networking.html'\n    prompt = (\n        f\"Analyze the slide image. The required URL is '{required_url}'.\\n\"\n        \"1. Verify if the required URL is present on the slide, matched exactly word-for-word.\\n\"\n        \"2. Identify if there are any other URLs present on the slide.\\n\\n\"\n        \"Based on your findings, respond with ONLY one of the following keywords:\\n\"\n        \"- 'CORRECT': The required URL is present, and NO other URLs are found.\\n\"\n        \"- 'EXTRANEOUS': The required URL is present, but at least one other extraneous URL is also found.\\n\"\n        \"- 'MISSING': The required URL is NOT present.\"\n    )\n    response = vlm_call(prompt, images).strip().upper()\n\n    if response == 'CORRECT':\n        return (\"Correctly replaced URL with no extraneous URLs.\", 1.0)\n    elif response == 'EXTRANEOUS':\n        return (f\"Found the required URL, but also found extraneous URLs on slide 10.\", 0.5)\n    elif response == 'MISSING':\n        return (\"Did not find the required URL on slide 10.\", 0.0)\n    else:\n        # Handle cases where the VLM might not return one of the expected keywords\n        return (\"Could not determine URL status from VLM response.\", 0.0)\n"
        },
        "score": 1.0
      },
      {
        "name": "Correct Breakdown Labels Update",
        "description": "Checks that the breakdown of the new URL on slide 10 correctly reflects the components of 'https://university.edu/courses/networking.html', and does not display labels corresponding to the previous URL structure.",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # We'll use a VLM to check the screenshot for a diagram and its position in a single call.\n    # Find slide 10 screenshot\n    images = [s.image_path for s in modified_ppt_screenshots if s.slide_number == 10]\n    if not images:\n        return (\"No screenshot available for slide 10.\", 0.0)\n\n    prompt = (\n        \"Check if the slide image shows the URL 'university.edu/courses/networking.html' with 'host name' and 'path name' labels correctly aligned to their parts.\"\n        \" 100% precise alignment is not required; if the breakdown is understandable, mark as correct.\"\n        \" Reply ONLY with one keyword:\\n\"\n        \"- BOTH: Both labels are correctly aligned.\\n\"\n        \"- HOST: Only 'host name' is correctly aligned.\\n\"\n        \"- PATH: Only 'path name' is correctly aligned.\\n\"\n        \"- NEITHER: Neither label is correctly aligned.\"\n    )\n    response = vlm_call(prompt, images).strip().upper()\n\n    if response == \"BOTH\":\n        return (\"Both 'host name' and 'path name' labels are correctly aligned.\", 1.0)\n    elif response == \"HOST\" or response == \"PATH\":\n        return (\"One of the two labels ('host name' or 'path name') is correctly aligned.\", 0.5)\n    else:  # NEITHER or any other response\n        return (\"Neither 'host name' nor 'path name' labels are correctly aligned.\", 0.0)\n"
        },
        "score": 1.0
      },
      {
        "name": "No extraneous changes to other slides or features",
        "description": "Checks that no unrelated changes were made to slides other than slide 10, or to slide 10's animations/transitions or other features.",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "No extraneous text/font changes on other slides",
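            "description": "Checks that text and font properties on slides other than slide 10 were not changed. The scorer compares only the font sizes of text runs, matched by their text, that appear in both the original and the modified slide; run text missing from the modified slide is skipped.",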
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    orig = Presentation(original_ppt_path)\n    mod = Presentation(modified_ppt_path)\n    # For each slide except slide 10\n    for slide_idx in range(len(orig.slides)):\n        if slide_idx == 9:\n            continue\n        orig_slide = orig.slides[slide_idx]\n        mod_slide = mod.slides[slide_idx] if slide_idx < len(mod.slides) else None\n        if not mod_slide:\n            continue  # Slide might be deleted, not expected for this task\n        def extract_text_fontsize(shape):\n            data = []\n            if not shape.has_text_frame:\n                return data\n            for para in shape.text_frame.paragraphs:\n                for run in para.runs:\n                    if run.text.strip():\n                        size = run.font.size.pt if run.font.size else None\n                        data.append((run.text.strip(), size))\n            return data\n        def get_slide_text_fontsize(slide):\n            out = []\n            for shape in slide.shapes:\n                out.extend(extract_text_fontsize(shape))\n            return out\n        orig_text_sizes = get_slide_text_fontsize(orig_slide)\n        mod_text_sizes = get_slide_text_fontsize(mod_slide)\n        orig_map = {}\n        for t, sz in orig_text_sizes:\n            orig_map.setdefault(t, []).append(sz)\n        mod_map = {}\n        for t, sz in mod_text_sizes:\n            mod_map.setdefault(t, []).append(sz)\n        # For every text, check font sizes unchanged\n        for text in orig_map:\n            if text not in mod_map:\n                continue\n            if orig_map[text] != mod_map[text]:\n                return (f\"Font size for text '{text}' changed on slide {slide_idx+1}.\", 0.0)\n    return (\"No text/font changes on other slides.\", 1.0)\n"
            },
            "score": 1.0
          },
          {
            "name": "No extraneous changes to animations or transitions on any slide",
            "description": "Checks that no animation or transition was added, removed, or modified in the presentation.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    # Check ppt_diff for any animation/transition changes\n    if ppt_diff.added_animations or ppt_diff.removed_animations or ppt_diff.modified_animations:\n        return (\"Extraneous animation change(s) detected.\", 0.0)\n    if ppt_diff.added_transitions or ppt_diff.removed_transitions or ppt_diff.modified_transitions:\n        return (\"Extraneous transition change(s) detected.\", 0.0)\n    return (\"No extraneous animation or transition changes.\", 1.0)\n"
            },
            "score": 1.0
          }
        ],
        "score": 1.0
      },
      {
        "name": "No extraneous slides added or removed",
        "description": "Checks that no slides were added or removed in the presentation.",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # Use ppt_diff for slide structure changes\n    if ppt_diff.added_slides:\n        return (\"Slides were added, which is extraneous.\", 0.0)\n    if ppt_diff.removed_slides:\n        return (\"Slides were removed, which is extraneous.\", 0.0)\n    return (\"No extraneous slides added or removed.\", 1.0)\n"
        },
        "score": 1.0
      }
    ],
    "score": 1.0
  },
  "metadata": {
    "task": "Slide 10: Replace the URL example with 'https://university.edu/courses/networking.html' and update the URL labels breakdown accordingly"
  }
}