{
  "root": {
    "name": "Change font color of 'Conclusion:' on slide 23 from white to yellow",
    "description": "Evaluates whether the agent successfully changed the font color of the text 'Conclusion:' on slide 23 from white to yellow, without introducing unintended modifications.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "Correctly changed font color of 'Conclusion:' to yellow on slide 23",
        "description": "Checks that the text 'Conclusion:' on slide 23 is present and its font color has been changed to yellow.",
        "is_critical": true,
        "metadata": {},
        "children": [
          {
            "name": "Text 'Conclusion:' is present on slide 23",
            "description": "Verifies that the text 'Conclusion:' still exists on slide 23 after the modification.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Report whether any text-frame paragraph on slide 23 contains 'Conclusion:'.\n\n    Returns a (message, score) pair: 1.0 when the text is found, 0.0 when it is\n    missing or the deck has fewer than 23 slides.\n    \"\"\"\n    from pptx import Presentation\n\n    deck = Presentation(modified_ppt_path)\n    # Slides are counted from 1, so the 23rd slide in deck order is the target.\n    target = next((sl for pos, sl in enumerate(deck.slides, 1) if pos == 23), None)\n    if target is None:\n        return (\"Slide 23 does not exist in the modified presentation.\", 0.0)\n\n    # Only shapes that carry a text frame are inspected; any() stops at the first hit.\n    has_label = any(\n        'Conclusion:' in para.text\n        for shp in target.shapes\n        if shp.has_text_frame\n        for para in shp.text_frame.paragraphs\n    )\n    if not has_label:\n        return (\"'Conclusion:' text is missing from slide 23.\", 0.0)\n    return (\"'Conclusion:' text is present on slide 23.\", 1.0)\n"
            }
          },
          {
            "name": "Font color of 'Conclusion:' on slide 23 is yellow",
            "description": "Checks that the font color of the text 'Conclusion:' on slide 23 is set to yellow.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Ask the VLM whether 'Conclusion' renders in yellow on the slide 23 screenshot.\n\n    Returns a (message, score) pair: 1.0 when the reply contains the standalone\n    word 'yes', otherwise 0.0 with the raw reply (or a missing-screenshot note).\n    \"\"\"\n    import re\n\n    # Find the slide 23 screenshot\n    img = None\n    for ss in modified_ppt_screenshots:\n        if ss.slide_number == 23:\n            img = ss.image_path\n            break\n    if not img:\n        return \"No screenshot of slide 23 found.\", 0.0\n    # Use VLM to check if the text 'Conclusion' is yellow\n    prompt = \"Is the text 'Conclusion' yellow? Reply 'yes' or 'no'.\"\n    result = vlm_call(prompt, [img], temperature=0.0, max_tokens=10)\n    # Match 'yes' as a whole word so an unrelated substring (e.g. 'eyes') cannot pass;\n    # str() keeps a None or non-string reply from raising here.\n    if re.search(r'\\byes\\b', str(result).lower()):\n        return \"The text 'Conclusion' is yellow.\", 1.0\n    return f\"VLM response: {result}\", 0.0\n"
            }
          }
        ]
      },
      {
        "name": "No extraneous changes were made",
        "description": "Checks that no unrelated modifications were made to animations, transitions, slides, or slide content except the intended font color change.",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "No unrelated animations or transitions were changed",
            "description": "Ensures that no animations or transitions were added, removed, or modified.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Score 1.0 when ppt_diff reports no animation or transition changes, else 0.0.\"\"\"\n    watched_fields = (\n        \"added_animations\", \"removed_animations\", \"modified_animations\",\n        \"added_transitions\", \"removed_transitions\", \"modified_transitions\",\n    )\n    # The generator reads the fields lazily and in order, stopping at the first truthy one.\n    if any(getattr(ppt_diff, field) for field in watched_fields):\n        return (\"Animations or transitions were modified.\", 0.0)\n    return (\"No animations or transitions were changed.\", 1.0)\n"
            }
          },
          {
            "name": "No unrelated slides were added or removed",
            "description": "Ensures that no slides were added or removed anywhere in the deck, including at position 23, since the task only recolors existing text.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Score 1.0 only when ppt_diff reports no added and no removed slides.\n\n    The task recolors text on an existing slide, so any slide addition or\n    removal is out of scope, including one at position 23.\n    \"\"\"\n    if ppt_diff.added_slides or ppt_diff.removed_slides:\n        return (\"Unrelated slides were added or removed.\", 0.0)\n    return (\"No unrelated slides were added or removed.\", 1.0)\n"
            }
          },
          {
            "name": "No unrelated content was changed on slide 23",
            "description": "Ensures that, aside from the font color of 'Conclusion:', no other content on slide 23 was changed.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Have the VLM compare the before/after screenshots of slide 23.\n\n    Returns a (message, score) pair: 1.0 when the reply contains\n    'only intended change' (case-insensitive), otherwise 0.0 with the reply.\n    \"\"\"\n    def _slide_23_shot(shots):\n        # First screenshot whose slide_number is 23, or None when there is none.\n        for shot in shots:\n            if shot.slide_number == 23:\n                return shot\n        return None\n\n    before = _slide_23_shot(original_ppt_screenshots)\n    after = _slide_23_shot(modified_ppt_screenshots)\n    if not (before and after):\n        return (\"Missing slide 23 screenshots for visual comparison.\", 0.0)\n\n    # General content drift is judged visually; the prompt names the one expected change.\n    instructions = (\n        \"Compare these two images of slide 23 in a PowerPoint presentation. The only acceptable difference \"\n        \"should be that the text 'Conclusion:' changes from white to yellow. If you see any other differences, \"\n        \"explain what they are. If the only difference is as described, say 'Only intended change'.\"\n    )\n    verdict = vlm_call(instructions, [before.image_path, after.image_path], temperature=0.0, max_tokens=128)\n    if 'only intended change' not in verdict.lower():\n        return (f\"Detected unrelated changes on slide 23: {verdict}\", 0.0)\n    return (\"No unrelated content was changed on slide 23.\", 1.0)\n"
            }
          }
        ]
      }
    ]
  },
  "metadata": {
    "task": "Change the font color of 'Conclusion:' on slide 23 from white to yellow"
  }
}