{
  "root": {
    "name": "Replace IP Address and Port Bullet Points with Diagram on Slide 8",
    "description": "Evaluates whether the agent removed the bullet points for IP address and port number from slide 8 and replaced them with a simple diagram showing '128.119.245.12' connected to 'Port 80'.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "IP Address and Port Bullet Points Removed",
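        "description": "Verifies that neither the 'IP address: 128.119.245.12' bullet point nor the 'Port number: 80' bullet point remains on slide 8; the same text appearing inside a diagram or figure is not counted as a bullet point.",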
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # We'll use a VLM to check the screenshot for a diagram and the correct text labels\n    # Find slide 8 screenshot\n    images = [s.image_path for s in modified_ppt_screenshots if s.slide_number == 8]\n    if not images:\n        return (\"No screenshot available for slide 8.\", 0.0)\n    prompt = (\n        \"Does this slide contain a bullet point with the text 'IP address: 128.119.245.12' or 'Port number: 80'? \"\n        \"Only check for bullet points. If this text appears as part of a diagram or figure, ignore it. \"\n        \"Answer only YES or NO.\"\n    )\n    response = vlm_call(prompt, images)\n    if \"no\" in response.lower():\n        return (\"Neither IP nor Port bullet points present in slide 8.\", 1.0)\n    else:\n        return (\"IP address or Port number bullet point still present.\", 0.0)\n"
        },
        "score": 1.0
      },
      {
        "name": "Diagram Presence and Content",
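        "description": "Verifies that slide 8 contains a simple diagram showing '128.119.245.12' connected to 'Port 80' with clear text labels for both; full credit requires the diagram to sit directly under the 'gaia.cs.umass.edu' text, and partial credit is given when it is placed elsewhere on the slide.",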
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # We'll use a VLM to check the screenshot for a diagram and its position in a single call.\n    # Find slide 8 screenshot\n    images = [s.image_path for s in modified_ppt_screenshots if s.slide_number == 8]\n    if not images:\n        return (\"No screenshot available for slide 8.\", 0.0)\n\n    prompt = (\n        \"Analyze the slide for a simple diagram (such as a box/ellipse/arrow/line) showing the IP address 128.119.245.12 connected to Port 80, with clear text labels for both. \"\n        \"Based on its presence and location, answer with ONLY one of the following words: 'CORRECT', 'WRONG_POSITION', or 'NONE'.\\n\"\n        \"- 'CORRECT': The diagram exists and is located directly under the text 'gaia.cs.umass.edu'.\\n\"\n        \"- 'WRONG_POSITION': The diagram exists but is NOT located directly under the bullet point containing 'gaia.cs.umass.edu'.\\n\"\n        \"- 'NONE': The diagram does not exist at all.\"\n    )\n    response = vlm_call(prompt, images).lower()\n\n    if \"correct\" in response:\n        return (\"Diagram with correct labels detected under 'gaia.cs.umass.edu' on slide 8.\", 1.0)\n    elif \"wrong_position\" in response:\n        return (\"Diagram with correct labels detected on slide 8, but not in the correct position.\", 0.7)\n    else:  # \"none\" or anything else\n        return (\"No correct diagram detected on slide 8.\", 0.0)\n"
        },
        "score": 1.0
      },
      {
        "name": "No extraneous changes to other slides or features",
        "description": "Checks that no unrelated changes were made: text formatting on slides other than slide 8 is preserved, and no animations or transitions were added, removed, or modified anywhere in the presentation.",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "No extraneous text/font changes on other slides",
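            "description": "Checks that the font sizes of text runs on slides other than slide 8 were not changed; runs are matched by their text, so text that was removed or reworded is not flagged by this check.",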
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    orig = Presentation(original_ppt_path)\n    mod = Presentation(modified_ppt_path)\n    # For each slide except slide 8\n    for slide_idx in range(len(orig.slides)):\n        if slide_idx == 7:\n            continue\n        orig_slide = orig.slides[slide_idx]\n        mod_slide = mod.slides[slide_idx] if slide_idx < len(mod.slides) else None\n        if not mod_slide:\n            continue  # Slide might be deleted, not expected for this task\n        def extract_text_fontsize(shape):\n            data = []\n            if not shape.has_text_frame:\n                return data\n            for para in shape.text_frame.paragraphs:\n                for run in para.runs:\n                    if run.text.strip():\n                        size = run.font.size.pt if run.font.size else None\n                        data.append((run.text.strip(), size))\n            return data\n        def get_slide_text_fontsize(slide):\n            out = []\n            for shape in slide.shapes:\n                out.extend(extract_text_fontsize(shape))\n            return out\n        orig_text_sizes = get_slide_text_fontsize(orig_slide)\n        mod_text_sizes = get_slide_text_fontsize(mod_slide)\n        orig_map = {}\n        for t, sz in orig_text_sizes:\n            orig_map.setdefault(t, []).append(sz)\n        mod_map = {}\n        for t, sz in mod_text_sizes:\n            mod_map.setdefault(t, []).append(sz)\n        # For every text, check font sizes unchanged\n        for text in orig_map:\n            if text not in mod_map:\n                continue\n            if orig_map[text] != mod_map[text]:\n                return (f\"Font size for text '{text}' changed on slide {slide_idx+1}.\", 0.0)\n    return (\"No text/font changes on other slides.\", 1.0)\n"
            },
            "score": 1.0
          },
          {
            "name": "No extraneous changes to animations or transitions on any slide",
            "description": "Checks that no animation or transition was added, removed, or modified in the presentation.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    # Check ppt_diff for any animation/transition changes\n    if ppt_diff.added_animations or ppt_diff.removed_animations or ppt_diff.modified_animations:\n        return (\"Extraneous animation change(s) detected.\", 0.0)\n    if ppt_diff.added_transitions or ppt_diff.removed_transitions or ppt_diff.modified_transitions:\n        return (\"Extraneous transition change(s) detected.\", 0.0)\n    return (\"No extraneous animation or transition changes.\", 1.0)\n"
            },
            "score": 1.0
          }
        ],
        "score": 1.0
      },
      {
        "name": "No extraneous slides added or removed",
        "description": "Checks that no slides were added or removed in the presentation.",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # Use ppt_diff for slide structure changes\n    if ppt_diff.added_slides:\n        return (\"Slides were added, which is extraneous.\", 0.0)\n    if ppt_diff.removed_slides:\n        return (\"Slides were removed, which is extraneous.\", 0.0)\n    return (\"No extraneous slides added or removed.\", 1.0)\n"
        },
        "score": 1.0
      }
    ],
    "score": 1.0
  },
  "metadata": {
    "task": "Slide 8: Replace the IP address and Port number bullet points with a simple diagram showing IP address (128.119.245.12) connected to Port 80"
  }
}