{
  "root": {
    "name": "Format Last Three Bullets on Slide 2 as Numbered List",
    "description": "Evaluate if, on slide 2, the last three bullets describing types of worms are formatted together as a numbered list, with no extraneous changes.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "Bullets Detailing Types of Worms Are Detected and Numbered",
        "description": "Checks that the three correct bullets are detected and now form a single numbered list on slide 2.",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    mod_img = None\n    for s in modified_ppt_screenshots:\n        if s.slide_number == 2:\n            mod_img = s.image_path\n            break\n    if not mod_img:\n        return (\"Slide 2 screenshot missing in modified.\", 0.0)\n    \n    prompt = (\n        \"Carefully analyize this image of a PowerPoint slide. \"\n        \"Is it the case that the paragraphs with keywords \\\"flatworms\\\", \\\"segmented worms\\\", and \\\"roundworms\\\" are part of a numbered list? \"\n        \"If the paragraphs in question are part of a numbered list, answer YES. Otherwise (say, for example, they make up an unordered bulleted list), answer NO and briefly describe the issue.\"\n    )\n    vlm_resp = vlm_call(prompt, [mod_img], temperature=0.0, max_tokens=128).strip().lower()\n    if vlm_resp.startswith('yes'):\n        return (\"The relevant paragraphs make up a numbered list, as required.\", 1.0)\n    else:\n        return (f\"The relevant paragraphs DO NOT make up a numbered list: {vlm_resp}\", 0.0)\n"
        },
        "score": 1.0
      },
      {
        "name": "No Extraneous Changes to Slide Content",
        "description": "Checks that no other content on slide 2 has been altered (besides the bullet formatting).",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # Use screenshots for visual diff since text/shape-level diffs are not fully reliable\n    base_img = next((s.image_path for s in original_ppt_screenshots if s.slide_number == 2), None)\n    mod_img = next((s.image_path for s in modified_ppt_screenshots if s.slide_number == 2), None)\n    if not base_img or not mod_img:\n        return (\"Missing slide screenshots.\", 0.0)\n    prompt = (\n        \"Compare these two images of the same PowerPoint slide. Ignore any difference bullet types/styles (e.g. whether the bullet styles are circle bullets, square bullets, numbers, etc.). \"\n        \"Are there *any* other visible changes to the slide's content, layout, or appearance (such as text, images, shapes, or formatting)? \"\n        \"If there are no other visible changes except bullet style, answer YES. Otherwise, answer NO and briefly describe the differences.\"\n    )\n    vlm_resp = vlm_call(prompt, [base_img, mod_img], temperature=0.0, max_tokens=128).strip().lower()\n    if vlm_resp.startswith('yes'):\n        return (\"No extraneous visual changes to Slide 2 besides bullet style.\", 1.0)\n    else:\n        return (f\"Visual difference(s) detected on Slide 2 besides bullet style: {vlm_resp}\", 0.0)"
        },
        "score": 1.0
      },
      {
        "name": "No Extraneous Changes to Other Slides",
        "description": "Verifies that slides other than slide 2 remain unchanged.",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    from PIL import Image, ImageChops\n    import numpy as np\n    ok_count = 0\n    total = 0\n    for orig, mod in zip(original_ppt_screenshots, modified_ppt_screenshots):\n        if orig.slide_number == 2:\n            continue\n        total += 1\n        img1 = Image.open(orig.image_path).convert('RGB')\n        img2 = Image.open(mod.image_path).convert('RGB')\n        diff = ImageChops.difference(img1, img2)\n        arr = np.array(diff)\n        changed_pixels = np.sum(np.any(arr > 40, axis=-1))\n        total_pixels = arr.shape[0]*arr.shape[1]\n        pct_change = changed_pixels / total_pixels\n        if pct_change <= 0.01:\n            ok_count += 1\n    if total == 0:\n        return (\"No other slides to check.\", 1.0)\n    score = ok_count / total\n    if score < 1.0:\n        return (f\"Some other slides were modified.\", score)\n    return (\"No extraneous changes to other slides.\", 1.0)\n"
        },
        "score": 1.0
      }
    ],
    "score": 1.0
  },
  "metadata": {
    "task": "In slide 2, format the last three bullets which detail the types of worms as a numbered list"
  }
}