{
  "root": {
    "name": "Agent inserts a black calculator icon next to the equation on slide 3",
    "description": "Evaluates whether the agent correctly inserted a black calculator icon next to the equation on slide 3, ensuring proper placement, appearance, and avoiding extraneous changes.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "Calculator icon is present on slide 3",
        "description": "Checks that a new icon resembling a calculator has been added to slide 3.",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    from pptx.enum.shapes import MSO_SHAPE_TYPE\n    \n    # Check slide 3 for calculator icon (picture/graphic shapes)\n    ppt = Presentation(modified_ppt_path)\n    if len(ppt.slides) <= 2:\n        return (\"Slide 3 does not exist.\", 0.0)\n    \n    slide = ppt.slides[2]  # Slide 3 (0-indexed)\n    \n    # Look for picture/graphic shapes that could be calculator icons\n    calculator_candidates = []\n    for shape in slide.shapes:\n        # Check for picture shapes\n        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:\n            calculator_candidates.append(shape)\n        # Also check names/alt text for calculator references\n        elif (hasattr(shape, 'name') and 'calc' in shape.name.lower()) or \\\n             (hasattr(shape, 'alternative_text') and shape.alternative_text and 'calc' in shape.alternative_text.lower()):\n            calculator_candidates.append(shape)\n    \n    if calculator_candidates:\n        return f\"Calculator icon found on slide 3 ({len(calculator_candidates)} picture/graphic shape(s) detected).\", 1.0\n    \n    # Fallback: use VLM to check the slide screenshot\n    slide_screenshot = None\n    for s in modified_ppt_screenshots:\n        if s.slide_number == 3:\n            slide_screenshot = s.image_path\n            break\n    \n    if slide_screenshot:\n        prompt = \"Is there a calculator icon visible on this slide? Answer yes or no.\"\n        try:\n            response = vlm_call(prompt, [slide_screenshot], temperature=0.2, max_tokens=20)\n            if \"yes\" in response.lower():\n                return \"Calculator icon detected by visual analysis on slide 3.\", 1.0\n        except:\n            pass\n    \n    return \"No calculator icon detected on slide 3.\", 0.0\n"
        }
      },
      {
        "name": "Calculator icon appearance: black and visually resembles a calculator",
        "description": "Checks that the inserted icon is predominantly black and actually visually represents a calculator (not a different kind of icon).",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    from pptx.enum.shapes import MSO_SHAPE_TYPE\n    \n    # Check slide 3 for calculator icon appearance\n    ppt = Presentation(modified_ppt_path)\n    if len(ppt.slides) <= 2:\n        return (\"Slide 3 does not exist.\", 0.0)\n    \n    slide = ppt.slides[2]  # Slide 3 (0-indexed)\n    \n    # Look for picture shapes that could be calculator icons\n    calculator_candidates = []\n    for shape in slide.shapes:\n        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:\n            calculator_candidates.append(shape)\n    \n    # Since color detection from shape properties is complex, use VLM for appearance check\n    slide_screenshot = None\n    for s in modified_ppt_screenshots:\n        if s.slide_number == 3:\n            slide_screenshot = s.image_path\n            break\n    \n    if slide_screenshot:\n        prompt = \"Is there a black calculator icon visible on this slide? Please answer yes only if you can see a calculator icon that appears to be black in color.\"\n        try:\n            response = vlm_call(prompt, [slide_screenshot], temperature=0.25, max_tokens=10)\n            if \"yes\" in response.lower():\n                return \"VLM confirms black calculator icon appearance.\", 1.0\n            else:\n                return f\"Calculator icon appearance not confirmed as black (VLM response: {response}).\", 0.6\n        except Exception as e:\n            pass\n    \n    # Fallback: if we found calculator candidates, give partial credit\n    if calculator_candidates:\n        return f\"Calculator icon found but cannot verify color/appearance ({len(calculator_candidates)} candidate(s)).\", 0.7\n    \n    return \"No calculator icon detected for appearance verification.\", 0.0\n"
        }
      },
      {
        "name": "Icon is positioned next to the equation on slide 3",
        "description": "Checks that the black calculator icon is placed near (to either side of) the equation on slide 3, and not far away or overlapping other elements.",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    # Can't localize equation shape programmatically, so fallback to vision model\n    slide_img = None\n    for s in modified_ppt_screenshots:\n        if s.slide_number == 3:\n            slide_img = s.image_path\n            break\n    if not slide_img:\n        return (\"No slide 3 screenshot for position check.\", 0.0)\n    prompt = (\n        \"Is the black calculator icon placed either on the left or right of the mathematical equation (within 1 cm) on the slide? \"\n        \"If so, answer yes. Otherwise, answer no.\"\n    )\n    resp = vlm_call(prompt, [slide_img], temperature=0.25, max_tokens=10)\n    if \"yes\" in resp.lower():\n        return (\"VLM confirms correct placement of calculator icon.\", 1.0)\n    return (\"Calculator icon not positioned next to equation.\", 0.0)\n"
        }
      },
      {
        "name": "No extraneous content changes",
        "description": "Ensures that no unrelated icons, images, or major modifications were made on slide 3 or other slides, apart from inserting the calculator icon.",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    \n    # Only the calculator icon should be added, no other significant extras\n    ppt = Presentation(modified_ppt_path)\n    ppt_orig = Presentation(original_ppt_path)\n    \n    # 1. Check number of shapes added to slide 3 (should be just one for calculator)\n    if len(ppt.slides) <= 2 or len(ppt_orig.slides) <= 2:\n        return (\"Slide 3 missing in one of the presentations.\", 0.0)\n        \n    slide3 = ppt.slides[2]\n    orig_slide3 = ppt_orig.slides[2]\n    added_shapes = len(slide3.shapes) - len(orig_slide3.shapes)\n    \n    if added_shapes > 1:\n        return f\"Too many shapes added to slide 3 (expected 1 calculator icon, got {added_shapes} additions).\", 0.0\n    elif added_shapes == 0:\n        return \"No shapes were added to slide 3 (calculator icon missing).\", 0.0\n    \n    # 2. Check for changes to other slides (should be minimal)\n    slides_with_changes = 0\n    for i in range(len(ppt.slides)):\n        if i == 2:  # Skip slide 3 as we expect changes there\n            continue\n        if i < len(ppt_orig.slides):\n            if len(ppt.slides[i].shapes) != len(ppt_orig.slides[i].shapes):\n                slides_with_changes += 1\n    \n    if slides_with_changes > 0:\n        return f\"Unexpected changes detected on {slides_with_changes} other slide(s).\", 0.2\n    \n    # 3. Visual check for extraneous content on slide 3\n    slide_screenshot = None\n    for s in modified_ppt_screenshots:\n        if s.slide_number == 3:\n            slide_screenshot = s.image_path\n            break\n    \n    if slide_screenshot:\n        prompt = \"Looking at this slide, is there only one new calculator icon added, or are there other new unrelated icons, images, or content? Answer 'calculator only' if just a calculator was added, or 'other content' if there are additional unrelated items.\"\n        try:\n            response = vlm_call(prompt, [slide_screenshot], temperature=0.2, max_tokens=15)\n            if \"calculator only\" in response.lower() or (\"calculator\" in response.lower() and \"only\" in response.lower()):\n                return \"Visual check confirms only calculator icon was added.\", 1.0\n            elif \"other content\" in response.lower():\n                return f\"Visual check detects additional content beyond calculator icon.\", 0.3\n        except:\n            pass\n    \n    # Default: if basic shape count check passed, give benefit of doubt\n    return \"No major extraneous content detected (one shape added to slide 3 as expected).\", 0.9\n"
        }
      }
    ]
  },
  "metadata": {
    "task": "Insert a black calculator icon next to the equation on slide 3"
  }
}