{
  "root": {
    "name": "Resize historical document image on slide 4 to be 20% smaller while maintaining aspect ratio",
    "description": "Evaluates whether the agent successfully resized the correct image on slide 4 to be 20% smaller, preserved its aspect ratio, and avoided extraneous changes.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "Resizing accuracy",
        "description": "Checks that the image was resized to be 20% smaller in both width and height, and that the aspect ratio was maintained.",
        "is_critical": true,
        "metadata": {},
        "children": [
          {
            "name": "Image is 20% smaller",
            "description": "Checks that the image's width and height after modification are each approximately 80% of their original values.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Score whether a picture on slide 4 was resized to ~80% of its original size.\n\n    Reads the framework-injected globals `original_ppt_path` and\n    `modified_ppt_path`, collects the top-level shapes on slide 4 (index 3)\n    that are pictures or expose an `.image` attribute, and compares their\n    width/height between the two decks.\n\n    Scoring for one (original, modified) size pair:\n    - 1.0 when width and height each differ from 80% of the original\n      dimension by less than 5% of that original dimension.\n    - Otherwise the score decays linearly with the larger of the two\n      errors, from 1.0 at zero error to 0.0 at 10% or more (so it is at\n      most 0.5 outside the tolerance band).\n\n    With exactly one picture on each slide that pair is scored directly.\n    Otherwise every original/modified pairing whose sizes differ is scored\n    (by size only, not by shape identity) and the best score wins.\n\n    Returns:\n        (message, score) with score in [0.0, 1.0].\n    \"\"\"\n    from pptx import Presentation\n    from pptx.enum.shapes import MSO_SHAPE_TYPE\n\n    # Expect these to be injected by the evaluation framework.\n    try:\n        orig_path = original_ppt_path  # type: ignore[name-defined]\n        mod_path = modified_ppt_path  # type: ignore[name-defined]\n    except NameError:\n        return \"Internal error: required global paths not provided.\", 0.0\n\n    # An unreadable deck is reported as a failed check instead of raising.\n    try:\n        prs_orig = Presentation(orig_path)\n        prs_mod = Presentation(mod_path)\n    except Exception:\n        return \"Failed to open one of the PPT files.\", 0.0\n\n    # Slide 4 (0-based index 3)\n    try:\n        slide_orig = prs_orig.slides[3]\n        slide_mod = prs_mod.slides[3]\n    except IndexError:\n        return \"Slide 4 not found in one of the presentations.\", 0.0\n\n    def collect_pictures(slide):\n        \"\"\"Return the top-level shapes of `slide` that are pictures or expose `.image`.\"\"\"\n        pics = []\n        for sh in slide.shapes:\n            try:\n                # PICTURE shapes, plus any other shape carrying an image\n                # (for example a populated picture placeholder).\n                if sh.shape_type == MSO_SHAPE_TYPE.PICTURE or hasattr(sh, \"image\"):\n                    pics.append(sh)\n            except Exception:\n                # A shape whose type or image cannot be read is not counted.\n                continue\n        return pics\n\n    orig_imgs = collect_pictures(slide_orig)\n    mod_imgs = collect_pictures(slide_mod)\n\n    if not orig_imgs and not mod_imgs:\n        return \"No images found on slide 4.\", 0.0\n    if not orig_imgs:\n        return \"No image found on original slide 4 (cannot assess resize).\", 0.0\n    if not mod_imgs:\n        return \"No image found on modified slide 4 (cannot assess resize).\", 0.0\n\n    orig_sizes = [(img.width, img.height) for img in orig_imgs]\n    mod_sizes = [(img.width, img.height) for img in mod_imgs]\n\n    def score_resize(ow, oh, mw, mh):\n        \"\"\"Return (score, message) for resizing (ow, oh) to (mw, mh).\"\"\"\n        if not ow or not oh:\n            # A zero original dimension would divide by zero below.\n            return 0.0, \"Original image has zero width or height; cannot assess resize.\"\n        w_err = abs(mw - ow * 0.8) / ow\n        h_err = abs(mh - oh * 0.8) / oh\n        if w_err < 0.05 and h_err < 0.05:\n            return 1.0, \"Image resized to approximately 80% of original size.\"\n        # Linear decay on the larger error: 0 -> 1.0, 0.1 or more -> 0.0.\n        score = max(0.0, 1 - max(w_err, h_err) / 0.1)\n        return score, f\"Image resizing error: width {mw}/{ow}, height {mh}/{oh}.\"\n\n    if len(orig_sizes) == 1 and len(mod_sizes) == 1:\n        (ow, oh), (mw, mh) = orig_sizes[0], mod_sizes[0]\n        if (ow, oh) == (mw, mh):\n            return \"Image not resized (same dimensions).\", 0.0\n        score, msg = score_resize(ow, oh, mw, mh)\n        return msg, score\n\n    # Multiple images: score every pairing whose size differs, keep the best.\n    best_msg = \"No image resized.\"\n    best_score = 0.0\n    for ow, oh in orig_sizes:\n        for mw, mh in mod_sizes:\n            if (ow, oh) == (mw, mh):\n                continue  # unchanged pair\n            score, msg = score_resize(ow, oh, mw, mh)\n            if score > best_score:\n                best_score, best_msg = score, msg\n\n    return best_msg, best_score\n"
            }
          },
          {
            "name": "Aspect ratio maintained",
            "description": "Checks that the image was resized and that its aspect ratio was not changed.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Score whether a resized picture on slide 4 kept its aspect ratio.\n\n    Reads the framework-injected globals `original_ppt_path` and\n    `modified_ppt_path`, collects the top-level shapes on slide 4 (index 3)\n    that are pictures or expose an `.image` attribute, and compares their\n    width/height ratio between the two decks.\n\n    Scoring for one (original, modified) size pair:\n    - 0.0 when a ratio cannot be computed (zero dimension) or when the size\n      is unchanged, because the criterion is about keeping the ratio during\n      a resize.\n    - 1.0 when the relative aspect-ratio difference is below 2%.\n    - Otherwise linear decay from 1.0 at no difference to 0.0 at a relative\n      difference of 10% or more.\n\n    With exactly one picture on each slide that pair is scored directly.\n    Otherwise every original/modified pairing is scored (by size only, not\n    by shape identity) and the best score wins.\n\n    Returns:\n        (message, score) with score in [0.0, 1.0].\n    \"\"\"\n    from pptx import Presentation\n    from pptx.enum.shapes import MSO_SHAPE_TYPE\n\n    # Expect global variables injected by framework\n    try:\n        orig_path = original_ppt_path  # type: ignore[name-defined]\n        mod_path = modified_ppt_path  # type: ignore[name-defined]\n    except NameError:\n        return \"Internal error: required global paths not provided.\", 0.0\n\n    try:\n        prs_orig = Presentation(orig_path)\n        prs_mod = Presentation(mod_path)\n    except Exception:\n        return \"Failed to open one of the PPT files.\", 0.0\n\n    # Slide 4 is index 3\n    try:\n        slide_orig = prs_orig.slides[3]\n        slide_mod = prs_mod.slides[3]\n    except IndexError:\n        return \"Slide 4 not found in one of the presentations.\", 0.0\n\n    def collect_pictures(slide):\n        \"\"\"Return the top-level shapes of `slide` that are pictures or expose `.image`.\"\"\"\n        pics = []\n        for sh in slide.shapes:\n            try:\n                # PICTURE shapes, plus any other shape carrying an image\n                # (for example a populated picture placeholder).\n                if sh.shape_type == MSO_SHAPE_TYPE.PICTURE or hasattr(sh, \"image\"):\n                    pics.append(sh)\n            except Exception:\n                # A shape whose type or image cannot be read is not counted.\n                continue\n        return pics\n\n    orig_imgs = collect_pictures(slide_orig)\n    mod_imgs = collect_pictures(slide_mod)\n\n    if not orig_imgs and not mod_imgs:\n        return \"No images found on slide 4.\", 0.0\n    if not orig_imgs:\n        return \"No image on original slide 4 (cannot assess aspect ratio).\", 0.0\n    if not mod_imgs:\n        return \"No image on modified slide 4 (cannot assess aspect ratio).\", 0.0\n\n    # Gather sizes\n    orig_sizes = [(img.width, img.height) for img in orig_imgs]\n    mod_sizes = [(img.width, img.height) for img in mod_imgs]\n\n    def assess_pair(ow, oh, mw, mh):\n        \"\"\"Return (score, message) for the aspect-ratio change of one size pair.\"\"\"\n        if oh == 0 or mh == 0:\n            return 0.0, \"Image height is zero, cannot compute aspect ratio.\"\n        if (ow, oh) == (mw, mh):\n            return 0.0, \"Image not resized (aspect ratio criterion not satisfied).\"\n        if ow == 0:\n            # A zero original ratio would divide by zero below.\n            return 0.0, \"Original image width is zero, cannot compute aspect ratio.\"\n        orig_ar = ow / oh\n        mod_ar = mw / mh\n        rel_diff = abs(orig_ar - mod_ar) / orig_ar\n        # 2% tolerance for 'maintained'\n        if rel_diff < 0.02:\n            return 1.0, \"Aspect ratio maintained during resize.\"\n        # Linear decay: rel_diff 0 -> 1.0, rel_diff 0.1 or more -> 0.0.\n        score = max(0.0, 1 - (rel_diff / 0.1))\n        return score, f\"Aspect ratio changed: original {orig_ar:.4f}, modified {mod_ar:.4f}.\"\n\n    # If exactly one image each, just compare\n    if len(orig_sizes) == 1 and len(mod_sizes) == 1:\n        (ow, oh), (mw, mh) = orig_sizes[0], mod_sizes[0]\n        score, msg = assess_pair(ow, oh, mw, mh)\n        return msg, score\n\n    # Multiple images: keep the pairing with the highest score, stopping\n    # early once a pairing earns full credit.\n    best_score = 0.0\n    best_msg = \"No qualifying resized image found (aspect ratio check failed).\"\n    for ow, oh in orig_sizes:\n        for mw, mh in mod_sizes:\n            score, msg = assess_pair(ow, oh, mw, mh)\n            if score > best_score:\n                best_score, best_msg = score, msg\n                if best_score == 1.0:\n                    return best_msg, best_score\n\n    return best_msg, best_score\n"
            }
          }
        ]
      },
      {
        "name": "No extraneous changes made",
        "description": "Checks that no changes unrelated to resizing the historical document image on slide 4 were made.",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "No extraneous changes to other slides",
            "description": "Checks that no elements on slides other than slide 4 were added, removed, or modified.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Score 1.0 when the diff shows no changes outside slide 4, else 0.0.\n\n    Reads the framework-injected global `ppt_diff` and reports:\n    - slides added or removed whose slide number is not 4,\n    - modified slides whose original slide number is not 4,\n    - added/removed animations and transitions whose `slide_id` is set and\n      does not belong to slide 4.\n\n    Slide 4's ids are taken from the diff entries whose slide number is 4.\n    When the diff has no such entry, an animation/transition change cannot\n    be attributed to slide 4 and is reported.\n\n    Returns:\n        (message, score) where score is 1.0 or 0.0.\n    \"\"\"\n    changes = []\n    slide4_ids = []  # slide ids seen for slide 4 anywhere in the diff\n\n    # Check added/removed/modified slides\n    for s in ppt_diff.added_slides + ppt_diff.removed_slides:\n        if s.slide_number != 4:\n            changes.append(f\"Slide {s.slide_number} was added/removed.\")\n        else:\n            slide4_ids.append(s.slide_id)\n    for orig, mod in ppt_diff.modified_slides:\n        if orig.slide_number != 4:\n            changes.append(f\"Slide {orig.slide_number} was modified.\")\n        else:\n            slide4_ids.extend((orig.slide_id, mod.slide_id))\n\n    # Check animations/transitions. Slide 4 is resolved from the ids collected\n    # above rather than from modified_slides[0], which raised IndexError when\n    # no slide was modified and compared against the wrong slide when another\n    # slide was listed first.\n    for a in ppt_diff.added_animations + ppt_diff.removed_animations:\n        if a.slide_id and a.slide_id not in slide4_ids:\n            changes.append(f\"Animation changed on slide {a.slide_id}.\")\n    for t in ppt_diff.added_transitions + ppt_diff.removed_transitions:\n        if t.slide_id and t.slide_id not in slide4_ids:\n            changes.append(f\"Transition changed on slide {t.slide_id}.\")\n\n    if changes:\n        return \"Extraneous changes to other slides detected: \" + \"; \".join(changes), 0.0\n    return \"No extraneous changes to other slides.\", 1.0\n"
            }
          },
          {
            "name": "No extraneous changes to slide 4 elements except the image",
            "description": "Checks that no elements other than the historical document image on slide 4 were added, removed, or modified.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Verify that nothing except the image(s) changed on slide 4.\n\n    Criteria for passing (score=1):\n      - The number of non-image shapes is unchanged.\n      - The number of image shapes is unchanged (no picture added or removed).\n      - Every original non-image shape matches a distinct modified non-image\n        shape with the same shape type, the same whitespace-normalized text\n        and left/top/width/height within a 1000 EMU tolerance.\n      - Size/position changes of image shapes are ignored because image\n        modification is allowed.\n\n    Otherwise score=0 with an explanatory message. Only the top-level shapes\n    of the slide are inspected.\n    \"\"\"\n    from pptx import Presentation\n    from pptx.enum.shapes import MSO_SHAPE_TYPE\n\n    # Expect global variables injected by framework\n    try:\n        orig_path = original_ppt_path  # type: ignore[name-defined]\n        mod_path = modified_ppt_path  # type: ignore[name-defined]\n    except NameError:\n        return \"Internal error: required global paths not provided.\", 0.0\n\n    try:\n        prs_orig = Presentation(orig_path)\n        prs_mod = Presentation(mod_path)\n    except Exception:\n        return \"Failed to open one of the PPT files.\", 0.0\n\n    # Slide 4 index = 3\n    try:\n        slide_orig = prs_orig.slides[3]\n        slide_mod = prs_mod.slides[3]\n    except IndexError:\n        return \"Slide 4 not found in one of the presentations.\", 0.0\n\n    def is_picture(sh):\n        \"\"\"True for PICTURE shapes and any other shape exposing `.image`.\"\"\"\n        try:\n            return sh.shape_type == MSO_SHAPE_TYPE.PICTURE or hasattr(sh, \"image\")\n        except Exception:\n            # A shape whose type or image cannot be read is not a picture.\n            return False\n\n    orig_shapes = list(slide_orig.shapes)\n    mod_shapes = list(slide_mod.shapes)\n\n    # Collect non-image shapes\n    orig_non_imgs = [sh for sh in orig_shapes if not is_picture(sh)]\n    mod_non_imgs = [sh for sh in mod_shapes if not is_picture(sh)]\n\n    # Quick cardinality check\n    if len(orig_non_imgs) != len(mod_non_imgs):\n        return \"Non-image elements on slide 4 were added or removed.\", 0.0\n\n    # Pictures may be resized or moved, but not added or removed.\n    if len(orig_shapes) - len(orig_non_imgs) != len(mod_shapes) - len(mod_non_imgs):\n        return \"Images on slide 4 were added or removed.\", 0.0\n\n    def shape_descriptor(sh):\n        \"\"\"Return the comparable properties of a shape; missing ones are None.\"\"\"\n        try:\n            stype = sh.shape_type\n        except Exception:\n            # The property can raise something other than AttributeError,\n            # which getattr() with a default would not absorb.\n            stype = None\n        # Normalize text (collapse whitespace); if no text attribute use None\n        text = getattr(sh, \"text\", None)\n        if isinstance(text, str):\n            text = \" \".join(text.split())\n        return {\n            \"shape_type\": stype,\n            \"left\": getattr(sh, \"left\", None),\n            \"top\": getattr(sh, \"top\", None),\n            \"width\": getattr(sh, \"width\", None),\n            \"height\": getattr(sh, \"height\", None),\n            \"text\": text,\n        }\n\n    orig_descs = [shape_descriptor(sh) for sh in orig_non_imgs]\n    mod_descs = [shape_descriptor(sh) for sh in mod_non_imgs]\n\n    # Because EMU values can have tiny differences, allow small tolerance.\n    POS_TOL = 1000  # EMU; at 914400 EMU per inch this is roughly 0.001 inch\n    SIZE_TOL = 1000\n\n    def close(a, b, tol):\n        \"\"\"True when a and b are within tol, or equal when not both numeric.\"\"\"\n        if a is None or b is None:\n            return a == b\n        try:\n            return abs(int(a) - int(b)) <= tol\n        except Exception:\n            return a == b\n\n    def unchanged(o, m):\n        \"\"\"True when descriptor m has the same type, geometry and text as o.\"\"\"\n        return (\n            o[\"shape_type\"] == m[\"shape_type\"]\n            and close(o[\"left\"], m[\"left\"], POS_TOL)\n            and close(o[\"top\"], m[\"top\"], POS_TOL)\n            and close(o[\"width\"], m[\"width\"], SIZE_TOL)\n            and close(o[\"height\"], m[\"height\"], SIZE_TOL)\n            and o[\"text\"] == m[\"text\"]\n        )\n\n    # Match each original descriptor with exactly one unused modified\n    # descriptor (order on the slide can differ).\n    used = [False] * len(mod_descs)\n    for o in orig_descs:\n        match_i = next(\n            (i for i, m in enumerate(mod_descs) if not used[i] and unchanged(o, m)),\n            -1,\n        )\n        if match_i == -1:\n            return \"A non-image element was changed (geometry or text).\", 0.0\n        used[match_i] = True\n\n    # All non-image shapes matched unchanged -> pass\n    return \"Only image changes detected; no extraneous modifications to other elements on slide 4.\", 1.0\n"
            }
          }
        ]
      }
    ]
  },
  "metadata": {
    "task": "Resize the historical document image on slide 4 to be 20% smaller while maintaining aspect ratio.",
    "compute_strategy": "default",
    "critical_node_weight": 0.7
  }
}