{
  "root": {
    "name": "Add a bullet point with a fact about worm reproduction in slide 5",
    "description": "Evaluates whether a bullet point containing a fact about worm reproduction was correctly added to slide 5, and that no extraneous or undesired changes were made.",
    "is_critical": true,
    "metadata": {},
    "children": [
      {
        "name": "Bullet point added to slide 5",
        "description": "Checks that a new bullet point was added to slide 5.",
        "is_critical": true,
        "metadata": {},
        "children": [
          {
            "name": "Bullet point exists in slide 5",
            "description": "Verifies that a new bullet point has been added to slide 5.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Return 1.0 when slide 5 of the modified deck gained at least one non-blank paragraph.\n\n    Reads original_ppt_path and modified_ppt_path, which the scoring harness is\n    expected to provide, and returns a (message, score) tuple.\n    \"\"\"\n    from collections import Counter\n    from pptx import Presentation\n\n    orig = Presentation(original_ppt_path)\n    mod = Presentation(modified_ppt_path)\n\n    # Slide 5 is index 4 (0-based).\n    try:\n        orig_slide = orig.slides[4]\n        mod_slide = mod.slides[4]\n    except IndexError:\n        return (\"Slide 5 does not exist in one of the presentations.\", 0.0)\n\n    def get_bullets(slide):\n        \"\"\"Collect the stripped text of every non-blank paragraph on the slide.\"\"\"\n        bullets = []\n        for shape in slide.shapes:\n            if not shape.has_text_frame:\n                continue\n            for paragraph in shape.text_frame.paragraphs:\n                text = paragraph.text.strip()\n                # Blank paragraphs are spacing, not bullet points.\n                if text:\n                    bullets.append(text)\n        return bullets\n\n    # Multiset difference: a paragraph that now occurs more often than before\n    # counts as added, which a plain set difference would miss.\n    new_bullets = Counter(get_bullets(mod_slide)) - Counter(get_bullets(orig_slide))\n\n    if not new_bullets:\n        return (\"No new bullet point added to slide 5.\", 0.0)\n    return (\"A new bullet point is present on slide 5.\", 1.0)\n"
            },
            "score": 1.0
          },
          {
            "name": "Bullet point is not a duplicate",
            "description": "Checks that the new bullet point is not a duplicate of existing bullet points on slide 5.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Return 0.0 when a paragraph added to slide 5 repeats or nearly repeats one kept from the original.\n\n    Reads original_ppt_path and modified_ppt_path, which the scoring harness is\n    expected to provide, and returns a (message, score) tuple. Returns 1.0 when\n    nothing was added, because there is then no duplicate to report.\n    \"\"\"\n    from collections import Counter\n    from pptx import Presentation\n    import difflib\n\n    orig = Presentation(original_ppt_path)\n    mod = Presentation(modified_ppt_path)\n\n    # Slide 5 is index 4 (0-based).\n    try:\n        orig_slide = orig.slides[4]\n        mod_slide = mod.slides[4]\n    except IndexError:\n        return (\"Slide 5 does not exist in one of the presentations.\", 0.0)\n\n    def get_bullets(slide):\n        \"\"\"Collect the stripped text of every non-blank paragraph on the slide.\"\"\"\n        bullets = []\n        for shape in slide.shapes:\n            if not shape.has_text_frame:\n                continue\n            for paragraph in shape.text_frame.paragraphs:\n                text = paragraph.text.strip()\n                # No filter on indent level or leading glyph: top-level bullets\n                # are level 0 and usually have no bullet character in their text.\n                if text:\n                    bullets.append(text)\n        return bullets\n\n    orig_counts = Counter(get_bullets(orig_slide))\n    mod_counts = Counter(get_bullets(mod_slide))\n    # Multiset difference keeps a verbatim copy of an existing paragraph in the\n    # added set; a plain set difference would drop it and hide exact duplicates.\n    added = mod_counts - orig_counts\n    # Compare only against original paragraphs that are still on the slide.\n    kept = orig_counts & mod_counts\n\n    for new_bullet in added:\n        for bullet in kept:\n            # Case-insensitive similarity above 0.9 counts as a duplicate.\n            if difflib.SequenceMatcher(None, new_bullet.lower(), bullet.lower()).ratio() > 0.9:\n                return (f\"Bullet point '{new_bullet}' is a duplicate of '{bullet}'.\", 0.0)\n    return (\"New bullet point is not a duplicate.\", 1.0)\n"
            },
            "score": 1.0
          },
          {
            "name": "Bullet point factuality and relevance",
            "description": "Verifies that the newly added bullet point is a fact and specifically about worm reproduction.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Return 1.0 when the LLM judges a paragraph added to slide 5 to be a fact about worm reproduction.\n\n    Reads original_ppt_path, modified_ppt_path and llm_call, which the scoring\n    harness is expected to provide, and returns a (message, score) tuple.\n    \"\"\"\n    from collections import Counter\n    from pptx import Presentation\n\n    orig = Presentation(original_ppt_path)\n    mod = Presentation(modified_ppt_path)\n\n    # Slide 5 is index 4 (0-based).\n    try:\n        orig_slide = orig.slides[4]\n        mod_slide = mod.slides[4]\n    except IndexError:\n        return (\"Slide 5 does not exist in one of the presentations.\", 0.0)\n\n    def get_bullets(slide):\n        \"\"\"Collect the stripped text of every non-blank paragraph on the slide.\"\"\"\n        bullets = []\n        for shape in slide.shapes:\n            if not shape.has_text_frame:\n                continue\n            for paragraph in shape.text_frame.paragraphs:\n                text = paragraph.text.strip()\n                # Blank paragraphs carry nothing for the LLM to judge.\n                if text:\n                    bullets.append(text)\n        return bullets\n\n    # Counter keeps first-seen order, so bullets are judged in slide order and\n    # the reported bullet is the same on every run (set iteration order is not).\n    new_bullets = list(Counter(get_bullets(mod_slide)) - Counter(get_bullets(orig_slide)))\n    if not new_bullets:\n        return (\"No new bullet to check.\", 0.0)\n\n    # Use LLM to check factuality and relevance\n    for b in new_bullets:\n        prompt = f\"Is the following sentence a factual statement specifically about worm reproduction? Answer yes or no. Bullet: '{b}'\"\n        resp = llm_call(prompt, temperature=0.0, max_tokens=5)\n        # Match \"yes\" as a whole word; a substring test would also accept\n        # replies that merely contain those letters.\n        words = \"\".join(ch if ch.isalpha() else \" \" for ch in resp.lower()).split()\n        if \"yes\" in words:\n            return (f\"Bullet '{b}' is a factual statement about worm reproduction.\", 1.0)\n    return (\"No new bullet is a factual statement about worm reproduction.\", 0.0)\n"
            },
            "score": 1.0
          }
        ],
        "score": 1.0
      },
      {
        "name": "No extraneous changes",
        "description": "Checks that no unrelated modifications were made to the presentation (other than the added bullet point).",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "No unrelated slide content changes",
            "description": "Verifies that no slides were added or removed and that slide 5 is the only modified slide; it does not inspect what changed within slide 5.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Return 1.0 when no slides were added or removed and only slide 5 is reported as modified.\n\n    Reads ppt_diff, which the scoring harness is expected to provide, and\n    returns a (message, score) tuple. Changes inside slide 5 are not inspected.\n    \"\"\"\n    # Check for added/removed slides\n    if ppt_diff.added_slides or ppt_diff.removed_slides:\n        return (\"Slides were added or removed.\", 0.0)\n    # Check for modifications to slides other than slide 5\n    for orig, mod in ppt_diff.modified_slides:\n        if orig.slide_number != 5:\n            return (f\"Slide {orig.slide_number} was modified, but only slide 5 should be modified.\", 0.0)\n    return (\"No unrelated slide content changes.\", 1.0)\n"
            },
            "score": 1.0
          },
          {
            "name": "No changes to animations or transitions",
            "description": "Checks that no animations or transitions were added, removed, or modified in any slide.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Return 1.0 when the diff reports no animation or transition changes.\n\n    Reads ppt_diff, which the scoring harness is expected to provide, and\n    returns a (message, score) tuple.\n    \"\"\"\n    animation_fields = (\"added_animations\", \"removed_animations\", \"modified_animations\")\n    transition_fields = (\"added_transitions\", \"removed_transitions\", \"modified_transitions\")\n\n    # any() over a generator stops at the first non-empty field, so fields are\n    # read in the listed order and only as far as needed.\n    if any(getattr(ppt_diff, field) for field in animation_fields):\n        return (\"Animations were changed.\", 0.0)\n    if any(getattr(ppt_diff, field) for field in transition_fields):\n        return (\"Slide transitions were changed.\", 0.0)\n    return (\"No animation or transition changes.\", 1.0)\n"
            },
            "score": 1.0
          }
        ],
        "score": 1.0
      }
    ],
    "score": 1.0
  },
  "metadata": {
    "task": "Add a bullet point with a fact about worm reproduction in slide 5"
  }
}