{
  "root": {
    "name": "Task Completion: Make fourth and fifth items in numbered list on slide 12 consistent with first three",
    "description": "Evaluates whether the agent successfully made the fourth and fifth items in the numbered list on slide 12 look consistent with the first three items, without making unwanted changes elsewhere",
    "is_critical": false,
    "metadata": {},
    "children": [
      {
        "name": "Attempt Detection",
        "description": "Critical check to verify that some attempt was made to modify the fourth and fifth items in the numbered list on slide 12",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Critical gate: detect whether any attempt was made to restyle the numbered list on slide 12.\n\n    Two signals are combined: a VLM comparison of the before/after screenshots of\n    slide 12 and the structural diff in ppt_diff.modified_slides. A positive answer\n    from either signal counts as an attempt (score 1.0); otherwise the score is 0.0.\n    \"\"\"\n    import re\n    from pptx import Presentation\n\n    def find_slide_12(screenshots):\n        # First screenshot whose slide_number is 12, or None when there is none.\n        return next((shot for shot in screenshots if shot.slide_number == 12), None)\n\n    try:\n        # The decks are opened only to confirm that slide 12 exists in both.\n        original_ppt = Presentation(original_ppt_path)\n        modified_ppt = Presentation(modified_ppt_path)\n        if len(original_ppt.slides) < 12 or len(modified_ppt.slides) < 12:\n            return \"Slide 12 not found in one or both presentations\", 0.0\n\n        # Structural signal: does the diff list slide 12 among the modified slides?\n        slide_12_modified = any(\n            info[1].slide_number == 12 for info in ppt_diff.modified_slides\n        )\n\n        original_slide_12 = find_slide_12(original_ppt_screenshots)\n        modified_slide_12 = find_slide_12(modified_ppt_screenshots)\n\n        # Visual signal: ask the VLM to compare the two renderings of slide 12.\n        if original_slide_12 and modified_slide_12:\n            prompt = \"\"\"Compare these two versions of slide 12. Look specifically for any changes to numbered list items, particularly the 4th and 5th items. \n            \n            Has there been any attempt to modify the formatting, styling, or appearance of items in a numbered list on this slide?\n            \n            Answer with 'YES' if you can see any changes that appear to be formatting modifications to list items, or 'NO' if the slides look identical.\"\"\"\n\n            try:\n                response = vlm_call(prompt, [original_slide_12.image_path, modified_slide_12.image_path], temperature=0.1)\n            except Exception:\n                # A VLM failure alone must not fail this critical gate; rely on the diff below.\n                response = None\n\n            if response is not None:\n                # Whole-word match so that words merely containing 'yes' do not count.\n                if re.search(r'\\bYES\\b', response.upper()):\n                    return \"Visual changes detected on slide 12, indicating an attempt was made\", 1.0\n                # Subtle edits can be invisible in a screenshot, so only conclude\n                # 'no attempt' when the structural diff agrees.\n                if not slide_12_modified:\n                    return \"No visual changes detected on slide 12, no attempt appears to have been made\", 0.0\n\n        # Fallback: no usable VLM verdict, or the VLM saw nothing but the diff did.\n        if slide_12_modified:\n            return \"Slide 12 was modified according to diff analysis\", 1.0\n        return \"No modifications detected to slide 12\", 0.0\n\n    except Exception as e:\n        return f\"Error analyzing presentations: {str(e)}\", 0.0\n"
        }
      },
      {
        "name": "Consistency Achievement",
        "description": "Evaluates how well the fourth and fifth items in the numbered list were made consistent with the first three items using visual assessment",
        "is_critical": false,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Rate from 0.0 to 1.0 how consistent list items 4-5 on slide 12 look with items 1-3.\n\n    A VLM inspects the screenshot of the modified slide 12 and is asked to finish its\n    answer with 'SCORE: X.X'. When no numeric score can be parsed, a keyword heuristic\n    over the response text supplies an approximate score instead.\n    \"\"\"\n    import re\n\n    # Get slide 12 screenshot\n    slide_12_screenshot = next(\n        (shot for shot in modified_ppt_screenshots if shot.slide_number == 12), None\n    )\n    if slide_12_screenshot is None:\n        return \"Slide 12 not found in screenshots\", 0.0\n\n    prompt = \"\"\"Look at this PowerPoint slide and examine the numbered list items. \n    \nI need you to evaluate whether items 4 and 5 in the numbered list look visually consistent with items 1, 2, and 3 in terms of:\n    - Font formatting (size, style, color, bold/italic)\n    - Text alignment and indentation\n    - Bullet/numbering style\n    - Spacing and layout\n    - Any other visual formatting elements\n    \nPlease provide a detailed analysis of the consistency and rate it on a scale of 0-1 where:\n    - 1.0 = Items 4 and 5 are completely consistent with items 1-3\n    - 0.8-0.9 = Very consistent with only minor differences\n    - 0.6-0.7 = Mostly consistent with some noticeable differences\n    - 0.4-0.5 = Somewhat consistent but several formatting differences\n    - 0.2-0.3 = Inconsistent with major formatting differences\n    - 0.0-0.1 = Completely inconsistent or items not found\n    \nRespond with your analysis followed by 'SCORE: X.X' where X.X is your numerical rating.\"\"\"\n\n    try:\n        response = vlm_call(prompt, [slide_12_screenshot.image_path], temperature=0.3)\n    except Exception as e:\n        # Report the failure as a zero score instead of letting it escape the scorer.\n        return f\"Error during VLM assessment: {str(e)}\", 0.0\n\n    # Use the last 'SCORE: <number>' in the response. The pattern tolerates markdown\n    # emphasis ('**SCORE:** 0.8'), a trailing period, and an explicit scale such as\n    # '0.8/1.0' or '8/10', all of which break a plain float() parse of the text\n    # after the colon.\n    found = re.findall(r'SCORE\\s*:\\s*\\**\\s*(\\d*\\.?\\d+)(?:\\s*/\\s*(\\d*\\.?\\d+))?', response, flags=re.IGNORECASE)\n    if found:\n        value_text, scale_text = found[-1]\n        score = float(value_text)\n        if scale_text and float(scale_text) > 0:\n            # 'N/M' form: rescale to the 0-1 range.\n            score = score / float(scale_text)\n        score = max(0.0, min(1.0, score))  # Clamp between 0 and 1\n        return f\"VLM assessment: {response}\", score\n\n    # Fallback scoring based on keywords in response\n    response_lower = response.lower()\n    if 'completely consistent' in response_lower or 'fully consistent' in response_lower:\n        return f\"VLM assessment indicates high consistency: {response}\", 0.9\n    elif 'very consistent' in response_lower or 'mostly consistent' in response_lower:\n        return f\"VLM assessment indicates good consistency: {response}\", 0.7\n    elif 'somewhat consistent' in response_lower:\n        return f\"VLM assessment indicates partial consistency: {response}\", 0.5\n    elif 'inconsistent' in response_lower:\n        return f\"VLM assessment indicates inconsistency: {response}\", 0.2\n    else:\n        return f\"VLM assessment unclear: {response}\", 0.5\n"
        }
      },
      {
        "name": "No Extraneous Changes",
        "description": "Verifies that no unwanted changes were made to other slides or to text content on slide 12",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "No Changes to Other Slides",
            "description": "Ensures that slides other than slide 12 were not modified",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Give full credit only when no slide other than slide 12 was modified, added, or removed.\"\"\"\n    # Slide numbers of every modified slide except the target slide 12.\n    touched_elsewhere = [\n        entry[1].slide_number\n        for entry in ppt_diff.modified_slides\n        if entry[1].slide_number != 12\n    ]\n\n    # Adding or removing slides is reported first, ahead of per-slide modifications.\n    added_count = len(ppt_diff.added_slides)\n    removed_count = len(ppt_diff.removed_slides)\n    if added_count > 0 or removed_count > 0:\n        return f\"Slides were added ({added_count}) or removed ({removed_count})\", 0.0\n\n    if touched_elsewhere:\n        return f\"Other slides were modified: {touched_elsewhere}\", 0.0\n\n    return \"No changes detected to slides other than slide 12\", 1.0\n"
            }
          },
          {
            "name": "No Text Content Changes on Slide 12",
            "description": "Ensures that only formatting/styling was changed on slide 12, not the actual text content",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    \"\"\"Check that the text content of slide 12 is unchanged (formatting-only edit).\n\n    Text is gathered from every shape on slide 12 of both decks, including shapes\n    nested in groups and table cells, whitespace-normalized, and compared as sorted\n    lists so that shape order does not matter. Returns 1.0 when the text matches,\n    0.0 when it differs or slide 12 is missing, and 0.5 when the comparison errors.\n    \"\"\"\n    from pptx import Presentation\n\n    def collect_texts(shapes):\n        # Gather raw text from shapes, descending into groups and tables.\n        texts = []\n        for shape in shapes:\n            if hasattr(shape, 'shapes'):\n                # Group shape: the text lives in its member shapes.\n                texts.extend(collect_texts(shape.shapes))\n            elif getattr(shape, 'has_table', False):\n                # Table: text is stored per cell, not on the graphic frame itself.\n                for row in shape.table.rows:\n                    for cell in row.cells:\n                        texts.append(cell.text)\n            elif hasattr(shape, 'text'):\n                texts.append(shape.text)\n        return texts\n\n    def normalize(texts):\n        # Collapse whitespace runs, drop empty entries, and sort.\n        collapsed = (' '.join(text.split()) for text in texts)\n        return sorted(text for text in collapsed if text)\n\n    try:\n        original_ppt = Presentation(original_ppt_path)\n        modified_ppt = Presentation(modified_ppt_path)\n\n        if len(original_ppt.slides) < 12 or len(modified_ppt.slides) < 12:\n            return \"Slide 12 not found in one or both presentations\", 0.0\n\n        # Slide 12 is index 11.\n        original_normalized = normalize(collect_texts(original_ppt.slides[11].shapes))\n        modified_normalized = normalize(collect_texts(modified_ppt.slides[11].shapes))\n\n        if original_normalized == modified_normalized:\n            return \"Text content on slide 12 remains unchanged\", 1.0\n        else:\n            return f\"Text content changed on slide 12. Original: {len(original_normalized)} items, Modified: {len(modified_normalized)} items\", 0.0\n\n    except Exception as e:\n        return f\"Error comparing text content: {str(e)}\", 0.5\n"
            }
          }
        ]
      }
    ]
  },
  "metadata": {
    "task": "On slide 12, make the fourth and fifth items in the numbered list look consistent with the first three. Context for rubric design: 1. Critical node, checking if at least some attempt seems to have been made to make the fourth and fifth items in the numbered list look consistent with the first three. 2. Non-critical node, use VLM to check with partial grading to check that the fourth and fifth items in the numbered list look consistent with the first three. 3. Non-critical, make sure no changes to other slides or text content changes to slide 12."
  }
}