{
  "root": {
    "name": "Change circular bullets to numbered list on final slide",
    "description": "Evaluates whether the agent successfully changed the circular bullets on the final slide to numbered list items, without making unnecessary changes to other slides or content.",
    "is_critical": false,
    "metadata": {},
    "children": [
      {
        "name": "Bullet format changed from circular to numbered",
        "description": "Verifies that the bullet points on the final slide have been changed from circular bullets to numbered list format.",
        "is_critical": true,
        "metadata": {},
        "scorer": {
          "type": "function",
          "function_code": "def compute_score() -> tuple[str, float]:\n    try:\n        original_final = original_ppt_screenshots[-1]\n        modified_final = modified_ppt_screenshots[-1]\n        \n        prompt = '''Compare these two PowerPoint slides. The task was to change the circular bullets on the final slide to numbered list items.\n        \n        Look at both images and determine:\n        1. Are there any circular bullets in the first image (original)?\n        2. Have these circular bullets been changed to numbered list items in the second image (modified)?\n\n        Respond with either \"SUCCESS\" if the circular bullets were successfully changed to numbered list items, or \"FAILURE\" followed by a brief explanation of what you observe.'''\n\n        result = vlm_call(prompt, [original_final.image_path, modified_final.image_path], temperature=0.1)\n        \n        if \"SUCCESS\" in result:\n            return result, 1.0\n        else:\n            return f\"Bullet format change not detected: {result}\", 0.0\n    except Exception as e:\n        return f\"Error analyzing slide images: {str(e)}\", 0.0\n"
        }
      },
      {
        "name": "No extraneous changes made",
        "description": "Ensures that only the required changes were made and no unnecessary modifications were applied to other slides or content.",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "Other slides unchanged",
            "description": "Verifies that slides other than the final slide were not modified unnecessarily.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    \n    try:\n        # Check if any non-final slides were modified\n        total_slides = len(Presentation(modified_ppt_path).slides)\n        \n        extraneous_changes = 0\n        for old_slide, new_slide in ppt_diff.modified_slides:\n            if new_slide.slide_number != total_slides:  # Not the final slide\n                extraneous_changes += 1\n        \n        # Also check for added/removed slides\n        slides_added = len(ppt_diff.added_slides)\n        slides_removed = len(ppt_diff.removed_slides)\n        \n        if extraneous_changes == 0 and slides_added == 0 and slides_removed == 0:\n            return \"No extraneous changes to other slides\", 1.0\n        elif extraneous_changes > 0:\n            return f\"Extraneous changes detected on {extraneous_changes} non-final slides\", 0.0\n        else:\n            return f\"Slides added: {slides_added}, removed: {slides_removed}\", 0.0\n            \n    except Exception as e:\n        return f\"Error checking for extraneous changes: {str(e)}\", 0.5\n"
            }
          },
          {
            "name": "No unnecessary animations or transitions added",
            "description": "Ensures that no new animations or transitions were added during the bullet format change.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    try:\n        animations_added = len(ppt_diff.added_animations)\n        transitions_added = len(ppt_diff.added_transitions)\n        \n        if animations_added == 0 and transitions_added == 0:\n            return \"No unnecessary animations or transitions added\", 1.0\n        else:\n            return f\"Unnecessary additions: {animations_added} animations, {transitions_added} transitions\", 0.0\n            \n    except Exception as e:\n        return f\"Error checking animations/transitions: {str(e)}\", 0.5\n"
            }
          },
          {
            "name": "No other changes made to final slide",
            "description": "Ensures that no extraneous changes were made to the final slide.",
            "is_critical": false,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    try:\n        original_final = original_ppt_screenshots[-1]\n        modified_final = modified_ppt_screenshots[-1]\n        \n        prompt = '''Compare these two PowerPoint slides. The task was to change the circular bullets on the final slide to numbered list items.\n    \n        Look at both images and determine:\n        1. GOOD: If only the circular bullets have been changed to numbered list items.\n        2. PARTIAL: If both the circular and hyphenated bullets have been changed to numbered list items.\n        3. FAILURE: If other changes have been made to the slide.\n        Respond with either \"GOOD\", \"PARTIAL\", or \"FAILURE\" followed by a brief explanation of what you observe.'''\n\n        result = vlm_call(prompt, [original_final.image_path, modified_final.image_path], temperature=0.1)\n        \n        if \"GOOD\" in result:\n            return result, 1.0\n        elif \"PARTIAL\" in result:\n            return result, 0.5\n        else:\n            return f\"Other changes made to final slide: {result}\", 0.0\n    except Exception as e:\n        return f\"Error analyzing slide images: {str(e)}\", 0.0\n"
            }
          }
        ]
      },
      {
        "name": "Content preservation",
        "description": "Verifies that the text content and overall structure of the final slide were preserved during the bullet format change.",
        "is_critical": false,
        "metadata": {},
        "children": [
          {
            "name": "Text content preserved",
            "description": "Ensures that the actual text content of the bullet points was not altered, only the formatting.",
            "is_critical": true,
            "metadata": {},
            "scorer": {
              "type": "function",
              "function_code": "def compute_score() -> tuple[str, float]:\n    from pptx import Presentation\n    import re\n    \n    try:\n        # Load both presentations\n        original_prs = Presentation(original_ppt_path)\n        modified_prs = Presentation(modified_ppt_path)\n        \n        original_final = original_prs.slides[-1]\n        modified_final = modified_prs.slides[-1]\n        \n        # Extract text content from both slides\n        def extract_text_content(slide):\n            texts = []\n            for shape in slide.shapes:\n                if hasattr(shape, 'text_frame') and shape.text_frame:\n                    for paragraph in shape.text_frame.paragraphs:\n                        text = paragraph.text.strip()\n                        if text:\n                            # Remove bullet characters and numbering for comparison\n                            clean_text = re.sub(r'^[•●○]\\s*|^\\d+\\.\\s*', '', text).strip()\n                            if clean_text:\n                                texts.append(clean_text)\n            return texts\n        \n        original_texts = extract_text_content(original_final)\n        modified_texts = extract_text_content(modified_final)\n        \n        # Compare text content\n        if len(original_texts) != len(modified_texts):\n            return f\"Text count mismatch: {len(original_texts)} vs {len(modified_texts)}\", 0.0\n        \n        matching_texts = 0\n        for orig, mod in zip(original_texts, modified_texts):\n            if orig.lower().strip() == mod.lower().strip():\n                matching_texts += 1\n        \n        if matching_texts == len(original_texts) and len(original_texts) > 0:\n            return \"All text content preserved\", 1.0\n        elif matching_texts > 0:\n            score = matching_texts / len(original_texts)\n            return f\"Partial text preservation: {matching_texts}/{len(original_texts)} texts match\", score\n        else:\n            return \"Text content not preserved\", 0.0\n            \n    except Exception as e:\n        return f\"Error checking text preservation: {str(e)}\", 0.0\n"
            }
          }
        ]
      }
    ]
  },
  "metadata": {
    "task": "Change the circular bullets on the final slide to numbered list items."
  }
}