[
    {
        "description": "This metric implements input crossover between proxy 4 and proxy 3, incorporating both gradient and activation information. It computes the traditional SVE of feature maps but adds a parallel computation of gradient entropy, capturing both forward feature richness and backward gradient flows. The dual entropy measures are weighted by layer position and combined through multiplication, favoring models with balanced information flow in both directions.",
        "code": "import torch\nimport torch.nn as nn\nimport numpy as np\n\ndef heuristic_8(model, inputs, targets):\n    # Enable gradients for this operation\n    inputs.requires_grad_(True)\n    \n    # Store activation and gradient information\n    activations = []\n    gradients = []\n    \n    # Register hooks for activations\n    activation_hooks = []\n    def activation_hook(module, input, output):\n        activations.append(output)\n    \n    # Register hooks for gradients\n    gradient_hooks = []\n    def gradient_hook(module, grad_input, grad_output):\n        gradients.append(grad_output[0])\n    \n    # Register hooks on convolutional layers\n    for name, module in model.named_modules():\n        if isinstance(module, nn.Conv2d):\n            activation_hooks.append(module.register_forward_hook(activation_hook))\n            gradient_hooks.append(module.register_backward_hook(gradient_hook))\n    \n    # Forward pass\n    outputs = model(inputs)\n    \n    # Create pseudo-loss and backward\n    if targets is None:\n        # If no targets, use argmax as pseudo-targets\n        _, pseudo_targets = outputs.max(1)\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, pseudo_targets)\n    else:\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, targets)\n    \n    loss.backward()\n    \n    # Clean up hooks\n    for hook in activation_hooks:\n        hook.remove()\n    for hook in gradient_hooks:\n        hook.remove()\n    \n    # Compute dual entropy for each layer\n    dual_entropy_scores = []\n    \n    # We might have fewer gradients than activations\n    min_layers = min(len(activations), len(gradients))\n    \n    for i in range(min_layers):\n        act = activations[i]\n        grad = gradients[i]\n        \n        # Skip if dimensions don't match\n        if act.shape != grad.shape:\n            continue\n            \n        batch_size = act.shape[0]\n        channels = act.shape[1]\n        \n        layer_score = 0\n        for b in range(batch_size):\n            # Activation SVE\n            act_entropy = 0\n            try:\n                act_flat = act[b].view(channels, -1)\n                u, s, v = torch.svd(act_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                act_entropy = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Gradient SVE\n            grad_entropy = 0\n            try:\n                grad_flat = grad[b].view(channels, -1)\n                u, s, v = torch.svd(grad_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                grad_entropy = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Combine activation and gradient entropy\n            combined_entropy = act_entropy * (1 + grad_entropy)\n            layer_score += combined_entropy\n        \n        # Average over batch\n        layer_score /= batch_size\n        \n        # Weight by layer depth\n        depth_weight = (i + 1) / min_layers\n        weighted_score = layer_score * depth_weight\n        dual_entropy_scores.append(weighted_score)\n    \n    # Aggregate scores\n    final_score = np.mean(dual_entropy_scores) if dual_entropy_scores else 0\n    \n    # Clean up\n    inputs.requires_grad_(False)\n    \n    return float(final_score)",
        "score": 0.8675845924112295
    },
    {
        "description": "This crossover metric implements aggregation crossover between proxy 2 and proxy 3, combining the weighted average approach with rank-weighted aggregation. It calculates combined activation and gradient entropy scores for each layer, then applies a hybrid aggregation strategy that emphasizes both layer depth and relative importance of each layer's score.",
        "code": "import torch\nimport torch.nn as nn\nimport numpy as np\n\ndef heuristic_4(model, inputs, targets):\n    # Enable gradients\n    inputs.requires_grad_(True)\n    \n    # Store activations and gradients\n    activations = []\n    gradients = []\n    \n    # Register hooks\n    act_hooks = []\n    grad_hooks = []\n    \n    def act_hook(module, input, output):\n        activations.append(output)\n    \n    def grad_hook(module, grad_input, grad_output):\n        gradients.append(grad_output[0])\n    \n    for name, module in model.named_modules():\n        if isinstance(module, nn.Conv2d):\n            act_hooks.append(module.register_forward_hook(act_hook))\n            grad_hooks.append(module.register_backward_hook(grad_hook))\n    \n    # Forward pass\n    outputs = model(inputs)\n    \n    # Backward pass\n    if targets is None:\n        _, pseudo_targets = outputs.max(1)\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, pseudo_targets)\n    else:\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, targets)\n    \n    loss.backward()\n    \n    # Clean up hooks\n    for hook in act_hooks:\n        hook.remove()\n    for hook in grad_hooks:\n        hook.remove()\n    \n    # Compute SVE scores\n    layer_scores = []\n    min_layers = min(len(activations), len(gradients))\n    \n    for i in range(min_layers):\n        act = activations[i]\n        grad = gradients[i]\n        \n        if act.shape != grad.shape:\n            continue\n        \n        batch_size = act.shape[0]\n        channels = act.shape[1]\n        \n        layer_score = 0\n        for b in range(batch_size):\n            # Activation SVE\n            act_sve = 0\n            try:\n                act_flat = act[b].view(channels, -1)\n                u, s, v = torch.svd(act_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                act_sve = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Gradient SVE\n            grad_sve = 0\n            try:\n                grad_flat = grad[b].view(channels, -1)\n                u, s, v = torch.svd(grad_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                grad_sve = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # From proxy 2: weighted addition\n            alpha = 0.7  # Weight for activation SVE\n            beta = 0.3   # Weight for gradient SVE\n            combined = alpha * act_sve + beta * grad_sve\n            layer_score += combined\n        \n        # Average over batch\n        layer_score /= batch_size\n        \n        # Weight by layer depth (from proxy 2)\n        depth_weight = (i + 1) / min_layers\n        weighted_score = layer_score * depth_weight\n        layer_scores.append(weighted_score)\n    \n    # Aggregation crossover: hybrid strategy\n    if layer_scores:\n        # Depth-weighted average (from proxy 2)\n        weighted_avg = np.mean(layer_scores)\n        \n        # Rank-weighted aggregation (from proxy 3)\n        sorted_indices = np.argsort(layer_scores)\n        num_layers = len(layer_scores)\n        rank_weights = np.arange(1, num_layers + 1) / np.sum(np.arange(1, num_layers + 1))\n        rank_weighted = sum(layer_scores[i] * rank_weights[rank] for rank, i in enumerate(sorted_indices))\n        \n        # Combine both aggregation methods\n        final_score = 0.5 * weighted_avg + 0.5 * rank_weighted\n    else:\n        final_score = 0\n    \n    # Clean up\n    inputs.requires_grad_(False)\n    \n    return float(final_score)",
        "score": 0.8649975456069532
    },
    {
        "description": "This metric implements operation crossover between proxy 2 and proxy 8, replacing the multiplication operation with addition in combining activation and gradient information. It computes SVE for both activations and gradients, then combines them using a weighted sum approach that adapts to the relative magnitudes of each component.",
        "code": "import torch\nimport torch.nn as nn\nimport numpy as np\n\ndef heuristic_3(model, inputs, targets):\n    # Enable gradients\n    inputs.requires_grad_(True)\n    \n    # Store activations and gradients\n    activations = []\n    gradients = []\n    \n    # Register hooks\n    act_hooks = []\n    grad_hooks = []\n    \n    def act_hook(module, input, output):\n        activations.append(output)\n    \n    def grad_hook(module, grad_input, grad_output):\n        gradients.append(grad_output[0])\n    \n    for name, module in model.named_modules():\n        if isinstance(module, nn.Conv2d):\n            act_hooks.append(module.register_forward_hook(act_hook))\n            grad_hooks.append(module.register_backward_hook(grad_hook))\n    \n    # Forward pass\n    outputs = model(inputs)\n    \n    # Backward pass\n    if targets is None:\n        _, pseudo_targets = outputs.max(1)\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, pseudo_targets)\n    else:\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, targets)\n    \n    loss.backward()\n    \n    # Clean up hooks\n    for hook in act_hooks:\n        hook.remove()\n    for hook in grad_hooks:\n        hook.remove()\n    \n    # Compute dual entropy scores\n    layer_scores = []\n    min_layers = min(len(activations), len(gradients))\n    \n    for i in range(min_layers):\n        act = activations[i]\n        grad = gradients[i]\n        \n        if act.shape != grad.shape:\n            continue\n        \n        batch_size = act.shape[0]\n        channels = act.shape[1]\n        \n        layer_score = 0\n        for b in range(batch_size):\n            # Calculate activation SVE\n            act_sve = 0\n            try:\n                act_flat = act[b].view(channels, -1)\n                u, s, v = torch.svd(act_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                act_sve = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Calculate gradient SVE\n            grad_sve = 0\n            try:\n                grad_flat = grad[b].view(channels, -1)\n                u, s, v = torch.svd(grad_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                grad_sve = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Operation crossover: weighted addition instead of multiplication\n            alpha = 0.7  # Weight for activation SVE\n            beta = 0.3   # Weight for gradient SVE\n            combined = alpha * act_sve + beta * grad_sve\n            layer_score += combined\n        \n        # Average over batch\n        layer_score /= batch_size\n        \n        # Apply depth weighting\n        depth_weight = (i + 1) / min_layers\n        weighted_score = layer_score * depth_weight\n        layer_scores.append(weighted_score)\n    \n    # Aggregate using rank-weighted method from proxy 5\n    if layer_scores:\n        sorted_indices = np.argsort(layer_scores)\n        num_layers = len(layer_scores)\n        rank_weights = np.arange(1, num_layers + 1) / np.sum(np.arange(1, num_layers + 1))\n        final_score = sum(layer_scores[i] * rank_weights[rank] for rank, i in enumerate(sorted_indices))\n    else:\n        final_score = 0\n    \n    # Clean up\n    inputs.requires_grad_(False)\n    \n    return float(final_score)",
        "score": 0.8318961837274823
    },
    {
        "description": "This metric combines the rank-weighted aggregation from proxy 3 with the dual entropy calculation from proxy 8, implementing both aggregation and operation crossover. It computes both activation and gradient entropy for each layer, combines them using weighted multiplication, then applies rank-weighted aggregation to prioritize the most informative layers rather than using a simple mean.",
        "code": "import torch\nimport torch.nn as nn\nimport numpy as np\n\ndef heuristic_4(model, inputs, targets):\n    # Enable gradients\n    inputs.requires_grad_(True)\n    \n    # Store activations and gradients\n    activations = []\n    gradients = []\n    \n    # Register hooks for activations\n    activation_hooks = []\n    def activation_hook(module, input, output):\n        activations.append(output)\n    \n    # Register hooks for gradients\n    gradient_hooks = []\n    def gradient_hook(module, grad_input, grad_output):\n        gradients.append(grad_output[0])\n    \n    # Register hooks on convolutional layers\n    for name, module in model.named_modules():\n        if isinstance(module, nn.Conv2d):\n            activation_hooks.append(module.register_forward_hook(activation_hook))\n            gradient_hooks.append(module.register_backward_hook(gradient_hook))\n    \n    # Forward pass\n    outputs = model(inputs)\n    \n    # Create loss and backward\n    if targets is None:\n        _, pseudo_targets = outputs.max(1)\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, pseudo_targets)\n    else:\n        criterion = nn.CrossEntropyLoss()\n        loss = criterion(outputs, targets)\n    \n    loss.backward()\n    \n    # Clean up hooks\n    for hook in activation_hooks:\n        hook.remove()\n    for hook in gradient_hooks:\n        hook.remove()\n    \n    # Compute dual entropy scores\n    dual_entropy_scores = []\n    min_layers = min(len(activations), len(gradients))\n    \n    for i in range(min_layers):\n        act = activations[i]\n        grad = gradients[i]\n        \n        # Skip if dimensions don't match\n        if act.shape != grad.shape:\n            continue\n            \n        batch_size = act.shape[0]\n        channels = act.shape[1]\n        \n        layer_score = 0\n        for b in range(batch_size):\n            # Activation SVE\n            act_entropy = 0\n            try:\n                act_flat = act[b].view(channels, -1)\n                u, s, v = torch.svd(act_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                act_entropy = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Gradient SVE\n            grad_entropy = 0\n            try:\n                grad_flat = grad[b].view(channels, -1)\n                u, s, v = torch.svd(grad_flat)\n                s_norm = s / torch.sum(s)\n                s_norm = s_norm[s_norm > 1e-10]\n                grad_entropy = -torch.sum(s_norm * torch.log(s_norm)).item()\n            except:\n                pass\n            \n            # Combine activation and gradient entropy\n            combined_entropy = act_entropy * (1 + grad_entropy)\n            layer_score += combined_entropy\n        \n        # Average over batch\n        layer_score /= batch_size\n        \n        # Weight by layer depth\n        depth_weight = (i + 1) / min_layers\n        weighted_score = layer_score * depth_weight\n        dual_entropy_scores.append(weighted_score)\n    \n    # Rank-weighted aggregation (from proxy 3)\n    if dual_entropy_scores:\n        # Sort scores and get indices\n        sorted_indices = np.argsort(dual_entropy_scores)\n        # Calculate rank weights (higher ranks get higher weights)\n        num_layers = len(dual_entropy_scores)\n        rank_weights = np.arange(1, num_layers + 1) / sum(range(1, num_layers + 1))\n        # Apply rank weights to scores\n        final_score = sum(dual_entropy_scores[i] * rank_weights[rank] for rank, i in enumerate(sorted_indices))\n    else:\n        final_score = 0\n    \n    # Clean up\n    inputs.requires_grad_(False)\n    \n    return float(final_score)",
        "score": 0.8081618714249124
    },
    {
        "description": "This metric performs an operation crossover by combining SVE analysis with a weighted variance operation. It evaluates both representation diversity through singular value entropy and the statistical spread of feature activations. For each layer, we compute both the SVE and the weighted variance of activations, then combine them through multiplication. The variance component helps capture how widely spread the feature representations are across different spatial locations, while entropy measures information distribution across channels.",
        "code": "import torch\nimport torch.nn as nn\nimport numpy as np\n\ndef heuristic_2(model, inputs, targets):\n    model.eval()\n    feature_maps = []\n    hooks = []\n    \n    # Register hooks to capture feature maps\n    def hook_fn(module, input, output):\n        feature_maps.append(output.detach())\n    \n    for name, module in model.named_modules():\n        if isinstance(module, nn.Conv2d):\n            hooks.append(module.register_forward_hook(hook_fn))\n    \n    # Forward pass\n    with torch.no_grad():\n        _ = model(inputs)\n    \n    # Remove hooks\n    for hook in hooks:\n        hook.remove()\n    \n    # Compute combined SVE and variance for each feature map\n    combined_scores = []\n    for i, fm in enumerate(feature_maps):\n        batch_size, channels, height, width = fm.shape\n        \n        layer_score = 0\n        for b in range(batch_size):\n            # Reshape for SVD calculation\n            fm_reshaped = fm[b].view(channels, -1)\n            \n            # Compute SVD\n            try:\n                u, s, v = torch.svd(fm_reshaped)\n                # Normalize singular values\n                s_norm = s / torch.sum(s)\n                # Calculate entropy, avoiding log(0)\n                s_norm = s_norm[s_norm > 1e-10]\n                entropy = -torch.sum(s_norm * torch.log(s_norm))\n                \n                # Calculate variance of feature map\n                fm_var = torch.var(fm[b].view(-1))\n                \n                # Combine entropy and variance\n                sample_score = entropy.item() * (1 + fm_var.item())\n                layer_score += sample_score\n            except:\n                # Fallback if SVD fails\n                layer_score += 0\n        \n        # Average over batch\n        layer_score /= batch_size\n        # Weight by layer position\n        depth_weight = (i + 1) / len(feature_maps)\n        weighted_score = layer_score * depth_weight\n        combined_scores.append(weighted_score)\n    \n    # Aggregate scores: higher is better\n    final_score = np.mean(combined_scores) if combined_scores else 0\n    \n    return float(final_score)",
        "score": 0.6843572825579084
    }
]