<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="How Well Does GPT-4o Understand Vision? Evaluating Multimodal Foundation Models on Standard Computer Vision Tasks">
  <meta name="keywords" content="Foundation Model, Frontier Model, Computer Vision,
                                 Multimodal Learning, Multimodal Foundation Models,
                                 GPT-4o, Claude, Gemini, Qwen, Evaluation, Vision Benchmarks,
                                 Multi-modal Learning, Generative Modeling,
                                 Computer Vision, Vision">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>How Well Does GPT-4o Understand Vision? Evaluating Multimodal Foundation Models on Standard Computer Vision Tasks</title>

<script>
  // Analytics bootstrap: commands are queued on the global `dataLayer` array.
  // Reuse an existing array if one is already defined; otherwise start with an empty one.
  window.dataLayer = window.dataLayer || [];
  // Pushes the raw `arguments` object itself (not a copied array) onto the queue.
  function gtag(){dataLayer.push(arguments);}
  // Queue a 'js' command carrying the current timestamp.
  gtag('js', new Date());

  // Queue a 'config' command for the ID below (presumably a Google Analytics measurement ID).
  // NOTE(review): no <script> that loads gtag.js is visible in this <head>; confirm it is
  // loaded elsewhere, otherwise nothing consumes these queued commands.
  gtag('config', 'G-TZC7ZXBJ1N');
</script>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.ico">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" />

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>

  <!-- Main JS -->
  <script src="./static/js/index.js"></script>
</head>
<body>
  <!-- Top navigation bar with in-page section links. -->
  <nav id="navbar" class="navbar is-transparent is-fixed-top glass-overlay" role="navigation" aria-label="main navigation">
    <!-- Brand area: home link plus the burger control, whose data-target names #navMenu. -->
    <div class="navbar-brand">
      <a class="navbar-item" href="#">Home</a>

      <a class="navbar-burger" role="button" data-target="navMenu" aria-label="menu" aria-expanded="false">
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
      </a>
    </div>

    <!-- Menu container referenced by the burger's data-target. -->
    <div id="navMenu" class="navbar-menu">
      <div class="navbar-start">
        <a class="navbar-item" href="#about">Overview</a>

        <!-- Method dropdown: one entry per task anchor. -->
        <div class="navbar-item has-dropdown is-hoverable">
          <a class="navbar-link" href="#method">Method</a>
          <div class="navbar-dropdown is-boxed">
            <a class="navbar-item" href="#object-detection">Object Detection</a>
            <a class="navbar-item" href="#semantic-segmentation">Semantic Segmentation</a>
            <a class="navbar-item" href="#grouping">Grouping</a>
            <a class="navbar-item" href="#depth-prediction">Depth Prediction</a>
            <a class="navbar-item" href="#normal-prediction">Surface Normal Prediction</a>
          </div>
        </div>

        <!-- Evaluations dropdown: qualitative and quantitative result anchors. -->
        <div class="navbar-item has-dropdown is-hoverable">
          <a class="navbar-link" href="#quantitative-results">Evaluations</a>
          <div class="navbar-dropdown is-boxed">
            <a class="navbar-item" href="#visuals">Qualitative Results</a>
            <a class="navbar-item" href="#quantitative-results">Quantitative Results</a>
          </div>
        </div>
      </div>
    </div>
  </nav>

<!-- Headline hero: the paper title. -->
<section class="hero headline">
  <div class="hero-body">
    <div class="container is-max-desktop headline-container">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <div class="title-container">
            <h1 class="title is-1 publication-title" id="scrollingHead">How Well Does GPT-4o Understand Vision? Evaluating Multimodal Foundation Models on Standard Computer Vision Tasks</h1>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser: pull figure followed by its caption. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- The alt text summarises what the caption below says the figure shows. -->
      <img src="https://storage.googleapis.com/fm-task-evals-website/images/pull_figure.svg"
           alt="Pull figure: GPT-4o predictions on six vision tasks (left) and performance of multimodal foundation models compared with specialist vision models (right).">
      <h2 class="subtitle has-text-centered">
      We benchmark popular multimodal foundation models (MFMs) on standard semantic and geometric computer vision tasks using established datasets.
      The left part of the figure displays
      <a href="https://openai.com/index/hello-gpt-4o/" target="_blank" rel="noopener"><span class="dlink">GPT-4o's</span></a>
      predictions for different tasks, including <b>classification</b>, <b>object detection</b>, <b>semantic segmentation</b>, <b>grouping</b>,
      <b>depth prediction</b>, and <b>surface normal prediction</b>. The right part of the figure quantifies the performance of MFMs on these tasks and provides
      comparisons with specialist state-of-the-art vision models for each task&mdash;both directly and under the constraints of our framework (+chain).
      </h2>
    </div>
  </div>
</section>


<section class="section hero is-light">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3" id="about">Summary</h2>
        <div class="content has-text-justified">

          <!-- Paragraphs and lists are siblings: a <ul> is not allowed inside a <p>, and the
               parser would otherwise close the <p> early and leave an empty trailing paragraph. -->
          <p>
          Multimodal foundation models, such as GPT-4o, have made remarkable progress recently. However, it is not clear how well these models understand vision in detail, especially when it comes to tasks beyond question answering.
          In this paper, we <b>benchmark the performance of popular multimodal foundation models</b>
          (GPT-4o, o4-mini, Gemini 1.5 Pro and Gemini 2.0 Flash, Claude 3.5 Sonnet, Qwen2-VL, Llama 3.2) <b>on standard computer vision tasks</b> (semantic segmentation, object detection, image classification, depth and surface normal prediction)
          <b>using established datasets</b> (e.g., COCO, ImageNet and its variants, etc). The main challenges in performing this analysis are:
          </p>
          <ul>
            <li>
              Most models are trained to output text and cannot natively express versatile domains, such as segments or 3D geometry.
            </li>
            <li>
              Many leading models are proprietary and accessible only at an API level, i.e., there is no weight access to adapt them.
            </li>
          </ul>
          <p>
            We address these challenges by translating standard vision tasks into equivalent text-promptable and API-compatible tasks via prompt chaining to create a standardized benchmarking framework. We observe that:
          </p>
          <ul>
            <li>
              The models are not close to the state-of-the-art specialist models at any task.
            </li>
            <li>
              They are respectable generalists, which is remarkable as they are presumably trained on primarily image-text-based tasks.
            </li>
            <li>
              They perform semantic tasks notably better than geometric ones.
            </li>
            <li>
              GPT-4o performs the best among non-reasoning models, getting the top position in 4 out of 6 tasks.
            </li>
            <li>
              Reasoning models, e.g. o3, show improvements in geometric tasks.
            </li>
            <li>
              While prompt chaining techniques affect performance, better models are less sensitive to prompt variations.
            </li>
            <li>
              An analysis of models with native image generation, such as the latest GPT-4o, shows they exhibit failure modes, such as hallucinated objects or misalignment between input and output.
            </li>
          </ul>

        </div>

      </div>
    </div>
    <!--/ Abstract. -->


    <!-- Overview. -->
    <div class="extended-container columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Overview</h2>

        <!-- Three link tiles, each pointing at an in-page anchor. -->
        <div class="tile is-ancestor is-centered">
          <!-- NOTE(review): the extra "asdf" class on the first tile looks like a placeholder; confirm whether any stylesheet uses it. -->
          <div class="tile is-parent">
            <a class="tile is-child box asdf" href="#method">
              <p class="title is-5">Method overview</p>
              <p>Discover the prompt chaining techniques used for the various vision tasks.</p>
            </a>
          </div>
          <div class="tile is-parent">
            <a class="tile is-child box" href="#visuals">
              <p class="title is-5">Qualitative visuals</p>
              <p>Explore various interactive qualitative visuals generated by the MFMs.</p>
            </a>
          </div>
          <div class="tile is-parent">
            <a class="tile is-child box" href="#quantitative-results">
              <p class="title is-5">Quantitative results</p>
              <p>See how the MFMs stack up against each other, and against specialist models on standard vision tasks.</p>
            </a>
          </div>
        </div>

      </div>
    </div>
    <!--/ Overview. -->
  </div>

</section>


<!-- Method Description -->
<section class="section">
  <div class="container is-max-desktop">

    <!-- Overview -->
    <div class="columns is-centered">
      <div class="column is-full-width">
        <a class="anchor" id="method"></a>
        <h2 class="title is-3">Prompt Chaining</h2>

        <div class="content has-text-justified">
          <p>
            Prompt chaining is a technique designed to help MFMs break down complex tasks into simpler, manageable
            sub-tasks. We develop prompt chains for all of the tasks we evaluate, namely, object detection, semantic
            segmentation, grouping, depth prediction, and surface normal prediction. To guide the choice of how to
            split each task into sub-tasks, we rely on our early key observation that most MFMs are relatively strong
            at image classification, and therefore try to split each task into multiple classification sub-tasks. 
          </p>
        </div>

<!-- Object Detection -->
        <a class="anchor" id="object-detection"></a>
        <h2 class="title is-4">Object Detection</h2>
        <div class="content has-text-justified">
          <p>
          We divide the task into two stages. First, the model identifies all objects in the image. Then, it localizes each 
          object by recursively zooming in. We divide the image into grid cells and ask the model to check if any part of the 
          object is in each cell. The model discards empty cells, narrowing the search area. By using both coarse and fine grids, 
          we quickly downsample and refine the object's edges, pinpointing its location.
          </p>
        </div>

        <div class="hero-body">

          <!-- Custom playback buttons for #objectVideo; the <video> below declares no native controls attribute. -->
          <!-- NOTE(review): id="play-controls" is repeated by other control groups in this file, and ids should be unique.
               Kept unchanged in case a stylesheet or script keys on it; confirm and migrate to a class. -->
          <div class="field has-addons is-pulled-right" id="play-controls"
               title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <p class="control">
              <!-- Icon-only button: aria-label supplies the accessible name. -->
              <button type="button" class="button is-small is-rounded" aria-label="Play or pause animation" onclick="playPauseVideo('objectVideo')">
                <span class="icon is-small" aria-hidden="true">
                  &nbsp;<i class="fa fa-play"></i>&nbsp;<i class="fa fa-pause"></i>&nbsp;
                </span>
              </button>
            </p>
            <p class="control">
              <button type="button" class="button is-small is-rounded" onclick="restartVideo('objectVideo')">
                <span class="icon is-small" aria-hidden="true">
                  <i class="fas fa-redo"></i>
                </span>
                <span>Restart animation</span>
              </button>
            </p>
          </div>

          <video id="objectVideo" height="100%" width="100%" preload="metadata"
                 title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <source src="https://storage.googleapis.com/fm-task-evals-website/videos/object-video.mp4#t=0.1"
                    type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            At each step, we divide the image into a grid of crops and each crop is queried for the presence of the target object
            (sheep in the figure). Cells without the object are discarded, and the process repeats until the full object is located.
          </h2>
        </div>
<!--/ Object Detection -->

<!-- Semantic Segmentation -->
        <a class="anchor" id="semantic-segmentation"></a>
        <h2 class="title is-4">Semantic Segmentation</h2>
        <div class="content has-text-justified">
          <p>
            In semantic segmentation, the goal is to assign a class label to each pixel in an image. Instead of querying each pixel 
            individually, we group pixels into superpixels using <a href="https://ieeexplore.ieee.org/document/6205760" target="_blank"><span class="dlink">SLIC</span></a>, removing the need for per-pixel queries. 
            Superpixels segment the image into smaller, homogeneous regions based on features like color and texture. We then classify 
            these superpixels in batches, leveraging the strength of MFMs in image classification. To improve accuracy, we include previous 
            batch predictions in the chain and provide multi-scale crops of each superpixel, which enhances the model's ability to capture fine details.
          </p>
        </div>

        <div class="hero-body">

          <!-- Custom playback buttons for #segVideo; the <video> below declares no native controls attribute. -->
          <!-- NOTE(review): id="play-controls" is repeated by other control groups in this file, and ids should be unique.
               Kept unchanged in case a stylesheet or script keys on it; confirm and migrate to a class. -->
          <div class="field has-addons is-pulled-right" id="play-controls"
               title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <p class="control">
              <!-- Icon-only button: aria-label supplies the accessible name. -->
              <button type="button" class="button is-small is-rounded" aria-label="Play or pause animation" onclick="playPauseVideo('segVideo')">
                <span class="icon is-small" aria-hidden="true">
                  &nbsp;<i class="fa fa-play"></i>&nbsp;<i class="fa fa-pause"></i>&nbsp;
                </span>
              </button>
            </p>
            <p class="control">
              <button type="button" class="button is-small is-rounded" onclick="restartVideo('segVideo')">
                <span class="icon is-small" aria-hidden="true">
                  <i class="fas fa-redo"></i>
                </span>
                <span>Restart animation</span>
              </button>
            </p>
          </div>

          <video id="segVideo" height="100%" width="100%" preload="metadata"
                 title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <source src="https://storage.googleapis.com/fm-task-evals-website/videos/segment-video.mp4#t=0.1"
                    type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            We divide the image into superpixels and create "multi-scale pyramids" of superpixels. The pyramids are then classified using the model sequentially to
            produce the complete segmentation map. A multi-scale pyramid consists of 3 layers: a crop of the superpixel, some context surrounding the crop, and the full image.
          </h2>
        </div>
<!--/ Semantic Segmentation -->

<!-- Grouping -->
 <a class="anchor" id="grouping"></a>
        <h2 class="title is-4">Grouping</h2>
        <div class="content has-text-justified">
          <p>
            In the grouping task, given an image and a query point, the goal is to find other pixels that belong to the same object or background. Unlike semantic segmentation, 
            this task has no fixed set of classes, making it more challenging. We use superpixels and leverage the MFM's ability to assess visual similarity. Each superpixel 
            acts as a node in a graph, with edges connecting neighboring superpixels. Starting from the query point, the model evaluates adjacent superpixels to determine if 
            they belong to the same object. This process continues, merging relevant superpixels, until no more are added.
          </p>
        </div>

        <div class="hero-body">

          <!-- Custom playback buttons for #groupingVideo; the <video> below declares no native controls attribute. -->
          <!-- NOTE(review): id="play-controls" is repeated by other control groups in this file, and ids should be unique.
               Kept unchanged in case a stylesheet or script keys on it; confirm and migrate to a class. -->
          <div class="field has-addons is-pulled-right" id="play-controls"
               title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <p class="control">
              <!-- Icon-only button: aria-label supplies the accessible name. -->
              <button type="button" class="button is-small is-rounded" aria-label="Play or pause animation" onclick="playPauseVideo('groupingVideo')">
                <span class="icon is-small" aria-hidden="true">
                  &nbsp;<i class="fa fa-play"></i>&nbsp;<i class="fa fa-pause"></i>&nbsp;
                </span>
              </button>
            </p>
            <p class="control">
              <button type="button" class="button is-small is-rounded" onclick="restartVideo('groupingVideo')">
                <span class="icon is-small" aria-hidden="true">
                  <i class="fas fa-redo"></i>
                </span>
                <span>Restart animation</span>
              </button>
            </p>
          </div>

          <video id="groupingVideo" height="100%" width="100%" preload="metadata"
                 title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <source src="https://storage.googleapis.com/fm-task-evals-website/videos/grouping-video.mp4#t=0.1"
                    type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            Given an image and a query point, we first divide the image into superpixels and select the superpixel that the query point falls into.
            At each step, the model is asked to identify the adjacent superpixels (in batches) that belong to the same object as the one covered by the cluster.
            The selected superpixels are then merged with the cluster to form the next step's input cluster.
          </h2>
        </div>
<!--/ Grouping -->


<!-- Depth Prediction -->
 <a class="anchor" id="depth-prediction"></a>
        <h2 class="title is-4">Depth Prediction</h2>
        <div class="content has-text-justified">
          <p>
            Predicting 3D depth from a single 2D image is inherently ambiguous, so we perform relative depth prediction by having the model rank 
            different regions of the image based on their distance from the camera. Instead of querying individual pixels, we segment the image into 
            superpixels and sample pairs of superpixels for comparison. The model ranks these pairs by relative depth. These pairwise rankings are then 
            globalized using an objective function that assigns larger values to deeper superpixels. We assume all pixels within a superpixel share the 
            same depth, allowing us to extend superpixel-level rankings to a pixel-wise depth map across the image.
          </p>
        </div>

        <div class="hero-body">

          <!-- Custom playback buttons for #depthVideo; the <video> below declares no native controls attribute. -->
          <!-- NOTE(review): id="play-controls" is repeated by other control groups in this file, and ids should be unique.
               Kept unchanged in case a stylesheet or script keys on it; confirm and migrate to a class. -->
          <div class="field has-addons is-pulled-right" id="play-controls"
               title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <p class="control">
              <!-- Icon-only button: aria-label supplies the accessible name. -->
              <button type="button" class="button is-small is-rounded" aria-label="Play or pause animation" onclick="playPauseVideo('depthVideo')">
                <span class="icon is-small" aria-hidden="true">
                  &nbsp;<i class="fa fa-play"></i>&nbsp;<i class="fa fa-pause"></i>&nbsp;
                </span>
              </button>
            </p>
            <p class="control">
              <button type="button" class="button is-small is-rounded" onclick="restartVideo('depthVideo')">
                <span class="icon is-small" aria-hidden="true">
                  <i class="fas fa-redo"></i>
                </span>
                <span>Restart animation</span>
              </button>
            </p>
          </div>

          <video id="depthVideo" height="100%" width="100%" preload="metadata"
                 title="Hint: Right click the video and choose 'Show All Controls' to enable more fine-grained video controls.">
            <source src="https://storage.googleapis.com/fm-task-evals-website/videos/depth-video.mp4#t=0.1"
                    type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            Randomly selected superpixel pairs are compared by the model for relative depth, and the pairwise ranks are globalized to generate a depth map.
          </h2>
        </div>
<!--/ Depth Prediction -->


<!-- Surface normal prediction -->
 <a class="anchor" id="normal-prediction"></a>
        <h2 class="title is-4">Surface Normal Prediction</h2>
        <div class="content has-text-justified">
          <p>
            For surface normal prediction, we use a ranking approach similar to the one used for depth. We select standard basis vectors (right, up, and forward) as reference 
            directions and query the model to compare randomly sampled superpixel pairs based on their alignment with each vector. The pairwise comparisons are then 
            globalized using the same algorithm as for depth, resulting in three separate surface normal maps. As before, we assume uniformity within each superpixel, 
            assigning the same rank to all pixels within a superpixel group.
          </p>
        </div>
<!--/ Surface normal prediction  -->
      </div>
    </div>
  </div>
</section>

<!-- Generative visualizations -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">

        <a class="anchor" id="visuals"></a>
          <h2 class="title is-3">Visual showcase</h2>

            <br/>

            <!--Multitask Predictions-->
            <a class="anchor" id="semantic-editing"></a>
            <h3 class="title is-5">Multitask predictions</h3>
            <div class="content has-text-justified">
              <p>
                Here, we showcase the ability of MFMs to perform multiple tasks on the same image, using prompt chaining. The visuals are generated 
                using GPT-4o, and demonstrate a strong semantic understanding of the images, and a non-trivial but relatively weaker understanding of geometric properties.
              </p>
            </div>
            <div class="bboxprobing-panel">

              <!-- Controls row: task slider plus previous/next image buttons. -->
              <!-- NOTE(review): id="prev-button" / id="next-button" are repeated by other panels in this file, and ids should be unique.
                   Kept unchanged in case a stylesheet or script keys on them; confirm and migrate to classes. -->
              <div class="columns is-centered is-mobile is-vcentered">
                <div class="column is-1 has-text-centered"></div>

                <div class="column is-10 has-text-centered">

                  <!-- The range input has no visible label, so aria-label names it. -->
                  <input class="slider is-fullwidth is-large is-info has-output"
                        id="multitask-slider"
                        step="1" min="0" max="4" value="0" type="range"
                        aria-label="Select task">

                  <button type="button" class="button" id="prev-button" onclick="prevMultitaskImage()">
                    <span class="icon">
                      <i class="fas fa-arrow-left"></i>
                    </span>
                    <span>Prev.</span>
                  </button>
                  <button type="button" class="button" id="next-button" onclick="nextMultitaskImage()">
                    <span>Next</span>
                    <span class="icon">
                      <i class="fas fa-arrow-right"></i>
                    </span>
                  </button>

                </div>
                <div class="column is-1 has-text-centered"></div>
              </div>

              <br/>

              <!-- Image row: spacer, RGB input, prediction, spacer. The trailing spacer column now
                   sits inside the row; previously it was closed out of the .columns container. -->
              <div class="columns is-mobile">
                <div class="column is-2"></div>
                <div class="column is-4 has-text-centered">
                  <div id="multitask-wrapper-rgb" class="rgb-image">
                    <img src="https://storage.googleapis.com/fm-task-evals-website/images/multitask/0.jpg" alt="RGB input image"/>
                  </div>
                  <h2 class="subtitle">RGB input</h2>
                </div>

                <div class="column is-4 has-text-centered">
                  <div id="multitask-pred" class="rgb-image">
                    <img src="https://storage.googleapis.com/fm-task-evals-website/images/multitask/0/0.png" alt="Model prediction for the selected task"/>
                  </div>
                  <h2 class="subtitle" id="multitask-subtitle">Bounding boxes</h2>
                </div>
                <div class="column is-2"></div>
              </div>

              <p style="text-align:center"><i>
                Hint: Drag the slider to change the task.
                Use the buttons to explore different images.
              </i></p>
              <br/>

            </div>
          <!--/ Multitask Predictions -->

          <br/>
          <br/>

          <!-- Comparison with ground truth -->
          <a class="anchor" id="rgb-to-all"></a>
          <h3 class="title is-5">Comparison with ground truth</h3>
          <div class="content has-text-justified">
            <p>
              Here, we provide a visual comparison between the predicted outputs of various models and the ground truth across all tasks.
            </p>
          </div>
            <div class="bboxprobing-panel" style="display: flex; flex-direction: column; align-items: center;">

              <!-- Previous/next buttons for the comparison images. -->
              <!-- NOTE(review): id="prev-button" / id="next-button" are repeated by other panels in this file, and ids should be unique.
                   Kept unchanged in case a stylesheet or script keys on them; confirm and migrate to classes. -->
              <div class="columns is-centered is-mobile is-vcentered">
                <div class="column is-1 has-text-centered"></div>
                <div class="column is-10 has-text-centered">
                  <button type="button" class="button" id="prev-button" onclick="prevComparisonImage()">
                    <span class="icon">
                      <i class="fas fa-arrow-left"></i>
                    </span>
                    <span>Prev.</span>
                  </button>
                  <button type="button" class="button" id="next-button" onclick="nextComparisonImage()">
                    <span>Next</span>
                    <span class="icon">
                      <i class="fas fa-arrow-right"></i>
                    </span>
                  </button>
                </div>
                <div class="column is-1 has-text-centered"></div>
              </div>

              <!-- <img> is a void element: no closing tag, and allowfullscreen is not an <img> attribute. -->
              <img id="rgb2allFrame" class="has-ratio is-clipped" src="https://storage.googleapis.com/fm-task-evals-website/images/model_comparison_carousel/comparison_0.webp" alt="Predictions of different models compared with the ground truth across tasks" style="width: 75%; height: auto; border: none;">

              <p style="text-align:center"><i>
                Hint: Use the buttons to explore different images.
              </i></p>
              <br/>
            </div>
          <!--/ Comparison with ground truth -->

        <br/>
        <br/>

        <!-- Segment anything-->
          <div class="section">
            <div class="container">
              <a class="anchor" id="cat-image-overlay"></a>
              <h3 class="title is-5">Segment anything</h3>
              <div class="content has-text-justified">
                <p>
                  Hover over the original image to see different overlaid segmentations based on your cursor's position. The masks were generated using GPT-4o. The gray points
                  indicate the query points used to generate the segmentations.
                </p>
              </div>
              
              <div class="sam-panel">
                <!-- Previous/next buttons for the segmentation images. -->
                <!-- NOTE(review): id="prev-button" / id="next-button" are repeated by other panels in this file, and ids should be unique.
                     Kept unchanged in case a stylesheet or script keys on them; confirm and migrate to classes. -->
                <div class="columns is-centered is-mobile is-vcentered">
                  <div class="column is-1 has-text-centered"></div>
                  <div class="column is-10 has-text-centered">
                    <button type="button" class="button" id="prev-button" onclick="prevSamImage()">
                      <span class="icon">
                        <i class="fas fa-arrow-left"></i>
                      </span>
                      <span>Prev.</span>
                    </button>
                    <button type="button" class="button" id="next-button" onclick="nextSamImage()">
                      <span>Next</span>
                      <span class="icon">
                        <i class="fas fa-arrow-right"></i>
                      </span>
                    </button>
                  </div>
                  <div class="column is-1 has-text-centered"></div>
                </div>

                <div class="column is-8 has-text-centered">
                  <!-- Image the visitor hovers over; alt describes the overlays and query points the section text mentions. -->
                  <img
                    id="samImg"
                    src="https://storage.googleapis.com/fm-task-evals-website/images/all_overlayed_imgs/0.webp"
                    alt="Image with segmentation overlays and gray query points"
                    style="width: 100%; height: auto; z-index: 0;"
                  />
                </div>
                <br/>

                <p style="text-align:center"><i>
                  Hint: Move your cursor over the image to explore different segmentations. Use the buttons to explore different images.
                </i></p>
                <br/>
              </div>


            </div>
          </div>
          <!--/ Segment anything -->

        <br/>
        <br/>

        <!--Video Predictions-->
            <a class="anchor" id="dynamic-predictions"></a>
            <h3 class="title is-5">Dynamic predictions</h3>
            <div class="content has-text-justified">
              <p>
                An RGB input is shown on the left and the model's predictions on the right, obtained for each frame of a video. Users can use the slider to navigate through frames, observing how the model changes its predictions over time.
              </p>
            </div>
            <div class="bboxprobing-panel">

              <!-- Controls row: frame slider, previous/next buttons and the play/pause toggle. -->
              <!-- NOTE(review): id="prev-button" / id="next-button" are repeated by other panels in this file, and ids should be unique.
                   Kept unchanged in case a stylesheet or script keys on them; confirm and migrate to classes. -->
              <div class="columns is-centered is-mobile is-vcentered">
                <div class="column is-1 has-text-centered"></div>
                <div class="column is-10 has-text-centered">
                  <!-- The range input has no visible label, so aria-label names it. -->
                  <input class="slider is-fullwidth is-large is-info has-output"
                        id="video-slider"
                        step="1" min="0" max="19" value="0" type="range"
                        aria-label="Select video frame">
                  <br>
                  <button type="button" class="button" id="prev-button" onclick="prevVideoImage()">
                    <span class="icon">
                      <i class="fas fa-arrow-left"></i>
                    </span>
                    <span>Prev.</span>
                  </button>
                  <button type="button" class="button" id="next-button" onclick="nextVideoImage()">
                    <span>Next</span>
                    <span class="icon">
                      <i class="fas fa-arrow-right"></i>
                    </span>
                  </button>
                  <button type="button" class="button" id="predictions-play-pause-button" onclick="togglePredictionsPlayPause()">
                    <span class="icon">
                      <i class="fas fa-play"></i>
                    </span>
                    <span>Play</span>
                  </button>
                </div>
                <div class="column is-1 has-text-centered"></div>
              </div>

              <br/>

              <!-- Image row: spacer, RGB frame, prediction, spacer. The trailing spacer column now
                   sits inside the row; previously it was closed out of the .columns container. -->
              <div class="columns is-mobile">
                <div class="column is-2"></div>
                <div class="column is-4 has-text-centered">
                  <div id="video-wrapper-rgb" class="rgb-image">
                    <img src="https://storage.googleapis.com/fm-task-evals-website/images/video_slider/0/rgb_0.webp" alt="RGB input frame"/>
                  </div>
                  <h2 class="subtitle">RGB input</h2>
                </div>

                <div class="column is-4 has-text-centered">
                  <div id="video-pred" class="rgb-image">
                    <img src="https://storage.googleapis.com/fm-task-evals-website/images/video_slider/0/0.webp" alt="Model prediction for the current frame"/>
                  </div>
                  <h2 class="subtitle" id="video-subtitle">Bounding box prediction</h2>
                </div>
                <div class="column is-2"></div>
              </div>

              <p style="text-align:center"><i>
                Hint: Drag the slider to change the frame.
                Use the buttons to explore different tasks.
              </i></p>
              <br/>
            </div>
          <!--/ Video Predictions -->
        
      </div>
    </div>
  </div>
</section>
<!--/ Generative visualizations -->

<!--Reasoning Models-->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <a class="anchor" id="reasoning-models"></a>
        <h2 class="title is-3">Reasoning Models</h2>
        <div class="columns is-vcentered">
          <div class="column is-half">
            <div class="content has-text-justified">
              <p>
                We evaluated reasoning models, including o1 and o3, on a smaller subset of our data, using GPT-4o as a baseline. The results, summarized in the spider chart, show that while these models perform comparably to GPT-4o on semantic tasks, they exhibit stronger performance on geometric tasks.
              </p>
              <p>
                We also experimented with varying the reasoning effort for o4-mini. While we observed some improvement with medium and high reasoning effort compared to low, the trend was not consistent across all tasks. For a deeper dive into these experiments, please see the paper and the supplementary material.
              </p>
            </div>
          </div>
          <div class="column is-half">
            <img src="https://storage.googleapis.com/fm-task-evals-website/images/reasoning-models-spider.svg" alt="Reasoning Models Performance Chart" style="width: 80%;"> 
            <!-- <img src="./static/images/reasoning-models-spider.svg" alt="Reasoning Models Performance Chart" style="width: 80%;"> -->
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!--/ Reasoning Models-->

<!--4o ImageGen-->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <a class="anchor" id="gpt4o-imagegen"></a>
        <h2 class="title is-3">GPT-4o with Image Generation</h2>
        <div class="content has-text-justified">
          <p>
            Recent updates to GPT-4o allow it to generate dense image outputs instead of just text, which is a promising development for vision tasks. However, these new image generation features exhibit several limitations. Specifically, we observe that generated outputs suffer from spatial misalignments and hallucinations. This presents challenges in directly applying this model to vision tasks, which we leave to future work to address. The figure below highlights some of these failure cases.
          </p>
        </div>
        <div class="has-text-centered">
          <img src="https://storage.googleapis.com/fm-task-evals-website/images/4o-imagegen-preds.webp" alt="Failure cases of GPT-4o with image generation" style="width: 99%;">
          <!-- <img src="./static/images/4o-imagegen-preds.webp" alt="Failure cases of GPT-4o with image generation" style="width: 99%;"> -->
          <p class="is-size-6 has-text-grey"><i>Failure cases of GPT-4o with image generation capability. Despite the model's promising capabilities, limitations remain. Here, we highlight some typical failure modes: <span style="color:blue;">hallucinations</span> (marked in <span style="color:blue;">dotted blue</span>) and <span style="color:green;">inaccurate predictions</span> (marked in <span style="color:green;">dotted green</span>).</i></p>
        </div>
      </div>
    </div>
  </div>
</section>
<!--/4o ImageGen-->


<!--Quantitatives -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered">
      <div class="column is-full-width">
        <a class="anchor" id="quantitative-results"></a>
        <h2 class="title is-3">Quantitative results</h2>

        <div class="content has-text-justified">
          <p>
            Here, we quantitatively explore how different MFMs perform across various visual tasks. As described earlier,
            we tested GPT-4o, o4-mini, Gemini 1.5 Pro and 2.0 Flash, Claude 3.5 Sonnet, Qwen2-VL-72B, and Llama 3.2, comparing their capabilities to specialized vision models.
          </p>
        </div>


        <a class="anchor" id="classification-quant"></a>
        <h2 class="title is-4">Classification</h2>
        <div class="content has-text-justified">
          <p>
            Our classification testing revealed interesting results across all datasets. While MFMs didn't quite match the performance of specialized 
            vision models like Model Soups ViT-G and OpenCLIP H, they showed impressive capabilities. GPT-4o emerged as the standout performer, followed 
            by Gemini 2.0 Flash, Gemini 1.5 Pro, Claude 3.5 Sonnet, Qwen2-VL, o4-mini and Llama 3.2. Notably, these models demonstrated good resilience to image corruptions and distribution shifts.
          </p>
        </div>
        
        <div class="table-container">
          <table class="table" align="center">
            <!-- Two-level header: the first row groups the robustness benchmarks,
                 the second row names every column. scope="col" marks each cell as a
                 column header for assistive technology. -->
            <thead>
              <tr>
                <!-- Empty spacer above the Model, ImageNet and ImageNet-V2 columns. -->
                <th colspan="3" class="has-text-centered"></th>
                <th colspan="2" class="has-text-centered" scope="col">Corruptions</th>
                <th colspan="2" class="has-text-centered" scope="col">Domain Shift</th>
              </tr>
              <tr>
                <th class="has-text-centered" scope="col">Model</th>
                <th class="has-text-centered" scope="col">ImageNet</th>
                <th class="has-text-centered" scope="col">ImageNet-V2</th>
                <th class="has-text-centered" scope="col">2DCC</th>
                <th class="has-text-centered" scope="col">3DCC</th>
                <th class="has-text-centered" scope="col">ImageNet-R</th>
                <th class="has-text-centered" scope="col">ImageNet Sketch</th>
              </tr>
            </thead>
            <tbody>
              <!-- Vision Specialist Rows (shaded red) -->
              <tr class="shaded-red">
                <td class="has-text-centered">Model Soups ViT-G</td>
                <td class="has-text-centered">90.94</td>
                <td class="has-text-centered">84.22</td>
                <td class="has-text-centered">-</td>
                <td class="has-text-centered">-</td>
                <td class="has-text-centered">95.46</td>
                <td class="has-text-centered">74.23</td>
              </tr>
              <tr class="shaded-red">
                <td class="has-text-centered">OpenCLIP H</td>
                <td class="has-text-centered">84.37</td>
                <td class="has-text-centered">78.33</td>
                <td class="has-text-centered">66.96</td>
                <td class="has-text-centered">65.95</td>
                <td class="has-text-centered">93.76</td>
                <td class="has-text-centered">73.24</td>
              </tr>
              
              <!-- MFM Rows (shaded green) -->
              <tr class="shaded-green">
                <td class="has-text-centered">GPT-4o</td>
                <td class="has-text-centered">77.20</td>
                <td class="has-text-centered">71.57</td>
                <td class="has-text-centered">62.46</td>
                <td class="has-text-centered">61.13</td>
                <td class="has-text-centered">84.38</td>
                <td class="has-text-centered">67.30</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">o4-mini</td>
                <td class="has-text-centered">55.90</td>
                <td class="has-text-centered">46.99</td>
                <td class="has-text-centered">37.22</td>
                <td class="has-text-centered">36.68</td>
                <td class="has-text-centered">56.05</td>
                <td class="has-text-centered">45.18</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Gemini 2.0 Flash</td>
                <td class="has-text-centered">74.78</td>
                <td class="has-text-centered">75.79</td>
                <td class="has-text-centered">55.67</td>
                <td class="has-text-centered">56.92</td>
                <td class="has-text-centered">82.05</td>
                <td class="has-text-centered">69.43</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Gemini 1.5 Pro</td>
                <td class="has-text-centered">73.88</td>
                <td class="has-text-centered">69.76</td>
                <td class="has-text-centered">56.14</td>
                <td class="has-text-centered">56.22</td>
                <td class="has-text-centered">71.42</td>
                <td class="has-text-centered">57.15</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Claude 3.5 Sonnet</td>
                <td class="has-text-centered">62.85</td>
                <td class="has-text-centered">54.45</td>
                <td class="has-text-centered">40.76</td>
                <td class="has-text-centered">41.41</td>
                <td class="has-text-centered">70.36</td>
                <td class="has-text-centered">57.42</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Qwen2-VL</td>
                <td class="has-text-centered">55.54</td>
                <td class="has-text-centered">49.39</td>
                <td class="has-text-centered">38.92</td>
                <td class="has-text-centered">36.45</td>
                <td class="has-text-centered">66.31</td>
                <td class="has-text-centered">51.18</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Llama 3.2</td>
                <td class="has-text-centered">49.15</td>
                <td class="has-text-centered">48.21</td>
                <td class="has-text-centered">34.45</td>
                <td class="has-text-centered">34.37</td>
                <td class="has-text-centered">65.05</td>
                <td class="has-text-centered">47.11</td>
              </tr>
            </tbody>
          </table>
        </div>

      <br/>

       <a class="anchor" id="detection-quant"></a>
        <h2 class="title is-4">Object Detection</h2>
        <div class="content has-text-justified">
          <p>
            In object detection tests on a subset of the COCO dataset, we compared the MFMs against specialized vision models like DETR and Co-DETR. While all MFMs performed below 
            these specialized models, GPT-4o achieved the highest performance among the MFMs, significantly outperforming others. Interestingly, 
            even when testing Gemini 1.5 Pro and Qwen2-VL with direct bounding box regression, they still couldn't match GPT-4o's performance with the chain algorithm.
          </p>
        </div>
        
        <div class="table-container">
          <table class="table" align="center">
            <!-- Column headers for the detection results; scope="col" marks each cell
                 as a column header for assistive technology. -->
            <thead>
              <tr>
                <th class="has-text-centered" scope="col">Baselines</th>
                <th class="has-text-centered" scope="col">Model</th>
                <th class="has-text-centered" scope="col">AP<sub>50</sub></th>
                <th class="has-text-centered" scope="col">AP<sub>75</sub></th>
                <th class="has-text-centered" scope="col">AP</th>
              </tr>
            </thead>
            <tbody>
              <!-- Vision Specialists Rows (shaded-red) -->
              <tr class="shaded-red">
                <td class="has-text-centered" rowspan="6">Vision Specialists</td>
                <td class="has-text-centered has-text-grey-light">Co-DETR</td>
                <td class="has-text-centered has-text-grey-light">91.30</td>
                <td class="has-text-centered has-text-grey-light">86.17</td>
                <td class="has-text-centered has-text-grey-light">80.23</td>
              </tr>
              <tr class="shaded-red">
                <td class="has-text-centered">Co-DETR + Chain</td>
                <td class="has-text-centered">90.06</td>
                <td class="has-text-centered">52.78</td>
                <td class="has-text-centered">51.54</td>
              </tr>
              <tr class="shaded-red">
                <td class="has-text-centered has-text-grey-light">DETR</td>
                <td class="has-text-centered has-text-grey-light">73.31</td>
                <td class="has-text-centered has-text-grey-light">63.61</td>
                <td class="has-text-centered has-text-grey-light">58.67</td>
              </tr>
              <tr class="shaded-red">
                <td class="has-text-centered">DETR + Chain</td>
                <td class="has-text-centered">72.33</td>
                <td class="has-text-centered">38.36</td>
                <td class="has-text-centered">39.36</td>
              </tr>
              <tr class="shaded-red">
                <td class="has-text-centered has-text-grey-light">4M-21</td>
                <td class="has-text-centered has-text-grey-light">59.54</td>
                <td class="has-text-centered has-text-grey-light">51.57</td>
                <td class="has-text-centered has-text-grey-light">47.71</td>
              </tr>
              <tr class="shaded-red">
                <td class="has-text-centered">4M-21 + Chain</td>
                <td class="has-text-centered">55.46</td>
                <td class="has-text-centered">30.48</td>
                <td class="has-text-centered">30.74</td>
              </tr>

              <!-- MFMs Rows (shaded-green) -->
              <tr class="shaded-green">
                <td class="has-text-centered" rowspan="7">MFMs</td>
                <td class="has-text-centered">GPT-4o</td>
                <td class="has-text-centered">60.62</td>
                <td class="has-text-centered">31.97</td>
                <td class="has-text-centered">31.87</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">o4-mini</td>
                <td class="has-text-centered">42.90</td>
                <td class="has-text-centered">22.18</td>
                <td class="has-text-centered">22.60</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Gemini 2.0 Flash</td>
                <td class="has-text-centered">44.17</td>
                <td class="has-text-centered">15.83</td>
                <td class="has-text-centered">19.85</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Gemini 1.5 Pro</td>
                <td class="has-text-centered">39.75</td>
                <td class="has-text-centered">15.27</td>
                <td class="has-text-centered">18.11</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Claude 3.5 Sonnet</td>
                <td class="has-text-centered">31.69</td>
                <td class="has-text-centered">12.13</td>
                <td class="has-text-centered">14.78</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Qwen2-VL</td>
                <td class="has-text-centered">35.62</td>
                <td class="has-text-centered">12.82</td>
                <td class="has-text-centered">15.27</td>
              </tr>
              <tr class="shaded-green">
                <td class="has-text-centered">Llama 3.2</td>
                <td class="has-text-centered">31.87</td>
                <td class="has-text-centered">8.40</td>
                <td class="has-text-centered">12.83</td>
              </tr>

              <!-- Control Rows (shaded-blue) -->
              <tr class="shaded-blue">
                <td class="has-text-centered" rowspan="3">Control</td>
                <td class="has-text-centered">Oracle + Chain (pred. class)</td>
                <td class="has-text-centered">75.44</td>
                <td class="has-text-centered">41.31</td>
                <td class="has-text-centered">41.56</td>
              </tr>
              <tr class="shaded-blue">
                <td class="has-text-centered">Oracle + Chain (full)</td>
                <td class="has-text-centered">92.18</td>
                <td class="has-text-centered">49.33</td>
                <td class="has-text-centered">50.14</td>
              </tr>
              <tr class="shaded-blue">
                <td class="has-text-centered">Blind guess</td>
                <td class="has-text-centered">&lt;0.01</td>
                <td class="has-text-centered">&lt;0.01</td>
                <td class="has-text-centered">&lt;0.01</td>
              </tr>
            </tbody>
          </table>
        </div>

      <br/>

      <a class="anchor" id="segmentation-quant"></a>
        <h2 class="title is-4">Semantic Segmentation</h2>
        <div class="content has-text-justified">
          <p>
            For semantic segmentation on a subset of COCO, the MFMs achieved notable but not state-of-the-art performance. While they showed promising capabilities, they still fell 
            behind specialized models like OneFormer. 
          </p>
        </div>
      
      <a class="anchor" id="grouping-quant"></a>
        <h2 class="title is-5">Grouping</h2>
        <div class="content has-text-justified">
          <p>
            In our grouping task evaluation, which built upon semantic segmentation, we saw varying levels of success among the MFMs. GPT-4o emerged as the top 
            performer, showing good overall performance, though still not matching the capabilities of the specialized SAM model.
          </p>
        </div>
        <div class="tables-wrapper">
          <!-- First Table -->
          <div class="table-container">
            <table class="table" align="center">
              <caption class="title is-6">Semantic Segmentation Results</caption>
              <!-- Column headers for the segmentation results; scope="col" marks each
                   cell as a column header for assistive technology. -->
              <thead>
                <tr>
                  <th class="has-text-centered" scope="col">Baselines</th>
                  <th class="has-text-centered" scope="col">Model</th>
                  <th class="has-text-centered" scope="col">mIoU</th>
                  <th class="has-text-centered" scope="col">Pixel Accuracy</th>
                </tr>
              </thead>
              <tbody>
                <!-- Vision Specialists Rows (shaded-red) -->
                <tr class="shaded-red">
                  <td class="has-text-centered" rowspan="4">Vision Specialists</td>
                  <td class="has-text-centered has-text-grey-light">OneFormer</td>
                  <td class="has-text-centered has-text-grey-light">65.52</td>
                  <td class="has-text-centered has-text-grey-light">83.26</td>
                </tr>
                <tr class="shaded-red">
                  <td class="has-text-centered">OneFormer + Chain</td>
                  <td class="has-text-centered">60.64</td>
                  <td class="has-text-centered">81.69</td>
                </tr>
                <tr class="shaded-red">
                  <td class="has-text-centered has-text-grey-light">4M-21</td>
                  <td class="has-text-centered has-text-grey-light">54.31</td>
                  <td class="has-text-centered has-text-grey-light">79.66</td>
                </tr>
                <tr class="shaded-red">
                  <td class="has-text-centered">4M-21 + Chain</td>
                  <td class="has-text-centered">52.72</td>
                  <td class="has-text-centered">78.59</td>
                </tr>
                

                <!-- MFMs Rows (shaded-green) -->
                <tr class="shaded-green">
                  <td class="has-text-centered" rowspan="7">MFMs</td>
                  <td class="has-text-centered">GPT-4o</td>
                  <td class="has-text-centered">44.89</td>
                  <td class="has-text-centered">68.60</td>
                </tr>
                <tr class="shaded-green">
                  <td class="has-text-centered">o4-mini</td>
                  <td class="has-text-centered">39.19</td>
                  <td class="has-text-centered">64.26</td>
                </tr>
                <tr class="shaded-green">
                  <td class="has-text-centered">Gemini 2.0 Flash</td>
                  <td class="has-text-centered">43.04</td>
                  <td class="has-text-centered">66.15</td>
                </tr>
                <tr class="shaded-green">
                  <td class="has-text-centered">Gemini 1.5 Pro</td>
                  <td class="has-text-centered">40.46</td>
                  <td class="has-text-centered">64.88</td>
                </tr>
                <tr class="shaded-green">
                  <td class="has-text-centered">Claude 3.5 Sonnet</td>
                  <td class="has-text-centered">32.05</td>
                  <td class="has-text-centered">58.41</td>
                </tr>
                <tr class="shaded-green">
                  <td class="has-text-centered">Qwen2-VL</td>
                  <td class="has-text-centered">33.59</td>
                  <td class="has-text-centered">56.36</td>
                </tr>
                <tr class="shaded-green">
                  <td class="has-text-centered">Llama 3.2</td>
                  <td class="has-text-centered">36.63</td>
                  <td class="has-text-centered">59.95</td>
                </tr>

                <!-- Control Rows (shaded-blue): labelled "Control" to match the detection
                     and depth tables and to avoid repeating the "Baselines" column header. -->
                <tr class="shaded-blue">
                  <td class="has-text-centered" rowspan="2">Control</td>
                  <td class="has-text-centered">Oracle + Chain</td>
                  <td class="has-text-centered">83.41</td>
                  <td class="has-text-centered">94.68</td>
                </tr>
                <tr class="shaded-blue">
                  <td class="has-text-centered">Blind guess</td>
                  <td class="has-text-centered">0.03</td>
                  <td class="has-text-centered">0.29</td>
                </tr>
              </tbody>
            </table>
          </div>

            <!-- Second Table -->
            <div class="table-container">
              <table class="table" align="center">
                <caption class="title is-6">Grouping Results</caption>
                <!-- Column headers for the grouping results; scope="col" marks each cell
                     as a column header for assistive technology. -->
                <thead>
                  <tr>
                    <th class="has-text-centered" scope="col">Models</th>
                    <th class="has-text-centered" scope="col">mIoU</th>
                  </tr>
                </thead>
                <tbody>
                  <!-- SAM Rows (shaded-red) -->
                  <tr class="shaded-red">
                    <td class="has-text-centered has-text-grey-light">SAM</td>
                    <td class="has-text-centered has-text-grey-light">80.12</td>
                  </tr>
                  <tr class="shaded-red">
                    <td class="has-text-centered">SAM + Chain</td>
                    <td class="has-text-centered">72.32</td>
                  </tr>

                  <!-- Other Models Rows (shaded-green) -->
                  <tr class="shaded-green">
                    <td class="has-text-centered">GPT-4o</td>
                    <td class="has-text-centered">59.06</td>
                  </tr>
                  <tr class="shaded-green">
                    <td class="has-text-centered">o4-mini</td>
                    <td class="has-text-centered">46.00</td>
                  </tr>
                  <tr class="shaded-green">
                    <td class="has-text-centered">Gemini 2.0 Flash</td>
                    <td class="has-text-centered">55.25</td>
                  </tr>
                  <tr class="shaded-green">
                    <td class="has-text-centered">Gemini 1.5 Pro</td>
                    <td class="has-text-centered">44.13</td>
                  </tr>
                  <tr class="shaded-green">
                    <td class="has-text-centered">Claude 3.5 Sonnet</td>
                    <td class="has-text-centered">41.68</td>
                  </tr>
                  <tr class="shaded-green">
                    <td class="has-text-centered">Qwen2-VL</td>
                    <td class="has-text-centered">21.64</td>
                  </tr>
                  <tr class="shaded-green">
                    <td class="has-text-centered">Llama 3.2</td>
                    <td class="has-text-centered">25.69</td>
                  </tr>

                  <!-- Oracle Row (shaded-blue) -->
                  <tr class="shaded-blue">
                    <td class="has-text-centered">Oracle + Chain</td>
                    <td class="has-text-centered">81.77</td>
                  </tr>
                </tbody>
              </table>
            </div>
          </div>

      <br/>

      <!-- Anchor target for the depth results. The id must be unique in the document;
           it previously reused "grouping-quant", so fragment links resolved to the
           Grouping heading instead. NOTE(review): confirm any navigation link to this
           section uses #depth-quant. -->
      <a class="anchor" id="depth-quant"></a>
        <h2 class="title is-4">Depth prediction</h2>
        <div class="content has-text-justified">
          <p>
            Our depth prediction task on a subset of Hypersim images revealed that while MFMs performed better than random guessing, they still showed significant limitations 
            compared to specialized models like Omnidata. Quantitatively, their geometric abilities appear relatively weaker than their semantic abilities. 
            We evaluated performance using both standard metrics and relative measurements like Spearman  correlation coefficients and pairwise depth comparison 
            accuracy.
          </p>
        </div>
        
      <div class="table-container">
        <table class="table" align="center">
          <!-- Two-level header: the first row groups the metrics by direction
               (higher/lower is better), the second row names each metric.
               scope="col" marks each cell as a column header for assistive technology. -->
          <thead>
            <tr>
              <th rowspan="2" class="has-text-centered" scope="col">Baselines</th>
              <th rowspan="2" class="has-text-centered" scope="col">Method</th>
              <th colspan="5" class="has-text-centered" scope="col">Higher is better ↑</th>
              <th colspan="1" class="has-text-centered" scope="col">Lower is better ↓</th>
            </tr>
            <tr>
              <th class="has-text-centered" scope="col">δ₁</th>
              <th class="has-text-centered" scope="col">δ₂</th>
              <th class="has-text-centered" scope="col">δ₃</th>
              <th class="has-text-centered" scope="col">ρ</th>
              <th class="has-text-centered" scope="col">Accuracy</th>
              <th class="has-text-centered" scope="col">AbsRel</th>
            </tr>
          </thead>
          <tbody>
            <!-- Vision Specialists Rows (shaded-red) -->
            <tr class="shaded-red">
              <td rowspan="4" class="has-text-centered">Vision Specialists</td>
              <td class="has-text-centered has-text-grey-light">Omnidata</td>
              <td class="has-text-centered has-text-grey-light">0.768</td>
              <td class="has-text-centered has-text-grey-light">0.867</td>
              <td class="has-text-centered has-text-grey-light">0.911</td>
              <td class="has-text-centered has-text-grey-light">0.95</td>
              <td class="has-text-centered has-text-grey-light">-</td>
              <td class="has-text-centered has-text-grey-light">0.375</td>
            </tr>
            <tr class="shaded-red">
              <td class="has-text-centered">Omnidata + Chain</td>
              <td class="has-text-centered">0.568</td>
              <td class="has-text-centered">0.772</td>
              <td class="has-text-centered">0.864</td>
              <td class="has-text-centered">0.81</td>
              <td class="has-text-centered">93.74</td>
              <td class="has-text-centered">0.528</td>
            </tr>
            <tr class="shaded-red">
              <td class="has-text-centered has-text-grey-light">4M-21</td>
              <td class="has-text-centered has-text-grey-light">0.636</td>
              <td class="has-text-centered has-text-grey-light">0.814</td>
              <td class="has-text-centered has-text-grey-light">0.888</td>
              <td class="has-text-centered has-text-grey-light">0.89</td>
              <td class="has-text-centered has-text-grey-light">-</td>
              <td class="has-text-centered has-text-grey-light">0.406</td>
            </tr>
            <tr class="shaded-red">
              <td class="has-text-centered">4M-21 + Chain</td>
              <td class="has-text-centered">0.565</td>
              <td class="has-text-centered">0.774</td>
              <td class="has-text-centered">0.865</td>
              <td class="has-text-centered">0.81</td>
              <td class="has-text-centered">88.25</td>
              <td class="has-text-centered">0.529</td>
            </tr>
            
            <!-- MFMs Rows (shaded-green) -->
            <tr class="shaded-green">
              <td rowspan="7" class="has-text-centered">MFMs</td>
              <td class="has-text-centered">GPT-4o</td>
              <td class="has-text-centered">0.459</td>
              <td class="has-text-centered">0.712</td>
              <td class="has-text-centered">0.838</td>
              <td class="has-text-centered">0.53</td>
              <td class="has-text-centered">70.59</td>
              <td class="has-text-centered">0.621</td>
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">o4-mini</td>
              <td class="has-text-centered">0.467</td>
              <td class="has-text-centered">0.718</td>
              <td class="has-text-centered">0.841</td>
              <td class="has-text-centered">0.58</td>
              <td class="has-text-centered">74.08</td>
              <td class="has-text-centered">0.595</td>
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Gemini 2.0 Flash</td>
              <td class="has-text-centered">0.461</td>
              <td class="has-text-centered">0.715</td>
              <td class="has-text-centered">0.839</td>
              <td class="has-text-centered">0.59</td>
              <td class="has-text-centered">71.11</td>
              <td class="has-text-centered">0.615</td>
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Gemini 1.5 Pro</td>
              <td class="has-text-centered">0.458</td>
              <td class="has-text-centered">0.709</td>
              <td class="has-text-centered">0.835</td>
              <td class="has-text-centered">0.51</td>
              <td class="has-text-centered">66.78</td>
              <td class="has-text-centered">0.628</td>
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Claude 3.5 Sonnet</td>
              <td class="has-text-centered">0.429</td>
              <td class="has-text-centered">0.693</td>
              <td class="has-text-centered">0.830</td>
              <td class="has-text-centered">0.48</td>
              <td class="has-text-centered">68.09</td>
              <td class="has-text-centered">0.657</td>
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Qwen2-VL</td>
              <td class="has-text-centered">0.432</td>
              <td class="has-text-centered">0.698</td>
              <td class="has-text-centered">0.831</td>
              <td class="has-text-centered">0.41</td>
              <td class="has-text-centered">64.44</td>
              <td class="has-text-centered">0.637</td>
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Llama 3.2</td>
              <td class="has-text-centered">0.458</td>
              <td class="has-text-centered">0.711</td>
              <td class="has-text-centered">0.835</td>
              <td class="has-text-centered">0.53</td>
              <td class="has-text-centered">67.51</td>
              <td class="has-text-centered">0.608</td>
            </tr>
            
            <!-- Control Rows (shaded-blue) -->
            <tr class="shaded-blue">
              <td rowspan="2" class="has-text-centered">Control</td>
              <td class="has-text-centered">Oracle + Chain</td>
              <td class="has-text-centered">0.571</td>
              <td class="has-text-centered">0.774</td>
              <td class="has-text-centered">0.863</td>
              <td class="has-text-centered">0.83</td>
              <td class="has-text-centered">100.0</td>
              <td class="has-text-centered">0.528</td>
            </tr>
            <tr class="shaded-blue">
              <td class="has-text-centered">Blind Guess</td>
              <td class="has-text-centered">0.375</td>
              <td class="has-text-centered">0.628</td>
              <td class="has-text-centered">0.773</td>
              <td class="has-text-centered">0.25</td>
              <td class="has-text-centered">54.24</td>
              <td class="has-text-centered">0.758</td>
            </tr>
          </tbody>
        </table>
      </div>


      <!-- In-page link target and heading for the "Surface normal prediction" results subsection. -->
      <!-- NOTE(review): the id reads "grouping-quant" although this heading is about surface normals; it looks copied from another subsection. Confirm the id is unique in the document and that links meant for this subsection resolve here. -->
      <a class="anchor" id="grouping-quant"></a>
        <h2 class="title is-4">Surface normal prediction</h2>
        <div class="content has-text-justified">
          <p>
            The surface normal prediction task revealed significant limitations in the MFMs' 3D understanding capabilities. Most notably, several models
            struggled with the left-right direction, and some showed negative correlations for certain directions, revealing a systematic bias in their understanding of those directions. These results
            suggest that MFMs currently have limited 3D visual understanding capabilities.
          </p>
        </div>
        
      <div class="table-container">
        <table class="table" align="center">
          <thead>
            <!-- Column headers: group label, method name, then one ρ column per axis subscript (x, y, z). -->
            <!-- scope="col" ties each header to the cells beneath it for assistive technology; it has no visual effect. -->
            <tr>
              <th class="has-text-centered" scope="col">Baselines</th>
              <th class="has-text-centered" scope="col">Method</th>
              <th class="has-text-centered" scope="col">ρ<sub>x</sub></th>
              <th class="has-text-centered" scope="col">ρ<sub>y</sub></th>
              <th class="has-text-centered" scope="col">ρ<sub>z</sub></th>
              <!-- The accuracy headers below are disabled; every body row carries a matching commented-out trio of cells, so enable or remove them together. -->
              <!-- <th class="has-text-centered">Accuracy<sub>x</sub></th>
              <th class="has-text-centered">Accuracy<sub>y</sub></th>
              <th class="has-text-centered">Accuracy<sub>z</sub></th> -->
            </tr>
          </thead>
          <tbody>
            <!-- Vision Specialists Rows (shaded-red) -->
            <!-- The group label cell spans all four rows (rowspan="4"). The Omnidata and 4M-21 rows carry has-text-grey-light on their cells; the "+ Chain" rows do not. -->
            <!-- Each row ends with three commented-out cells that pair with the commented-out accuracy headers in the table head. -->
            <tr class="shaded-red">
              <td rowspan="4" class="has-text-centered">Vision Specialists</td>
              <td class="has-text-centered has-text-grey-light">Omnidata</td>
              <td class="has-text-centered has-text-grey-light">0.78</td>
              <td class="has-text-centered has-text-grey-light">0.83</td>
              <td class="has-text-centered has-text-grey-light">0.80</td>
              <!-- <td class="has-text-centered has-text-grey-light">-</td>
              <td class="has-text-centered has-text-grey-light">-</td>
              <td class="has-text-centered has-text-grey-light">-</td> -->
            </tr>
            <tr class="shaded-red">
              <td class="has-text-centered">Omnidata + Chain</td>
              <td class="has-text-centered">0.64</td>
              <td class="has-text-centered">0.70</td>
              <td class="has-text-centered">0.58</td>
              <!-- <td class="has-text-centered">95.14</td>
              <td class="has-text-centered">96.31</td>
              <td class="has-text-centered">94.28</td> -->
            </tr>
            <tr class="shaded-red">
              <td class="has-text-centered has-text-grey-light">4M-21</td>
              <td class="has-text-centered has-text-grey-light">0.71</td>
              <td class="has-text-centered has-text-grey-light">0.74</td>
              <td class="has-text-centered has-text-grey-light">0.65</td>
              <!-- <td class="has-text-centered has-text-grey-light">-</td>
              <td class="has-text-centered has-text-grey-light">-</td>
              <td class="has-text-centered has-text-grey-light">-</td> -->
            </tr>
            <tr class="shaded-red">
              <td class="has-text-centered">4M-21 + Chain</td>
              <td class="has-text-centered">0.65</td>
              <td class="has-text-centered">0.70</td>
              <td class="has-text-centered">0.56</td>
              <!-- <td class="has-text-centered">90.76</td>
              <td class="has-text-centered">92.06</td>
              <td class="has-text-centered">86.24</td> -->
            </tr>
            
            <!-- MFMs Rows (shaded-green) -->
            <!-- The group label cell spans the seven model rows below (rowspan="7"). -->
            <!-- NOTE(review): the commented-out cells of the o4-mini and Gemini 2.0 Flash rows hold identical values (38.14 / 52.83 / 49.93), which looks like a copy-paste; verify both before re-enabling those columns. -->
            <tr class="shaded-green">
              <td rowspan="7" class="has-text-centered">MFMs</td>
              <td class="has-text-centered">GPT-4o</td>
              <td class="has-text-centered">-0.14</td>
              <td class="has-text-centered">0.57</td>
              <td class="has-text-centered">0.40</td>
              <!-- <td class="has-text-centered">48.31</td>
              <td class="has-text-centered">75.52</td>
              <td class="has-text-centered">68.53</td> -->
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">o4-mini</td>
              <td class="has-text-centered">0.22</td>
              <td class="has-text-centered">0.61</td>
              <td class="has-text-centered">0.46</td>
              <!-- <td class="has-text-centered">38.14</td>
              <td class="has-text-centered">52.83</td>
              <td class="has-text-centered">49.93</td> -->
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Gemini 2.0 Flash</td>
              <td class="has-text-centered">-0.39</td>
              <td class="has-text-centered">-0.04</td>
              <td class="has-text-centered">0.02</td>
              <!-- <td class="has-text-centered">38.14</td>
              <td class="has-text-centered">52.83</td>
              <td class="has-text-centered">49.93</td> -->
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Gemini 1.5 Pro</td>
              <td class="has-text-centered">-0.17</td>
              <td class="has-text-centered">-0.57</td>
              <td class="has-text-centered">0.04</td>
              <!-- <td class="has-text-centered">43.71</td>
              <td class="has-text-centered">41.24</td>
              <td class="has-text-centered">51.62</td> -->
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Claude 3.5 Sonnet</td>
              <td class="has-text-centered">-0.19</td>
              <td class="has-text-centered">0.61</td>
              <td class="has-text-centered">0.40</td>
              <!-- <td class="has-text-centered">48.16</td>
              <td class="has-text-centered">77.61</td>
              <td class="has-text-centered">66.95</td> -->
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Qwen2-VL</td>
              <td class="has-text-centered">0.09</td>
              <td class="has-text-centered">-0.07</td>
              <td class="has-text-centered">0.02</td>
              <!-- <td class="has-text-centered">50.17</td>
              <td class="has-text-centered">47.25</td>
              <td class="has-text-centered">50.07</td> -->
            </tr>
            <tr class="shaded-green">
              <td class="has-text-centered">Llama 3.2</td>
              <td class="has-text-centered">0.41</td>
              <td class="has-text-centered">-0.42</td>
              <td class="has-text-centered">0.22</td>
              <!-- <td class="has-text-centered">57.58</td>
              <td class="has-text-centered">40.38</td>
              <td class="has-text-centered">56.06</td> -->
            </tr>
            
            <!-- Control Rows (shaded-blue) -->
            <!-- The group label cell spans both control rows (rowspan="2"). -->
            <!-- NOTE(review): this table labels the second control "Blind guess", while the preceding table spells it "Blind Guess"; pick one capitalisation. -->
            <tr class="shaded-blue">
              <td rowspan="2" class="has-text-centered">Control</td>
              <td class="has-text-centered">Oracle + Chain</td>
              <td class="has-text-centered">0.64</td>
              <td class="has-text-centered">0.70</td>
              <td class="has-text-centered">0.60</td>
              <!-- <td class="has-text-centered">100.0</td>
              <td class="has-text-centered">100.0</td>
              <td class="has-text-centered">100.0</td> -->
            </tr>
            <tr class="shaded-blue">
              <td class="has-text-centered">Blind guess</td>
              <td class="has-text-centered">-0.48</td>
              <td class="has-text-centered">-0.61</td>
              <td class="has-text-centered">0.11</td>
              <!-- <td class="has-text-centered">39.70</td>
              <td class="has-text-centered">38.52</td>
              <td class="has-text-centered">53.64</td> -->
            </tr>
          </tbody>
        </table>
      </div>
  
</section>
<!--/ Quantitatives -->



<!--Other Experiments -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered">
      <div class="column is-full-width">
        <!-- In-page link target for this section. -->
        <a class="anchor" id="other-experiments"></a>
        <h2 class="title is-3">Other Experiments</h2>

        <div class="content has-text-justified">
          <!-- The list sits between two paragraphs rather than inside one: a <p> may not contain a <ul>, so nesting it makes the parser end the paragraph at the list, leave the closing sentence unwrapped, and emit an empty trailing <p>. -->
          <p>
            We defer a detailed discussion of several design choices and experimental results to the main paper. Topics covered include:
          </p>
          <ul>
            <li>Sensitivity analysis of various prompts.</li>
            <li>Performance on images beyond standard datasets.</li>
            <li>A detailed discussion of the algorithms.</li>
            <li>Insights into the blind guessing baseline.</li>
            <li>Preliminary investigations with GPT-4o Image Generation.</li>
            <li>Cost analysis of prompting.</li>
          </ul>
          <p>
            Interested readers can find all the details in the paper and supplementary material.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!--/ Other Experiments -->


<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <!-- Template attribution. Both links open in a new tab, so each sets rel="noopener" explicitly; the license link is served over https. -->
          <p>
            This website is based on the
            <a href="https://github.com/nerfies/nerfies.github.io" target="_blank" rel="noopener">Nerfies website template</a>,
            which is licensed under a
            <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>
