<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="Gaze on the Prize: Shaping Visual Attention with Return-Guided Contrastive Learning">
  <meta name="keywords" content="Visual Attention, Contrastive Learning, Reinforcement Learning, Manipulation">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Gaze on the Prize</title>
  <script>
    // Steps available for each task, keyed by the option values of the
    // "single-menu-tasks" dropdown. updateStepOptions() turns the selected
    // task's list into the options of the "single-menu-demos" dropdown, and
    // updateSingleVideo() uses the chosen step as a directory name in the
    // video path (videos/TASK/STEP/...). Steps are kept as strings because
    // they are compared against the dropdown's string value.
    // NOTE(review): the numbers look like training-step counts at which the
    // videos were recorded -- confirm against the videos/ directory.
    var taskSteps = {
      "pickcube": ["2457600", "5734400"],
      "pickcube_clutter": ["1638400", "3276800", "5734400", "7372800"],
      "placesphere": ["4096000", "6963200"],
      "placesphere_clutter": ["4915200", "8192000", "10649600", "18022400"],
      "pokecube": ["4915200", "10240000", "15564800"],
      "pokecube_clutter": ["4505600", "8192000", "13516800", "19251200"],
      "pusht": ["9420800", "18841600", "29081600", "34816000"],
      "pusht_clutter": ["6963200", "12288000", "20480000", "33996800", "40960000", "43008000"]
    };

    /**
     * Repopulate the step dropdown ("single-menu-demos") with the steps that
     * taskSteps lists for the task chosen in "single-menu-tasks".
     *
     * The first step becomes the selection unless the step that was selected
     * before the rebuild is also offered for the new task, in which case that
     * step stays selected. Tasks missing from taskSteps get a single "0" entry.
     */
    function updateStepOptions() {
      var taskMenu = document.getElementById("single-menu-tasks");
      var stepMenu = document.getElementById("single-menu-demos");
      var previousStep = stepMenu.value;
      var available = taskSteps[taskMenu.value] || ["0"];

      // Drop every existing option before inserting the new list.
      stepMenu.innerHTML = "";

      for (var i = 0; i < available.length; i++) {
        var entry = document.createElement("option");
        entry.value = available[i];
        entry.text = available[i];
        if (i === 0) {
          // Default to the first step; may be overridden just below.
          entry.selected = true;
        }
        stepMenu.appendChild(entry);
      }

      // Carry the earlier selection over when the new task offers it too.
      if (available.indexOf(previousStep) !== -1) {
        stepMenu.value = previousStep;
      }
    }

    /**
     * Rebuild the side-by-side comparison videos inside "video-container"
     * for the task and step currently chosen in the two dropdowns.
     *
     * Tasks whose value contains "_clutter" show two videos (nocont, cont);
     * all other tasks additionally show the patch-attention video first.
     * Each clip is loaded from videos/TASK/STEP/TASK_TYPE.mp4.
     *
     * The elements are created with DOM APIs rather than an HTML string, so
     * task/step values are never parsed as markup, and playback is started
     * directly on the freshly created elements (inserting a source child
     * already starts loading, so no delayed load() call is needed).
     */
    function updateSingleVideo() {
      var step = document.getElementById("single-menu-demos").value;
      var task = document.getElementById("single-menu-tasks").value;
      var videoContainer = document.getElementById("video-container");

      console.log("single", step, task);

      // The two foveal variants are always shown; the patch-attention video
      // is only shown for non-clutter tasks.
      var variants = [
        { type: "nocont", title: "Foveal Attention (ours)" },
        { type: "cont", title: "Foveal Attention (Contrastive, ours)" }
      ];
      if (!task.includes("_clutter")) {
        variants.unshift({ type: "patch", title: "Patch Attention" });
      }

      // Build one column: a caption plus a muted, looping, autoplaying video.
      function buildColumn(variant) {
        var heading = document.createElement("h4");
        heading.className = "subtitle is-6 has-text-centered";
        heading.textContent = variant.title;

        var source = document.createElement("source");
        source.src = "videos/" + task + "/" + step + "/" + task + "_" + variant.type + ".mp4";
        source.type = "video/mp4";

        var video = document.createElement("video");
        // Set the property as well as the attribute: autoplay policies look
        // at the live muted state of the element.
        video.muted = true;
        video.setAttribute("muted", "");
        video.autoplay = true;
        video.loop = true;
        video.preload = "auto";
        // iOS Safari needs playsinline for muted videos to autoplay inline.
        video.setAttribute("playsinline", "");
        video.setAttribute("width", "100%");
        video.appendChild(source);

        var column = document.createElement("div");
        column.className = "column is-4";
        column.appendChild(heading);
        column.appendChild(video);

        return { column: column, video: video };
      }

      var row = document.createElement("div");
      row.className = "columns is-centered";

      var videos = [];
      variants.forEach(function (variant) {
        var built = buildColumn(variant);
        row.appendChild(built.column);
        videos.push(built.video);
      });

      // Replace whatever was shown before with the new row.
      videoContainer.innerHTML = "";
      videoContainer.appendChild(row);

      videos.forEach(function (video) {
        var attempt = video.play();
        // Very old browsers return undefined from play() instead of a promise.
        if (!attempt || typeof attempt.catch !== "function") {
          return;
        }
        attempt.catch(function (e) {
          console.error('Video play failed:', e);
          // Retry once when the first frame has loaded; { once: true } keeps
          // the listener from accumulating on the element.
          video.addEventListener('loadeddata', function () {
            var retry = video.play();
            if (retry && typeof retry.catch === "function") {
              retry.catch(function (err) {
                console.error('Retry play failed:', err);
              });
            }
          }, { once: true });
        });
      });
    }

    /**
     * Point the "q-pred-video" player at the clip for the task chosen in the
     * "single-menu-qpred" dropdown (media/results/qpred/TASK.mp4) and start
     * playback at 1.75x speed.
     *
     * Does nothing when either element is missing from the page, and logs
     * (rather than leaving unhandled) a rejected play() promise, e.g. when
     * the browser blocks autoplay.
     */
    function updateQpredVideo() {
      var menu = document.getElementById("single-menu-qpred");
      var video = document.getElementById("q-pred-video");
      if (!menu || !video) {
        return;
      }

      var task = menu.value;

      console.log("qpred", task);

      video.src = "media/results/qpred/" + task + ".mp4";
      // Keep this after the src assignment: per the HTML media load
      // algorithm, loading a new resource resets playbackRate to
      // defaultPlaybackRate.
      video.playbackRate = 1.75;

      var attempt = video.play();
      // Very old browsers return undefined from play() instead of a promise.
      if (attempt && typeof attempt.catch === "function") {
        attempt.catch(function (err) {
          console.error('Q-pred video play failed:', err);
        });
      }
    }

  </script>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">
  <link href="https://fonts.googleapis.com/css2?family=Noto+Serif:ital,wght@0,100..900;1,100..900&display=swap" rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <script>
    // Once the document has been parsed, wait a further 100 ms and then
    // bring the step dropdown and the comparison videos in line with the
    // initially selected task. The Q-prediction player is only refreshed
    // when a "q-pred-video" element exists on the page.
    document.addEventListener('DOMContentLoaded', () => {
      var populateInitialState = () => {
        updateStepOptions();
        updateSingleVideo();
        var qpredPlayer = document.getElementById("q-pred-video");
        if (qpredPlayer) {
          updateQpredVideo();
        }
      };

      setTimeout(populateInitialState, 100);
    });
  </script>
</head>
<body>

<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">Gaze on the Prize: <br>Shaping Visual Attention <br>with Return-Guided Contrastive Learning</h1>
          <h3 class="title is-4 conference-authors">Anonymous Submission</h3>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- <section class="hero teaser">
  <div class="container is-fullhd">
    <div class="hero-body">
      <div class="container">
        <h2 class="subtitle has-text-centered">
        <span class="dgaze">Gaze on the Prize</span> is framework 
        </h2>
      </div>
    </div>
  </div>
</section> -->



<section class="section">
  <div class="container is-max-widescreen">

    <h2 class="title is-3">Videos</h2>
        <div class="columns">
          <div class="column has-text-centered">
            <h3 class="title is-5">Side-by-side Comparison of Attention Methods and Contrastive Learning</h3>

            <div class="select is-small">
              <select id="single-menu-tasks" aria-label="Task" onchange="updateStepOptions(); updateSingleVideo()">
              <option value="pokecube">PokeCube</option>
              <option value="pokecube_clutter">PokeCube (clutter)</option>
              <option value="pickcube">PickCube</option>
              <option value="pickcube_clutter">PickCube (clutter)</option>
              <option value="placesphere">PlaceSphere</option>
              <option value="placesphere_clutter">PlaceSphere (clutter)</option>
              <option value="pusht">PushT</option>
              <option value="pusht_clutter" selected="selected">PushT (clutter)</option>
              </select>
            </div>
            task, at step
            <div class="select is-small">
              <select id="single-menu-demos" aria-label="Step" onchange="updateSingleVideo()">
              <option value="6963200">6963200</option>
              <option value="12288000">12288000</option>
              <option value="20480000">20480000</option>
              <option value="33996800">33996800</option>
              <option value="40960000">40960000</option>
              <option value="43008000" selected="selected">43008000</option>
              </select>
            </div>
            <br/>
            <br/>
            <div id="video-container" style="width: 100%; min-height: 400px;">
              <!-- Videos will be loaded here dynamically -->
            </div>
          </div>
        </div>
        <br>
        <br>

    <!-- Animation. -->
    <div class="rows is-centered ">
      <div class="row is-full-width">
        <h2 class="title is-3"><span class="dgaze">Gaze on the Prize</span></h2>

        <!-- Interpolating. -->
        <h3 class="title is-4">A Versatile Framework for Learning Human-like Gaze</h3>
        <div class="content has-text-justified">
        <!-- <br> -->
        </div>
        <p>
          <span class="dgaze">Gaze on the Prize</span> is a framework that guides attention to focus on task-relevant visual features in RL through return-guided contrastive learning. By contrasting similar states with different outcomes, our method guides attention toward the features that matter for task success.  
        </p>
        <br>
        <br>
        <img src="static/images/arch.png" class="architecture-image" 
         alt="gaze on the prize architecture" />
        <br>
        <br>
          <p>
              <b>a)</b> A CNN backbone encodes observations into feature maps. Instead of passing them directly to the RL algorithm (baseline), our method refines them with a gaze module that predicts Gaussian attention weights parameterized by μ<sub>x</sub>, μ<sub>y</sub>, σ<sub>x</sub>, σ<sub>y</sub>, σ<sub>xy</sub>. Multiplying features by these weights (&odot;) creates a human-like, foveated representation for the RL algorithm. <b>b)</b> During training, we store CNN features and returns in a buffer. Triplet mining groups together similar features that yield different returns. <b>c)</b> The attention is applied on each triplet and a contrastive loss on cosine distances (anchor z<sub>a</sub>, positive z<sub>&plus;</sub>, negative z<sub>&minus;</sub>) guides the module to adjust its attention to better distinguish features by reward.
          </p>
        <br>
        <br>
        <h3 class="title is-4">Q. How do different attention mechanisms affect visual RL performance?</h3>
          <p class="justify">
            <img src="static/images/rq1.png" class="rq1-image"
                 alt="RQ1 figure." />
             Our foveal attention appears to provide essential regularization, preventing these failure modes while maintaining flexibility to focus on task-relevant regions. However, because patch attention lacks structural constraints, its attention may focus on misleading features or shift too rapidly during training, which leads to unstable training.
          </p>
        <br/>
        <br/>

        <h3 class="title is-4">Q. Does return-guided contrastive learning enhance attention?</h3>
          <p class="justify">
            <img src="static/images/rq2.png" class="rq2-image"
                 alt="RQ2 figure." />
             For challenging tasks, contrastive learning has a stronger impact: for <code>PushT</code>, it provides a 1.48x improvement in sample efficiency to reach 50% success, and for <code>LiftPegUpright</code>, only the contrastive variant reaches 50% success within a reasonable number of steps. Notably, <code>PokeCube</code> shows the highest improvement, with 2.4x better sample efficiency compared to the baseline.
          </p>
        <br/>
        <br/>

        <h3 class="title is-4">Q. Does contrastive learning improve robustness to visual clutter?</h3>
          <p class="justify">
            <img src="static/images/rq3.png" class="rq3-image"
                 alt="RQ3 figure." />
             The performance gap is more apparent under visual clutter. For example, while foveal attention without contrastive learning is unable to solve the <code>PushTClutter</code> task, even underperforming the baseline, contrastive learning provides the necessary guidance to find critical cues in the cluttered environment. Likewise, for <code>PokeCubeClutter</code>, foveal attention with contrastive learning reaches a robust ~90% success rate, whereas without contrastive learning the success rate plateaus below 80%.
          </p>
        <br/>
        <br/>

        <h3 class="title is-4">Q. Is our approach applicable across different RL algorithms?</h3>
          <p class="justify">
            <img src="static/images/rq4.png" class="rq4-image"
                 alt="RQ4 figure." />
             We evaluate our method with off-policy SAC (Soft Actor-Critic) on five ManiSkill3 tasks. We observe improvements over the baseline, in either faster convergence or higher final success rates. The trend is similar to that of PPO, demonstrating that our approach is not tied to a single RL algorithm, but can be applied to different RL methods without heavy modifications.
          </p>
        <br/>
        <br/>
      </div>
    </div>
        <div class="rows">


        <!--/ Re-rendering. -->

        
    </div>
  </div>
</section>
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column">
        <div class="content has-text-centered">
          <p>
            Template borrowed from <a href="https://github.com/peract/peract.github.io">PerAct</a>. 
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>


</body>
</html>
