<!doctype html>
<!-- HTML5 doctype: the page uses HTML5 <video> and Bulma, whose documentation asks for it.
     The XHTML namespace attribute is kept; it is permitted on <html> in HTML documents. -->
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8">
<!-- Without a viewport declaration, phones lay the page out at a virtual desktop width,
     so the responsive column classes are evaluated against that width, not the device. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Fidelity-Aware Data Composition for Robust Robot Generalization</title>

<!-- Stylesheets. Order is significant for the cascade and is unchanged. -->
<link rel="stylesheet" href="style.css">

<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
      rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
      href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">

<!-- Scripts. Loading order and the defer attribute are unchanged, because code elsewhere
     in the page may depend on jQuery and the Bulma plugins being available while parsing. -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>

<body>
<div class="page-container">
  <!-- Hero banner: paper title, anonymous-submission byline, and a row of link buttons. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">

            <!-- Forced line break after the first phrase; the remainder sits in an
                 inline-block span. -->
            <h1 style="line-height: 1.3; margin-bottom: 15px"
                class="title is-1 publication-title">
              Fidelity-Aware Data Composition<br>
              <span style="display: inline-block;">for Robust Robot Generalization</span>
            </h1>

            <!-- Byline: an anchor without href, i.e. an inert placeholder. -->
            <div style="margin-bottom: 15px" class="publication-authors">
              <span class="author-block">
                <a>ICLR 2026 Anonymous Submission (Paper ID: 10739)</a>
              </span>
            </div>

            <!-- One external link (code repository) followed by three in-page anchors. -->
            <div class="publication-links">
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://anonymous.4open.science/r/CIFT-code">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>

              <span class="link-block">
                <a class="external-link button is-normal is-rounded"
                   href="#our_results_container">
                  <span>Showcases</span>
                </a>
              </span>
              <span class="link-block">
                <a class="external-link button is-normal is-rounded"
                   href="#applications_container">
                  <span>Applications</span>
                </a>
              </span>
              <span class="link-block">
                <a class="external-link button is-normal is-rounded"
                   href="#comparison_with_baseline_container">
                  <span>Comparisons</span>
                </a>
              </span>
            </div>

          </div>
        </div>
      </div>
    </div>
  </section>

    <!-- Viewer note placed between the hero banner and the showcases section. -->
    <p style="max-width: 800px; margin-left: auto; margin-right: auto;"
       class="mt-2 has-text-grey is-size-7 has-text-centered">
      <em>Note: Faces are blurred for privacy. All clips are accelerated with ratios tailored to each scenario for better demonstration.</em>
    </p>
  <!-- ==================== SHOWCASES ==================== -->
<section class="section" id="our_results_container">
  <div class="container is-max-desktop">
    <!-- Showcases heading, tagline, and a short description of the augmentation engine. -->
    <h2 class="title is-3">Showcases produced by our Multi-View Video Augmentation</h2>
    <p class="is-size-5 has-text-centered">
      <b>Scene Augmentation in Three Perspectives with Prompt Guidance.</b>
    </p>

    <p style="max-width: 800px; margin-left: auto; margin-right: auto;"
       class="mt-4 has-text-left is-size-6">
      Our <strong>Multi-View Video Augmentation (MVAug)</strong> engine generates high-fidelity
      scene variations from natural language prompts. For each original video, we show one or
      more augmented versions—each produced from a distinct prompt—demonstrating MVAug’s ability
      to alter <em>lighting</em>, <em>color</em>, or <em>background</em> while preserving 3D
      structure and motion. These <strong>multi-view consistent</strong> augmentations enable
      richer data composition for robust policy learning.
    </p>

    <!-- Task 1: Scan the product.
         One original clip and one prompt-driven augmentation.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Scan the product</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Scan the product: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./001_results_scan_the_product/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Scan the product: augmented video"
                 style="width: 100%; max-width: 680px;">
            <source src="./001_results_scan_the_product/dusk.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Soft dusk lighting, warm yellow glow.</p>
        </div>
      </div>
    </div>

    <!-- Task 2: Make a sandwich (kitchen).
         One original clip and one prompt-driven augmentation.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name.
         NOTE(review): the augmented clip is named tangerine_right.mp4 while its caption
         says the glow comes from the left side; confirm which one is correct. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Make a sandwich (kitchen)</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (kitchen): original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./002_results_make_a_sandwich_in_the_kitchen/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (kitchen): augmented video"
                 style="width: 100%; max-width: 680px;">
            <source src="./002_results_make_a_sandwich_in_the_kitchen/tangerine_right.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Relight with vibrant tangerine glow emanating from the left side.</p>
        </div>
      </div>
    </div>

    <!-- Task 3: Reach for the remote control.
         One original clip and three prompt-driven augmentations.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name.
         NOTE(review): the first augmented clip is named tangerine_right.mp4 but its caption
         is a dusk/spotlight prompt; confirm that file and caption belong together. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Reach for the remote control</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Reach for the remote control: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./003_results_reach_for_the_remote_control/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Reach for the remote control: augmented video 1 of 3"
                 style="width: 100%; max-width: 680px;">
            <source src="./003_results_reach_for_the_remote_control/tangerine_right.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Spotlight effect, soft dusk lighting, warm yellow glow, centered illumination.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Reach for the remote control: augmented video 2 of 3"
                 style="width: 100%; max-width: 680px;">
            <source src="./003_results_reach_for_the_remote_control/lilac.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Transform the lighting of the photo to be uniformly lit, with a uniform lilac glow.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Reach for the remote control: augmented video 3 of 3"
                 style="width: 100%; max-width: 680px;">
            <source src="./003_results_reach_for_the_remote_control/yellow_stage.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Transform the lighting to include blazing yellow stage-like lighting from above.</p>
        </div>
      </div>
    </div>

    <!-- Task 4: Fold the shorts.
         One original clip and one prompt-driven augmentation.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Fold the shorts</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Fold the shorts: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./004_results_fold_the_shorts/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Fold the shorts: augmented video"
                 style="width: 100%; max-width: 680px;">
            <source src="./004_results_fold_the_shorts/warm_light.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Soft dusk lighting, warm yellow glow.</p>
        </div>
      </div>
    </div>

    <!-- Task 5: Open the griddle.
         One original clip and two prompt-driven augmentations.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Open the griddle</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the griddle: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./005_results_open_the_griddle/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the griddle: augmented video 1 of 2"
                 style="width: 100%; max-width: 680px;">
            <source src="./005_results_open_the_griddle/warm_light.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Spotlight effect from a distant front source, soft dusk lighting, warm yellow glow.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the griddle: augmented video 2 of 2"
                 style="width: 100%; max-width: 680px;">
            <source src="./005_results_open_the_griddle/yellow_stage.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Transform the lighting to include blazing yellow stage-like lighting from above.</p>
        </div>
      </div>
    </div>

    <!-- Task 6: Open the salt jar.
         One original clip and one prompt-driven augmentation.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Open the salt jar</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the salt jar: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./006_results_open_the_salt_jar/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the salt jar: augmented video"
                 style="width: 100%; max-width: 680px;">
            <source src="./006_results_open_the_salt_jar/grass.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Replace the background with green grass.</p>
        </div>
      </div>
    </div>

    <!-- Task 7: Swipe the card.
         One original clip and two prompt-driven augmentations.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name.
         NOTE(review): the directory in the paths below is spelled "resutls". It is left
         unchanged because the paths must match the files on disk; confirm before renaming. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Swipe the card</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Swipe the card: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./007_resutls_swipe_the_card/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Swipe the card: augmented video 1 of 2"
                 style="width: 100%; max-width: 680px;">
            <source src="./007_resutls_swipe_the_card/grass.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Replace the background with green grass.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Swipe the card: augmented video 2 of 2"
                 style="width: 100%; max-width: 680px;">
            <source src="./007_resutls_swipe_the_card/wood.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Replace the background with brown floor.</p>
        </div>
      </div>
    </div>

    <!-- Task 8: Make a sandwich (wooden table).
         One original clip and two prompt-driven augmentations.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Make a sandwich (wooden table)</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (wooden table): original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./008_results_make_a_sandwich/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (wooden table): augmented video 1 of 2"
                 style="width: 100%; max-width: 680px;">
            <source src="./008_results_make_a_sandwich/cool.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Recolor the plate to a soft pink-blue shade.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (wooden table): augmented video 2 of 2"
                 style="width: 100%; max-width: 680px;">
            <source src="./008_results_make_a_sandwich/tangerine_right.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Add warm lighting to the vegetables in the scene.</p>
        </div>
      </div>
    </div>

    <!-- Task 9: Make a sandwich (white table).
         One original clip and one prompt-driven augmentation.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Make a sandwich (white table)</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (white table): original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./009_results_make_a_sandwich/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Make a sandwich (white table): augmented video"
                 style="width: 100%; max-width: 680px;">
            <source src="./009_results_make_a_sandwich/wood.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Replace the background with brown floor.</p>
        </div>
      </div>
    </div>

    <!-- Task 10: Open the oven.
         One original clip and one prompt-driven augmentation.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Open the oven</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the oven: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./010_results_open_the_oven/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the oven: augmented video"
                 style="width: 100%; max-width: 680px;">
            <source src="./010_results_open_the_oven/cool.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Apply a purple finish to the oven.</p>
        </div>
      </div>
    </div>

    <!-- Task 11: Open the ice maker lid.
         One original clip and four prompt-driven augmentations.
         Each player carries preload="metadata" (a hint to fetch only metadata until
         playback is requested) and an aria-label that gives it an accessible name. -->
    <div class="box mt-6">
      <h3 class="subtitle is-5">Open the ice maker lid</h3>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Original Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the ice maker lid: original video"
                 style="width: 100%; max-width: 680px;">
            <source src="./011_results_open_the_ice_maker_lid/original.mp4" type="video/mp4">
          </video>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <p class="is-size-6 mb-2">Augmented Video</p>
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the ice maker lid: augmented video 1 of 4"
                 style="width: 100%; max-width: 680px;">
            <source src="./011_results_open_the_ice_maker_lid/cool.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Change the lid of the ice maker to purple.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the ice maker lid: augmented video 2 of 4"
                 style="width: 100%; max-width: 680px;">
            <source src="./011_results_open_the_ice_maker_lid/cyan.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Recolor the lid to a cyan tone.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the ice maker lid: augmented video 3 of 4"
                 style="width: 100%; max-width: 680px;">
            <source src="./011_results_open_the_ice_maker_lid/grass.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Replace the background with green grass.</p>
        </div>
      </div>
      <div class="columns is-centered">
        <div class="column is-full has-text-centered">
          <video class="media-border" controls muted loop preload="metadata"
                 aria-label="Open the ice maker lid: augmented video 4 of 4"
                 style="width: 100%; max-width: 680px;">
            <source src="./011_results_open_the_ice_maker_lid/tangerine_right.mp4" type="video/mp4">
          </video>
          <p class="is-size-7 has-text-grey is-italic mb-2">Prompt: Add a warm orange-yellow glow inside the ice maker.</p>
        </div>
      </div>
    </div>
  </div>
</section>

  <!-- ==================== APPLICATIONS ==================== -->
  <section class="section" id="applications_container">
    <div class="container is-max-desktop">
      <!-- Applications heading and tagline. -->
      <h2 class="title is-3">Applications</h2>
      <p class="is-size-5 has-text-centered">
        <b>Evaluation on the Physical Robot Platform</b>
      </p>

      <!-- Disabled earlier draft of the intro (it describes two scenarios; the live text
           further down lists five). Kept verbatim. -->
      <!-- <p class="mt-4 has-text-left is-size-6" style="max-width: 800px; margin-left: auto; margin-right: auto;">
        Building on the generative capabilities demonstrated above,
        this section evaluates the real-world effectiveness and out-of-distribution (OOD) robustness of policies trained with our CIFT framework across diverse manipulation tasks.
        We consider two representative scenarios: (1) a long-horizon, precision-sensitive <strong>dual-arm cloth folding task</strong>, and (2) a <strong>robust object grasping task</strong> involving varied object appearances and surface conditions.
        Both tasks are tested under extensive OOD conditions—including variations in object color/size, table textures (e.g., wooden, leather, cotton-linen), lighting, and initial poses.
        The successful executions below highlight the policy’s strong generalization.
        We also include representative failure cases,
        which we attribute not to policy limitations but to hardware-induced gripper jitter—a key challenge in real-world robotic deployment.
      </p> -->

      <!-- Live intro: four paragraphs in a width-limited, left-aligned content block. -->
      <div style="max-width: 800px; margin-left: auto; margin-right: auto;"
           class="content has-text-left is-size-6">

        <!-- What is being evaluated. -->
        <p class="mb-4">
          Building on the generative capabilities demonstrated above, this section evaluates the
          <strong>real-world effectiveness</strong> and
          <strong>out-of-distribution (OOD) robustness</strong> of policies trained with our
          <strong>CIFT framework</strong>.
        </p>

        <!-- The five scenarios, grouped into bimanual and single-arm tasks. -->
        <p class="mb-4">
          We expand our evaluation to <strong>five diverse manipulation scenarios</strong>,
          categorized into:
          <br>
          (1) <strong>Bimanual tasks</strong> involving deformable objects
          (Dual-Arm Cloth Folding, Sandwich Making);
          <br>
          (2) <strong>Single-arm tasks</strong>
          (Single-Arm Robust Grasping, Object Packing, Controlled Pouring).
        </p>

        <!-- Test conditions. -->
        <p class="mb-4">
          These tasks necessitate handling complex dynamics—ranging from
          <strong>fluid manipulation</strong> to
          <strong>tight-tolerance insertion</strong>—under extensive OOD conditions. Crucially,
          our testing environment includes <strong>unseen object variances</strong>,
          <strong>lighting shifts</strong>, and <strong>diverse table surface textures</strong>
          (e.g., wooden, leather, cotton-linen).
        </p>

        <!-- What the clips below show. -->
        <p>
          The successful executions highlighting the policy’s
          <strong>strong generalization</strong> are presented below. We also discuss
          representative failure cases, which are primarily attributed to
          <strong>hardware limitations</strong>, such as gripper jitter, rather than algorithmic
          deficiencies.
        </p>

      </div>

      <!-- Fold the shirt (Dual-Arm Cloth Folding).
           Two baseline failure clips, then five success clips split over two rows.
           Each player carries preload="metadata" (a hint to fetch only metadata until
           playback is requested) and an aria-label that gives it an accessible name. -->
      <div class="box mt-6">
        <h3 class="subtitle is-5">Task: Dual-Arm Cloth Folding</h3>

        <p class="is-size-7 has-text-grey mb-3" style="max-width: 680px; margin-left: auto; margin-right: auto; text-align: left;">
          The policy must fold a T-shirt using two arms in a coordinated sequence. Challenges include handling different fabric colors, sizes, initial orientations (e.g., inside-out), and table surfaces under varying lighting.
        </p>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Failure (Trained on Original Robot Data Only)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, baseline failure: white table, purple shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/white_desk_purple_shirt_failure.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table, purple shirt</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, baseline failure: white table, blue shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/white_desk_blue_shirt_failure.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table, blue shirt</p>
          </div>
        </div>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Success (Trained with Optimal Ratio Augmented Data)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, success: wooden table, purple shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/brown_desk_purple_shirt_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">wooden table, purple shirt</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, success: wooden table, blue shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/brown_desk_blue_shirt_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">wooden table, blue shirt</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, success: wooden table, green shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/brown_desk_green_shirt_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">wooden table, green shirt</p>
          </div>
        </div>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, success: white table, purple shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/white_desk_purple_shirt_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table, purple shirt</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm cloth folding, success: white table, green shirt"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/fold_the_shirt/white_desk_green_shirt_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table, green shirt</p>
          </div>
        </div>
      </div>

            <!-- Make a sandwich -->
      <div class="box mt-6">
        <!-- Dual-Arm Sandwich Making: one baseline failure clip and one success clip.
             Each player carries preload="metadata" (a hint to fetch only metadata until
             playback is requested) and an aria-label that gives it an accessible name.
             Both players are cropped to 16:9 by their inline style. -->
        <h3 class="subtitle is-5">Task: Dual-Arm Sandwich Making</h3>

        <p class="is-size-7 has-text-grey mb-3" style="max-width: 680px; margin-left: auto; margin-right: auto; text-align: left;">
        Making a sandwich requires precise layering of deformable and slippery components — bread, meat, lettuce — under variable surface friction, lighting, and partial occlusion, without misalignment.
        </p>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Failure (Trained on Original Robot Data Only)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm sandwich making: baseline failure"
                   style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/make_a_sandwich/masked_failure_10x.mp4" type="video/mp4">
            </video>
            <!-- <p class="is-size-7 has-text-grey is-italic mb-2">bottle</p> -->
          </div>
        </div>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Success (Trained with Optimal Ratio Augmented Data)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Dual-arm sandwich making: success"
                   style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/make_a_sandwich/success_10x.mp4" type="video/mp4">
            </video>
            <!-- <p class="is-size-7 has-text-grey is-italic mb-2">bottle</p> -->
          </div>
        </div>
      </div>

      <!-- Pick up the toy (Single-Arm Robust Grasping).
           Two baseline failure clips, then four success clips split over two rows.
           Each player carries preload="metadata" (a hint to fetch only metadata until
           playback is requested) and an aria-label that gives it an accessible name.
           NOTE(review): cotton_success.mp4 is captioned "gray table" while
           cotton_success_1.mp4 is captioned "cotton-linen table"; confirm the captions. -->
      <div class="box mt-6">
        <h3 class="subtitle is-5">Task: Single-Arm Robust Grasping</h3>

        <p class="is-size-7 has-text-grey mb-3" style="max-width: 680px; margin-left: auto; margin-right: auto; text-align: left;">
           Beyond grasping, the robot must precisely place the toy into a cylindrical cup whose inner diameter is only slightly larger than the toy and similar in height—requiring sub-centimeter placement accuracy. Tested across diverse table materials and lighting.
        </p>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Failure (Trained on Original Robot Data Only)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Single-arm robust grasping, baseline failure 1 of 2: white table"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pick_up_the_toy/baseline/failure_0.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Single-arm robust grasping, baseline failure 2 of 2: white table"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pick_up_the_toy/baseline/failure_1.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table</p>
          </div>
        </div>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Success (Trained with Optimal Ratio Augmented Data)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Single-arm robust grasping, success: leather-covered table"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pick_up_the_toy/leather_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">leather-covered table</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Single-arm robust grasping, success: white table"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pick_up_the_toy/white_desk_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">white table</p>
          </div>
        </div>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Single-arm robust grasping, success: gray table"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pick_up_the_toy/cotton_success.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">gray table</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop preload="metadata"
                   aria-label="Single-arm robust grasping, success: cotton-linen table"
                   style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pick_up_the_toy/cotton_success_1.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">cotton-linen table</p>
          </div>
        </div>
      </div>

       <!-- Pack -->
      <div class="box mt-6">
        <h3 class="subtitle is-5">Task: Single-Arm Object Packing</h3>

        <p class="is-size-7 has-text-grey mb-3" style="max-width: 680px; margin-left: auto; margin-right: auto; text-align: left;">
        Packing objects into a container demands orientation-aware stacking and stability under real-world variation—texture, lighting, container shape—all without human correction.
        </p>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Failure (Trained on Original Robot Data Only)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pack/masked_bottle_failure_5x.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">bottle</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pack/masked_crisps_failure_5x.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">crisps</p>
          </div>
        </div>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Success (Trained with Optimal Ratio Augmented Data)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pack/masked_bottle_success_1.1x.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">bottle</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pack/masked_crisps_success_1x.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">crisps</p>
          </div>
        </div>
      </div>

       <!-- Pour -->
      <div class="box mt-6">
        <h3 class="subtitle is-5">Task: Single-Arm Controlled Pouring</h3>

        <p class="is-size-7 has-text-grey mb-3" style="max-width: 680px; margin-left: auto; margin-right: auto; text-align: left;">
        The robot performs liquid transfer from a source container to a target vessel without spillage.
        </p>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Failure (Trained on Original Robot Data Only)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pour_water/masked_failure_10x_spillage.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">spilled</p>
          </div>
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pour_water/masked_failure_10x_not_find.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">grasp failed</p>
          </div>
        </div>

        <h4 class="is-size-6 has-text-centered mt-4 mb-2">Success (Trained with Optimal Ratio Augmented Data)</h4>
        <div class="columns is-centered is-multiline">
          <div class="column is-one-third-desktop is-full-tablet has-text-centered">
            <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
              <source src="./012_results_Evaluation_on_the_Physical_Robot_Platform/pour_water/success_5x.mp4" type="video/mp4">
            </video>
            <!-- <p class="is-size-7 has-text-grey is-italic mb-2">bottle</p> -->
          </div>
        </div>
      </div>
      
    </div>
  </section>

  

  <!-- ==================== COMPARISONS ==================== -->
  <section class="section" id="comparison_with_baseline_container">
    <div class="container is-max-desktop">
      <h2 class="title is-3">Comparisons with previous methods</h2>
      <div class="content has-text-left is-size-6" style="max-width: 800px; margin-left: auto; margin-right: auto;">
  
        <p class="mb-4">
          Given the same input video, our method generates a <strong>coherent, multi-view consistent result</strong>.
        </p>

        <p>
          In contrast, existing approaches face significant limitations:
          <br>
          <strong>KlingAI</strong> (the current closed-source SOTA) generates views independently and employs a stitching strategy, often leading to geometric misalignment. 
          <strong>RoboEngine</strong>, as acknowledged in its limitations, <span class="is-italic">"does not handle temporal consistency between frames,"</span> resulting in noticeable background flickering. 
          Meanwhile, <strong>RoboTransfer</strong> relies heavily on depth maps and reference images yet still faces challenges with consistency.
        </p>

      </div>
  <div class="table-container" style="max-width: 900px; margin: 2rem auto; overflow-x: auto;">
    <table class="table is-bordered is-striped is-hoverable is-fullwidth" style="text-align: center; vertical-align: middle; font-size: 0.85rem !important; table-layout: auto;">
      <thead>
        <tr style="background-color: #f5f5f5; font-size: 1rem;"> 
          <th style="text-align: center; width: 15%; white-space: nowrap; padding: 14px 8px;">Method</th>
          <th style="text-align: center; padding: 14px 8px;">Input Requirement</th>
          <th style="text-align: center; padding: 14px 8px;">Multi-view Strategy</th>
          <th style="text-align: center; padding: 14px 8px;">Background Stability</th>
          <th style="text-align: center; padding: 14px 8px; white-space: nowrap;">Open Source</th>
          <th style="text-align: center; padding: 14px 8px; white-space: nowrap;">Overall Quality</th>
        </tr>
      </thead>
      <tbody>
        <tr style="background-color: #eef6fc; font-weight: bold; border: 2px solid #dcebf7;">
          <td style="vertical-align: middle; padding: 14px 6px;">Ours</td>
          <td style="vertical-align: middle; padding: 14px 6px; line-height: 1.4;">
              Prompt Only<br>
              <span style="font-weight:normal; font-size:0.85em; color: #666; display: inline-block;">(Reference-Free)</span>
          </td>
          <td style="vertical-align: middle; padding: 14px 6px;">Global Coherence</td>
          <td style="vertical-align: middle; padding: 14px 6px;">✅</td>
          <td style="vertical-align: middle; padding: 14px 6px;">✅</td>
          <td style="vertical-align: middle; padding: 14px 6px; letter-spacing: -1px;">⭐⭐⭐⭐⭐</td>
        </tr>
        
        <tr>
          <td style="vertical-align: middle; padding: 14px 6px; font-weight: 600;">KlingAI</td>
          <td style="vertical-align: middle; padding: 14px 6px; line-height: 1.4;">
              Prompt<br>
              <span style="font-size:0.85em; color: #888;">(Ref Image required)</span>
          </td>
          <td style="vertical-align: middle; padding: 14px 6px; line-height: 1.4;">
              Independent &amp; Stitched<br>
              <span style="font-size:0.85em; color: #888;">(Misaligned)</span>
          </td>
          <td style="vertical-align: middle; padding: 14px 6px;">✅</td>
          <td style="vertical-align: middle; padding: 14px 6px;">❌</td>
          <td style="vertical-align: middle; padding: 14px 6px; letter-spacing: -1px;">⭐⭐⭐⭐</td>
        </tr>
        
        <tr>
          <td style="vertical-align: middle; padding: 14px 6px; font-weight: 600;">RoboEngine</td>
          <td style="vertical-align: middle; padding: 14px 6px; line-height: 1.4;">
              Prompt Only<br>
              <span style="font-size:0.85em; color: #888;">(Independent frames)</span>
          </td>
          <td style="vertical-align: middle; padding: 14px 6px;">Independent &amp; Stitched</td>
          <td style="vertical-align: middle; padding: 14px 6px; line-height: 1.4;">
              ⚠️ Flickering<br>
          </td>
          <td style="vertical-align: middle; padding: 14px 6px;">✅</td>
          <td style="vertical-align: middle; padding: 14px 6px; letter-spacing: -1px;">⭐⭐</td>
        </tr>
        
        <tr>
          <td style="vertical-align: middle; padding: 14px 6px; font-weight: 600;">RoboTransfer</td>
          <td style="vertical-align: middle; padding: 14px 6px; line-height: 1.4;">Prompt + Depth + Ref</td>
          <td style="vertical-align: middle; padding: 14px 6px;">Depth-Guided</td>
          <td style="vertical-align: middle; padding: 14px 6px;">⚠️ Inconsistent</td>
          <td style="vertical-align: middle; padding: 14px 6px;">✅</td>
          <td style="vertical-align: middle; padding: 14px 6px; letter-spacing: -1px;">⭐⭐</td>
        </tr>
      </tbody>
    </table>
  </div>
      <div class="box mt-6">
        <div class="columns is-centered">
          <div class="column is-full has-text-centered">
            <p class="is-size-6 mb-2">Original Video</p>
            <video class="media-border" controls muted loop style="width: 100%; max-width: 680px;">
              <source src="./013_results_comparison/original_musk.mp4" type="video/mp4">
            </video>
          </div>
        </div>
        <div class="columns is-centered">
          <div class="column is-full has-text-centered">
            <p class="is-size-6 mb-2">Ours</p>
            <video class="media-border" controls muted loop style="width: 100%; max-width: 680px;">
              <source src="./013_results_comparison/ours_musk.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">Replace the background with green grass.</p>
          </div>
        </div>
        <div class="columns is-centered">
          <div class="column is-full has-text-centered">
            <p class="is-size-6 mb-2">KlingAI</p>
            <video class="media-border" controls muted loop style="width: 100%; max-width: 680px;">
              <source src="./013_results_comparison/keling/output.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic">KlingAI requires a reference image (refer_img) to avoid hallucinations, whereas our method is reference-free. <br> (The face shown here is AI-generated and thus unblurred.)</p>
            <div class="columns is-centered is-multiline">
              <div class="column is-one-third-desktop is-full-tablet has-text-centered">
                <img class="media-border" src="./013_results_comparison/keling/ref_img.png" alt="Reference image supplied to KlingAI" style="width: 100%; max-width: 340px; aspect-ratio: 16/9; object-fit: cover;">
                <p class="is-size-7 has-text-grey is-italic mb-2">refer_img</p>
              </div>
              <div class="column is-one-third-desktop is-full-tablet has-text-centered">
                <video class="media-border" controls muted loop style="width: 100%; max-width: 340px;aspect-ratio: 16/9; object-fit: cover;">
                  <source src="./013_results_comparison/keling/hallucination.mp4" type="video/mp4">
                </video>
                <p class="is-size-7 has-text-grey is-italic mb-2">without refer_img</p>
              </div>
            </div>
          </div>
        </div>
        <div class="columns is-centered">
          <div class="column is-full has-text-centered">
            <p class="is-size-6 mb-2">RoboEngine</p>
            <video class="media-border" controls muted loop style="width: 100%; max-width: 680px;">
              <source src="./013_results_comparison/roboengine.mp4" type="video/mp4">
            </video>
          </div>
        </div>
        <div class="columns is-centered">
          <div class="column is-full has-text-centered">
            <p class="is-size-6 mb-2">RoboTransfer</p>
            <video class="media-border" controls muted loop style="width: 100%; max-width: 680px;">
              <source src="./013_results_comparison/robotransfer_0_musk.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic">refer_img: gray grid tablecloth</p>
          </div>
        </div>
        <div class="columns is-centered">
          <div class="column is-full has-text-centered">
            <p class="is-size-6 mb-2">RoboTransfer</p>
            <video class="media-border" controls muted loop style="width: 100%; max-width: 680px;">
              <source src="./013_results_comparison/robotransfer_1_musk.mp4" type="video/mp4">
            </video>
            <p class="is-size-7 has-text-grey is-italic mb-2">refer_img: our “grass” output, first frame</p>
          </div>
        </div>
      </div>
    </div>
  </section>
</div>


<script>
  // Hover preview: while the pointer is over `.video-container`, pause the
  // first <video> in the document and reveal the `.hover-image` / `.caption`
  // overlays; restore playback and hide the overlays when the pointer leaves.
  const videoContainer = document.querySelector('.video-container');
  const video = document.querySelector('video');
  const hoverImage = document.querySelector('.hover-image');
  const caption = document.querySelector('.caption');

  // querySelector returns null when nothing matches. Calling addEventListener
  // on null throws at top level and would abort everything below in this
  // <script>, so only wire the handlers when the container actually exists.
  if (videoContainer) {
    // Show or hide whichever overlay elements are present on this page.
    const setOverlayDisplay = (value) => {
      if (hoverImage) {
        hoverImage.style.display = value;
      }
      if (caption) {
        caption.style.display = value;
      }
    };

    videoContainer.addEventListener('mouseenter', () => {
      if (video) {
        video.pause();
      }
      setOverlayDisplay('block');
    });

    videoContainer.addEventListener('mouseleave', () => {
      if (video) {
        video.play();
      }
      setOverlayDisplay('none');
    });
  }

  // for nerfies template
  // Reuse an existing global `dataLayer` array if one was already created,
  // otherwise start with an empty one.
  window.dataLayer = window.dataLayer || [];

  // Appends the call's raw `arguments` object (not a copied array) to
  // `dataLayer`; the exact form is kept as-is.
  function gtag() {
    dataLayer.push(arguments);
  }

  // Queue a 'js' entry carrying the current timestamp.
  gtag('js', new Date());

  // Queue a 'config' entry for the ID below.
  // NOTE(review): presumably a Google Analytics measurement ID. No gtag.js
  // loader is visible in the part of the file reviewed, and the page is
  // labelled an anonymous submission; confirm whether tracking is intended.
  gtag('config', 'G-PYVRSFMDRL');





  // for carousel
  // Once the DOM is parsed, size every `.video-wrapper` to the aspect ratio of
  // its `.default-video`, and swap playback between the default and hover
  // videos while the pointer is over the wrapper.
  window.addEventListener('DOMContentLoaded', () => {
    const videoWrappers = document.querySelectorAll('.video-wrapper');

    videoWrappers.forEach(wrapper => {
      const defaultVideo = wrapper.querySelector('.default-video');
      // A wrapper without a default video has nothing to size or toggle.
      if (!defaultVideo) {
        return;
      }

      // The handlers below previously referenced `hoverVideo` without it ever
      // being declared, which raised a ReferenceError on every hover.
      // NOTE(review): the `.hover-video` class name is assumed by analogy with
      // `.default-video`; confirm against the markup.
      const hoverVideo = wrapper.querySelector('.hover-video');

      // Set the wrapper height from the video's intrinsic aspect ratio.
      const applyHeight = () => {
        // videoWidth/videoHeight are 0 until metadata is available; dividing
        // them would yield NaN and an invalid "NaNpx" height.
        if (!defaultVideo.videoWidth || !defaultVideo.videoHeight) {
          return;
        }
        const aspectRatio = defaultVideo.videoWidth / defaultVideo.videoHeight;
        const height = wrapper.offsetWidth / aspectRatio;
        wrapper.style.height = `${height}px`;
      };

      // Metadata is usually not loaded yet at DOMContentLoaded, so defer the
      // measurement until the browser reports the intrinsic dimensions.
      if (defaultVideo.readyState >= HTMLMediaElement.HAVE_METADATA) {
        applyHeight();
      } else {
        defaultVideo.addEventListener('loadedmetadata', applyHeight, { once: true });
      }

      wrapper.addEventListener('mouseenter', () => {
        defaultVideo.pause();
        if (hoverVideo) {
          hoverVideo.play();
        }
      });

      wrapper.addEventListener('mouseleave', () => {
        defaultVideo.play();
        if (hoverVideo) {
          hoverVideo.pause();
        }
      });
    });
  });
      // Carousel paging: a fixed-size window of `.carousel .item` elements is
      // marked `active`. Each click on `.carousel` moves the window forward by
      // one item; when the window would run past the last item it wraps back
      // to the start.
      $(function () {
        var items = $('.carousel .item');
        var total = items.length;
        var windowSize = 5;
        var start = 0;

        // Clear every `active` mark, then mark the items in the current window.
        function showWindow() {
          items.removeClass('active');
          items.slice(start, start + windowSize).addClass('active');
        }

        $('.carousel').on('click', function () {
          start += 1;
          if (start + windowSize > total) {
            start = 0;
          }
          showWindow();
        });

        // Initial state: mark the first window without clearing anything.
        items.slice(start, start + windowSize).addClass('active');
      });
</script>
