<!DOCTYPE html>
<html lang="en">
<!-- lang declares the page language so screen readers and translation tools handle the English text correctly. -->
<head>
  <meta charset="utf-8">
  <!-- Search-engine and social-media preview metadata. These values are what link previews display, so they describe the paper rather than the template. -->
  <meta name="description" content="SingRef6D is a lightweight pipeline for monocular 6D pose estimation of novel objects that needs only a single RGB reference image.">
  <meta property="og:title" content="SingRef6D: Monocular Novel Object Pose Estimation with a Single RGB Reference">
  <meta property="og:description" content="A lightweight 6D pose estimation pipeline that uses a single RGB reference image, a fine-tuned Depth-Anything v2 depth model, and depth-aware LoFTR matching.">
  <!-- TODO: replace with the absolute public URL of this page once it is deployed. -->
  <meta property="og:url" content="URL OF THE WEBSITE">
  <!-- Path to the banner image. Optimal dimensions are 1200x630. Crawlers generally expect an absolute URL here; switch to one when the public URL is known. -->
  <meta property="og:image" content="static/images/Pipeline_latest3.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">

  <meta name="twitter:title" content="SingRef6D: Monocular Novel Object Pose Estimation with a Single RGB Reference">
  <meta name="twitter:description" content="A lightweight 6D pose estimation pipeline that uses a single RGB reference image, a fine-tuned Depth-Anything v2 depth model, and depth-aware LoFTR matching.">
  <!-- Reuses the Open Graph banner instead of the template placeholder path. NOTE(review): confirm the file exists; optimal dimensions are 1200x600. -->
  <meta name="twitter:image" content="static/images/Pipeline_latest3.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for the paper to be indexed by. -->
  <meta name="keywords" content="SingRef6D, 6D pose estimation, novel object pose estimation, monocular depth estimation, single RGB reference, Depth-Anything v2, LoFTR">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <title>Static Page of SingRef6D</title>
  <!-- The favicon file is a PNG, so the declared MIME type is image/png rather than image/x-icon. -->
  <link rel="icon" type="image/png" href="static/images/myicon.png">
  <!-- Web fonts, then Bulma and its carousel/slider plugins, Font Awesome icons, and finally the page-specific overrides. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
  rel="stylesheet">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <!-- Academicons is only needed by the disabled arXiv button; re-enable it together with that button. -->
  <!-- <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"> -->
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Script order matters: jQuery and the Bulma plugins load before index.js, which is loaded last among the local scripts. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
  <!-- The polyfill.io script was removed: that domain served malicious code in the 2024 supply-chain incident, and browsers that support ES6 natively do not need the polyfill. -->
  <!-- MathJax renders the \( ... \) inline formulas used in the captions and the abstract. -->
  <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
</head>
<body>


  <!-- Page header: paper title, author line, and the resource links (all link buttons are currently disabled). -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">SingRef6D: Monocular Novel Object Pose Estimation with a Single RGB Reference</h1>
            <div class="is-size-5 publication-authors">
              <!-- Paper authors. Plain text with no link: the previous stray closing anchor tag had no matching opening tag. -->
              <span class="author-block">Anonymous Submission</span>
              <!-- <span class="author-block">
                <a href="SECOND AUTHOR PERSONAL LINK" target="_blank">Second Author</a><sup>*</sup>,</span>
              <span class="author-block">
                <a href="THIRD AUTHOR PERSONAL LINK" target="_blank">Third Author</a>
              </span> -->
            </div>

            <!-- <div class="is-size-5 publication-authors">
              <span class="author-block">Institution Name<br>Conference name and year</span>
              <span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
            </div> -->

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- Arxiv PDF link -->
                <!-- <span class="link-block">
                  <a href="https://arxiv.org/pdf/<ARXIV PAPER ID>.pdf" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span> -->

                <!-- Supplementary PDF link -->
                <!-- <span class="link-block">
                  <a href="supplementary_material.pdf" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Supplementary PDF</span>
                  </a>
                </span> -->

                <!-- OpenReview link -->
                <!-- <span class="link-block">
                  <a href="https://openreview.net/" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <img src="static/images/openreveiwicon.png" alt="">
                    </span>
                    <span>Back to OpenReview</span>
                  </a>
                </span> -->

                <!-- ArXiv abstract link (needs the academicons stylesheet for the icon) -->
                <!-- <span class="link-block">
                  <a href="https://arxiv.org/abs/<ARXIV PAPER ID>" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span> -->
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

<!-- Banner Image, Adjustable size -->
<!-- <section class="hero is-small" style="margin-top: -50px;">
  <div class="hero-body">
    <div class="container">
      <img src="/podd/static/images/cover.png" alt="Pipeline" class="center-image" style="max-width: 70%; height: auto;">
      <h2 class="subtitle">
        Visualized pipeline for inference (\(\textbf{bottom}\)), the details of our depth model, matching process, and highlights (\(\textbf{top}\)). During inference, a pre-trained DINOv2 RGB encoder is utilized to extract the multi-stage features. A fine-tuned fusion network is then adopted to fuse the features in a top-to-bottom manner. Subsequently, an enhanced LoFTR matcher takes the RGB images and predicted depth maps as inputs and outputs correspondence by conducting feature-level matching. Then, the relative pose \(\mathbf{T}_{q\rightarrow r}\) can be solved with a point cloud registration model. Finally, the 6D pose for the query object can be calculated with \(\mathbf{T}_{q}^{-1} = \mathbf{T}_{r}^{-1} \mathbf{T}_{q\rightarrow r}\). 
      </h2>
    </div>
  </div>
</section> -->
<!-- End Banner Image -->

<!-- Teaser video -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- Muted so that browsers permit autoplay; playsinline keeps iPhone Safari from forcing fullscreen playback. The empty poster attribute was dropped because it named no image. -->
      <video id="tree" autoplay controls muted loop playsinline height="80%">
        <source src="static/videos/teaser.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>
<!-- End teaser video -->

<!-- Pipeline overview figure with its caption -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- The alt text summarizes the figure layout for readers who cannot see the image. -->
      <img src="static/images/Pipeline.png" alt="SingRef6D pipeline overview: the inference flow at the top, and details of the depth model, the matching process, and highlights at the bottom" class="center-image" style="max-width: 100%; height: auto;">
      <!-- The \( ... \) spans in the caption are typeset by MathJax. -->
      <h2 class="subtitle has-text-centered">
        Visualized pipeline for inference (<b>top</b>), the details of our depth model, matching process, and highlights (<b>bottom</b>).
        During inference, our fine-tuned depth model first estimates the metric depth accurately, which can deal with challenging surfaces.
        Subsequently, the proposed depth-aware matching utilizes depth value as spatial cues to establish correspondences even in low-textured regions.
        Then, the relative pose \(\mathbf{T}_{q\rightarrow r}\) can be solved with a point cloud registration model.
        Finally, the 6D pose for the query object can be calculated with \(\mathbf{T}_{q}^{-1} = \mathbf{T}_{r}^{-1} \mathbf{T}_{q\rightarrow r}\).
      </h2>
    </div>
  </div>
</section>
<!-- End pipeline overview figure -->

<!-- Paper abstract: a light-background section holding the heading and a single justified paragraph. -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <!-- Centered column layout; the text column uses the is-four-fifths width class. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <!-- The paragraph contains one \( ... \) inline math span. -->
        <div class="content has-text-justified">
          <p>
            Recent 6D pose estimation methods demonstrate notable performance but still face some practical limitations. 
            For instance, many of them rely heavily on sensor depth, which may fail with challenging surface conditions, such as transparent or highly reflective materials. 
            In the meantime, RGB-based solutions provide less robust matching performance in low-light and texture-less scenes due to the lack of geometry information. 
            Motivated by these, we propose <b>SingRef6D</b>, a lightweight pipeline requiring only a <b>single RGB</b> image as a reference, 
            eliminating the need for costly depth sensors, multi-view image acquisition, or training view synthesis models and neural fields. 
            This enables SingRef6D to remain robust and capable even under resource-limited settings where depth or dense templates are unavailable.
            Our framework incorporates two key innovations. 
            First, we propose a token-scaler-based fine-tuning mechanism with a novel optimization loss on top of Depth-Anything v2 to enhance its ability to predict accurate depth, 
            even for challenging surfaces. Our results show a 14.41% improvement (in \(\delta_{1.05}\)) on 
            REAL275 depth prediction compared to Depth-Anything v2 (with fine-tuned head). Second, benefiting from depth availability, 
            we introduce a depth-aware matching process that effectively integrates spatial relationships within LoFTR, 
            enabling our system to handle matching for challenging materials and lighting conditions. 
            Evaluations of pose estimation on the REAL275, ClearPose, and Toyota-Light datasets show that our approach surpasses state-of-the-art methods, 
            achieving a 6.1% improvement in average recall.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Image carousel -->
<!-- <section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div id="results-carousel" class="carousel results-carousel">
       <div class="item">
        <img src="static/images/carousel1.jpg" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
          First image description.
        </h2>
      </div>
      <div class="item">
        <img src="static/images/carousel2.jpg" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
          Second image description.
        </h2>
      </div>
      <div class="item">
        <img src="static/images/carousel3.jpg" alt="MY ALT TEXT"/>
        <h2 class="subtitle has-text-centered">
         Third image description.
       </h2>
     </div>
     <div class="item">
      <img src="static/images/carousel4.jpg" alt="MY ALT TEXT"/>
      <h2 class="subtitle has-text-centered">
        Fourth image description.
      </h2>
    </div>
  </div>
</div>
</div>
</section> -->
<!-- End image carousel -->




<!-- Youtube video -->

<!-- End youtube video -->



<!-- Depth-prediction demo: a carousel of four looping videos, each with a caption. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3">Demo Video of Depth Prediction</h2>
      <div id="results-carousel" class="carousel results-carousel">
        <!-- Every video is muted so browsers permit autoplay, and playsinline keeps iPhone Safari from forcing fullscreen playback. -->
        <div class="item item-video1">
          <video id="video1" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_light2.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The depth prediction result in an artificial light environment (consumer level depth sensor: iPhone 12 ProMax).
          </h2>
        </div>
        <div class="item item-video2">
          <video id="video2" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_light1.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The depth prediction result in a natural light environment (consumer level depth sensor: iPhone 12 ProMax).
          </h2>
        </div>
        <div class="item item-video3">
          <video id="video3" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_depth_real275.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The depth prediction result of scenes in the REAL275 dataset (industrial level depth sensor: RealSense).
          </h2>
        </div>
        <div class="item item-video4">
          <video id="video4" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_depth_clearpose.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The depth prediction result of scenes in the ClearPose dataset (industrial level depth sensor: RealSense).
          </h2>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Pose-estimation demo: a carousel of four looping videos, each with a caption. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3">Demo Video of Pose Estimation</h2>
      <!-- The ids in this carousel carry a "pose" prefix so they no longer duplicate the ids of the depth-prediction carousel; an id must be unique within a document. The shared classes are unchanged. -->
      <!-- NOTE(review): confirm that index.js and index.css select these elements by class, not by the old ids. -->
      <div id="pose-results-carousel" class="carousel results-carousel">
        <!-- Every video is muted so browsers permit autoplay, and playsinline keeps iPhone Safari from forcing fullscreen playback. -->
        <div class="item item-video1">
          <video id="pose-video1" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_pose_light2.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The pose estimation result in an artificial light environment (consumer level depth sensor: iPhone 12 ProMax).
          </h2>
        </div>
        <div class="item item-video2">
          <video id="pose-video2" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_pose_l1.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The pose estimation result in a natural light environment (consumer level depth sensor: iPhone 12 ProMax).
          </h2>
        </div>
        <div class="item item-video3">
          <video id="pose-video3" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_pose_real275.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The pose estimation result of scenes in the REAL275 dataset (industrial level depth sensor: RealSense).
          </h2>
        </div>
        <div class="item item-video4">
          <video id="pose-video4" autoplay controls muted loop playsinline height="100%">
            <source src="static/videos/output_combined_pose_clearpose.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">
            The pose estimation result of scenes in the ClearPose dataset (industrial level depth sensor: RealSense).
          </h2>
        </div>
      </div>
    </div>
  </div>
</section>




<!-- Paper poster -->
<!-- <section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title">Combined PDF (Main Body and Supplementary Material)</h2>

      <iframe  src="static/pdfs/combine.pdf" width="100%" height="550">
          </iframe>
        
      </div>
    </div>
  </section> -->
<!--End paper poster -->


<!--BibTex citation -->

<!--End BibTex citation -->


  <!-- Page footer: template attribution and the content license. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <!-- Links open in a new tab, so each carries rel="noopener noreferrer"; the license link uses https. -->
            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener noreferrer">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener noreferrer">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener noreferrer" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>
