<!DOCTYPE html>
<!-- lang lets screen readers, translators and search engines know the page language -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners; fill these in appropriately, as they are the page's "business card". -->
  <!-- TODO: og:url and the two banner image paths below are still template placeholders; replace them before publishing. -->
  <meta name="description" content="CityNav is a dataset for language-goal aerial navigation: 32,637 natural language descriptions paired with human demonstration trajectories in 3D point clouds of real-world cities.">
  <meta property="og:title" content="CityNav: Language-Goal Aerial Navigation Dataset with Geographic Information">
  <meta property="og:description" content="CityNav is a dataset for language-goal aerial navigation: 32,637 natural language descriptions paired with human demonstration trajectories in 3D point clouds of real-world cities.">
  <meta property="og:url" content="URL OF THE WEBSITE">
  <!-- Path to the banner image; it should live at the path listed below. Optimal dimensions are 1200x630. -->
  <meta property="og:image" content="static/images/your_banner_image.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">


  <meta name="twitter:title" content="CityNav: Language-Goal Aerial Navigation Dataset with Geographic Information">
  <meta name="twitter:description" content="CityNav is a dataset for language-goal aerial navigation: 32,637 natural language descriptions paired with human demonstration trajectories in 3D point clouds of real-world cities.">
  <!-- Path to the banner image; it should live at the path listed below. Optimal dimensions are 1200x600. -->
  <meta name="twitter:image" content="static/images/your_twitter_banner_image.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for the paper to be indexed by -->
  <meta name="keywords" content="CityNav, aerial navigation, vision-and-language navigation, language-goal navigation, 3D point cloud, dataset">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <title>CityNav: Language-Goal Aerial Navigation Dataset with Geographic Information</title>
  <link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
  rel="stylesheet">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Script order matters: jQuery and the Bulma plugins are loaded before index.js -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


<!-- Title, authors and publication links -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h1 class="title is-2 publication-title">CityNav: Language-Goal Aerial Navigation Dataset<br>with Geographic Information</h1>
        <div class="is-size-5 publication-authors">
          <!-- Paper authors -->
          <span class="author-block">
            <a>Anonymous</a>
          </span>
        </div>

        <div class="is-size-5 publication-authors">
        </div>

        <div class="column has-text-centered">
          <div class="publication-links">
            <!-- Arxiv PDF link -->
            <!-- TODO: add the paper URL as href. Until then the anchor is left without
                 an href (like the Code button below): an empty href="" resolves to this
                 page itself and would open a copy of it in a new tab. -->
            <span class="link-block">
              <a target="_blank" class="external-link button is-normal is-rounded is-dark">
                <span class="icon">
                  <i class="fas fa-file-pdf"></i>
                </span>
                <span>Paper</span>
              </a>
            </span>

            &nbsp;&nbsp;&nbsp;&nbsp;

            <!-- Github link -->
            <!-- TODO: add the repository URL as href -->
            <span class="link-block">
              <a target="_blank"
              class="external-link button is-normal is-rounded is-dark">
                <span class="icon">
                  <i class="fab fa-github"></i>
                </span>
              <span>Code</span>
              </a>
            </span>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Teaser figure -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <div class="content has-text-justified">
          <p><b>TL;DR</b>: CityNav is a dataset for vision-and-language aerial navigation 
            that consists of human-generated trajectories paired with  descriptions on real-world 3D cities.</p>
            <p class="new-line"></p>
          <img src="static/figures/teaser.png" alt="teaser" class="blend-img-background center-image" style="width: 100%; height: auto;">
        </div>
      </div>
    </div>
  </div>
</section>

<hr class="container is-max-desktop separator">

<!-- Paper abstract -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Overview</h2>
        <div class="content has-text-justified">
          <p>
            We introduce CityNav, a new dataset for language-goal aerial navigation using a 3D point cloud representation of real-world cities. 
            CityNav includes 32,637 natural language descriptions paired with human demonstration trajectories, 
            collected from participants via a new web-based 3D simulator developed for this research. 
            Each description specifies a navigation goal, leveraging the names and locations of landmarks within the real-world cities. 
            We also provide baseline models of navigation agents that incorporate an internal 2D spatial map representing landmarks referenced in the descriptions.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->

<hr class="container is-max-desktop separator">

<!-- Aerial navigation task: demo video followed by a short task description -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Aerial Navigation Task</h2>
        <div class="content has-text-justified">
          <!-- Autoplays muted and loops; playback controls stay available to the viewer -->
          <video poster="" id="tree" autoplay controls muted loop>
            <source src="static/videos/demo_movie_citynav.mp4"
            type="video/mp4">
          </video>
          <!-- Empty paragraph used as a spacer between the video and the text -->
          <p class="new-line"></p>
          <p>
            The aerial agent is randomly spawned in the city and must locate the target object 
            corresponding to a given linguistic description, using the agent's first-person view images and geographic information.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<hr class="container is-max-desktop separator">

<!-- Demonstration video of the web-based flight simulator -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Web-based 3D Flight Simulator</h2>
        <div class="content has-text-justified">
          <!-- The id must be document-unique: "tree" is already used by the
               navigation-task video earlier on the page.
               NOTE(review): confirm nothing in index.css / index.js targets #tree. -->
          <video poster="" id="simulator-demo" autoplay controls muted loop>
            <source src="static/videos/citynav_web_demo_mid_quality.mp4"
            type="video/mp4">
          </video>
          <!-- Empty paragraph used as a spacer between the video and the text -->
          <p class="new-line"></p>
          <p>
            To collect trajectory data via the web, we developed a web-based flight simulator 
            that allows users to operate an aerial agent within 3D environments.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<hr class="container is-max-desktop separator">

<!-- Dataset Statistics -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Dataset Statistics</h2>
        <div class="content has-text-justified">
          <img src="static/figures/data_statistics.png" alt="statistics of our proposed dataset" class="blend-img-background center-image">
          <p class="new-line"></p>
          <p>
            <strong>Dataset Statistics:</strong> (a) summarizes the statistics of the number of scenes and trajectories for each set, 
            (b) illustrates the distributions for the length of collected trajectories, 
            (c) illustrates the distributions for the description length corresponding to the trajectories, 
            (d) shows the distance distribution of eval splits from the starting point to the goal, 
            (e) shows the episode length of both the shortest path and human demonstration trajectories, 
            and (f) shows action histograms for the shortest path and human demonstration.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<hr class="container is-max-desktop separator">

<!-- Method -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Map-based Goal Predictor</h2>
        <div class="content has-text-justified">
          <img src="static/figures/method.png" alt="our proposed method" class="blend-img-background center-image">
          <p class="new-line"></p>
          <p>
            Map-based Goal Predictor (MGP) is our proposed model that combines state-of-the-art off-the-shelf models to perform map-based goal prediction.
            It utilizes navigation maps generated at each time step through the following three steps: (i) target, landmark, and surroundings name extraction by GPT-3.5 Turbo, 
            (ii) object detection and segmentation by GroundingDINO and Mobile-SAM, and (iii) optional coordinate refinement by LLaVA-1.6-34b using set-of-mark prompting.
            A map encoder, using a navigation map that includes a landmark map, view & explore area maps, and target & surroundings maps, is trained alongside the RGB and depth encoders of Cross-Modal Attention.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<hr class="container is-max-desktop separator">


<!-- Aerial navigation results (video slider). The section stays open here so that it
     wraps the slider markup; its closing tag follows the dots container below. -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Aerial Navigation Results</h2>
      <div class="custom_slider w-slider" data-autoplay="true" data-duration="800">
        <div class="mask w-slider-mask" id="w-slider-mask-0">
          <div class="slide w-slide" aria-label="1 of 9" role="group">
            <div class="div-block-9 first_video">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_1.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>

          <div class="slide w-slide" aria-label="2 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_2.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>

          <div class="slide w-slide" aria-label="3 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_3.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
          <div class="slide w-slide" aria-label="4 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_4.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
          <div class="slide w-slide" aria-label="5 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_5.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
          <div class="slide w-slide" aria-label="6 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_6.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
          <div class="slide w-slide" aria-label="7 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_7.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
          <div class="slide w-slide" aria-label="8 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_8.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
          <div class="slide w-slide" aria-label="9 of 9" role="group">
            <div class="div-block-9">
              <div class="video_class w-embed">
                <video width="100%" height="auto" autoplay muted controls loop preload="metadata">
                  <source src="static/videos/qualitative_9.mp4" type="video/mp4">
                  Your browser does not support the video tag.
                </video>
              </div>
            </div>
          </div>
        </div>
        <div class="arrow left-arrow" onclick="previousSlide()">&#9664;</div>
        <div class="arrow right-arrow" onclick="nextSlide()">&#9654;</div>
      </div>
    </div>
  </div>
  <div class="dots-container is-max-desktop"></div>
</div>
</section>


<!--BibTex citation -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>
      @misc{citynav,
        title={CityNav: Language-Goal Aerial Navigation Dataset with Geographic Information}, 
        author={Anonymous},
        year={2024},
      }
    </code></pre>
  </div>
</section>

<script>
  // After the DOM has been parsed, hand the '.carousel1' selector to
  // bulmaCarousel with automatic page switching turned off.
  document.addEventListener('DOMContentLoaded', () => {
    const carouselOptions = {
      autoplay: false, // Disable the automatic page switch
    };
    bulmaCarousel.attach('.carousel1', carouselOptions);
  });
</script>

<script>
  // Results slider: one slide is shown at a time by translating the mask
  // element horizontally; a row of dots mirrors and controls the position.
  // nextSlide() / previousSlide() stay global because the arrow elements call
  // them from inline onclick attributes.
  let currentSlide = 0; // zero-based index of the slide currently shown
  const slides = document.querySelectorAll('.w-slide');
  const totalSlides = slides.length;

  const sliderMask = document.getElementById('w-slider-mask-0');

  const SLIDE_AUTOPLAY_MS = 10000; // 10000ms = 10s
  let slideAutoplayTimer = null;   // handle of the running autoplay interval

  // Shows the slide at `index`, wrapping around in both directions.
  // Does nothing when the page has no slides (avoids `% 0` producing NaN).
  function goToSlide(index) {
    if (totalSlides === 0) {
      return;
    }
    currentSlide = ((index % totalSlides) + totalSlides) % totalSlides;
    updateSlidePosition();
  }

  // (Re)starts the autoplay countdown. Called after every manual navigation so
  // the slider does not advance again right after the user picked a slide.
  function restartSlideAutoplay() {
    if (slideAutoplayTimer !== null) {
      clearInterval(slideAutoplayTimer);
      slideAutoplayTimer = null;
    }
    if (totalSlides > 1) {
      slideAutoplayTimer = setInterval(() => {
        goToSlide(currentSlide + 1);
      }, SLIDE_AUTOPLAY_MS);
    }
  }

  // Manual "next" (right arrow): advance one slide and reset the countdown.
  function nextSlide() {
    goToSlide(currentSlide + 1);
    restartSlideAutoplay();
  }

  // Manual "previous" (left arrow): go back one slide and reset the countdown.
  function previousSlide() {
    goToSlide(currentSlide - 1);
    restartSlideAutoplay();
  }

  // Moves the mask so that `currentSlide` is in view, then syncs the dots.
  function updateSlidePosition() {
    if (sliderMask) {
      // Each slide is shifted by one full mask width per index.
      sliderMask.style.transform = `translateX(-${currentSlide * 100}%)`;
    }

    updateDots();
  }

  // Builds one clickable dot per slide inside '.dots-container'.
  function createDots() {
    const dotsContainer = document.querySelector('.dots-container');
    if (!dotsContainer) {
      return;
    }
    for (let i = 0; i < totalSlides; i++) {
      const dot = document.createElement('div');
      dot.classList.add('dot');
      dot.addEventListener('click', () => {
        goToSlide(i);
        restartSlideAutoplay();
      });
      dotsContainer.appendChild(dot);
    }
  }

  // Marks the dot that matches `currentSlide` with the 'active' class.
  function updateDots() {
    const dots = document.querySelectorAll('.dot');
    dots.forEach((dot, index) => {
      dot.classList.toggle('active', index === currentSlide);
    });
  }

  createDots();
  updateDots();
  restartSlideAutoplay();
</script>


<style>
  /* Flex wrapper that centers its children on both axes. */
  .container-wrapper {
    align-items: center;
    display: flex;
    justify-content: center;
    /* min-height: 100vh; */
  }

  /* Half the width of the containing block. */
  .custom-width { width: 50%; }

  /* 120% of the containing block in both dimensions. */
  .enlarged-image {
    height: 120%;
    width: 120%;
  }
</style>

<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">

          <p>
            This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a> which was adapted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
            <br> This website is licensed under a <a rel="license"  href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>

        </div>
      </div>
    </div>
  </div>
</footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>
