<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description" content="SVG: 3D Stereoscopic Video Generation via Denoising Frame Matrix">
  <meta name="keywords" content="SVG, 3D video, stereoscopic video">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>SVG: 3D Stereoscopic Video Generation via Denoising Frame Matrix</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
<!--   <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');

  </script> -->
  <style>
  .hr {width: 100%; height: 1px; margin: 48px 0; background-color: #d6dbdf;}
  </style>
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./assets/css/bulma.min.css">
  <link rel="stylesheet" href="./assets/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./assets/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./assets/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="./assets/css/index.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  
  <!-- custom css file  -->
  <link rel="stylesheet" href="./assets/css/style.css">
  <link rel="stylesheet" href="./assets/css/twentytwenty.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./assets/js/fontawesome.all.min.js"></script>
</head>
<body>


  <nav class="navbar" role="navigation" aria-label="main navigation" style="position: fixed; top: 0; width: 100%;">
    <div class="navbar-brand">
      <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
      </a>
    </div>
    <div class="navbar-menu">
      <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
        <a class="navbar-item" href="">
          <span class="icon">
              <i class="fas fa-home"></i>
          </span>
          </a>
  
        <div class="navbar-item has-dropdown is-hoverable">
          <a class="navbar-link">
            Table of Contents
          </a>
          <div class="navbar-dropdown">
            <a class="navbar-item" href="#abstract">
              Abstract
            </a>
            <a class="navbar-item" href="#method-overview">
              Method overview
            </a>
            <a class="navbar-item" href="#ablation">
              Ablation study
            </a>
            <a class="navbar-item" href="#comparison">
              Comparison with baselines
            </a>
            <a class="navbar-item" href="#traj_results">
              Video trajectories
            </a>
            <a class="navbar-item" href="#3d-videos">
              3D stereoscopic videos
            </a>
            
          </div>
        </div>
      </div>
  
    </div>
  </nav>

<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop head">
      <div class="columns is-centered">
        <div class="column has-text-centered head">
          <h1 class="title is-2 publication-title"> 
            SVG: 3D Stereoscopic Video Generation via Denoising Frame Matrix
          </h1>

          <div class="is-size-5 publication-authors">
            <span class="author-block">
              Anonymous authors
            </span>
          </div>

        </div>
      </div>
    </div>
  </div>
</section>


<!-- first row -->

<section  class="hero is-light is-small" id="abstract" >
  
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
      <br>
        <h2 class="title is-3 has-text-centered">Abstract</h2>
        <div class="column has-text-justified">
          <p>Generative video models have demonstrated great capability of producing impressive 2D monocular videos; however, the question of generating 3D stereoscopic video 
            remains under-explored. We propose a pose-free and training-free approach for generating 3D stereoscopic videos using an off-the-shelf monocular video generation model. 
            Our method warps a generated monocular video into camera views on a stereoscopic baseline using estimated video depth, and employs a novel <i>frame matrix</i> video inpainting framework. 
            Our frame matrix framework leverages the video generation model to inpaint frames observed at different timestamps and from different views. 
            This effective approach generates consistent and semantically coherent stereoscopic videos without scene optimization or model fine-tuning. 
            Moreover, we develop an elaborate update scheme that further improves the quality of video inpainting by alleviating the negative effects 
            propagated from disoccluded areas in the latent space. We validate the efficacy of our proposed method by conducting experiments on videos 
            from various generative models, including <a href="https://openai.com/index/sora/">Sora</a>, <a href="https://lumiere-video.github.io/">Lumiere</a>,
            <a href="https://walt-video-diffusion.github.io/">WALT</a>, and <a href="https://zeroscope.replicate.dev/">Zeroscope</a>. 
            The experiments demonstrate that our method achieves a significant improvement over previous methods. Code will be released.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>




<section class="section" id="method-overview">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-3 has-text-centered">Method Overview</h2>
        <div class="column has-text-justified">
          <img src="./results1/pipeline.png" width="100%" alt="Overview of the SVG pipeline and the frame matrix denoising procedure">
          <br>
          <br>
          <p>Given a text prompt, our method first uses a video generation model to generate a monocular video, which is warped (using estimated depth) into pre-defined camera views to
            form a frame matrix with disocclusion masks. Then, the disoccluded regions are inpainted by denoising the frame sequences within the frame matrix. After denoising, we select
            the leftmost and the rightmost columns and decode them to obtain a 3D stereoscopic video.
            
            Bottom: Details of denoising frame matrix. We initialize the latent matrix as a random noise map. For each noise level, we extend the resampling mechanism [16,26] to alternately
            denoise <span style="background-color: hwb(12 16% 15%);">temporal (column) sequences</span> and <span style="background-color: #F1CCB1;">spatial (row) sequences</span>  N times. Each time, row or column sequences are denoised and inpainted. By denoising along both spatial and
            temporal directions, we obtain an inpainted latent which can be decoded into temporally smooth and semantically consistent sequences.</p>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section" >
  <br>
</section>

<section id="ablation" class="section">
    <div class="container is-max-desktop">
        <div class="columns is-centered has-text-centered">
            <h2 class="title is-3">Ablation study</h2>
        </div>
        
        <center> 
          <p>
            Consistent and smooth inpainting is the cornerstone of converting monocular videos into 3D stereoscopic videos. <br/>
            <br/>Frame matrix helps generate semantically reasonable content and benefits consistency across different views.
            <br/><br/>Disocclusion boundary re-injection helps to reduce negative effects of inpainted pixels, and ensures high-quality generation.
            <!-- <br/><br/>Without handling warping-based artifacts, the results contains isolated points and cracks in the first image. -->
          
          </p>
        </center>
        
  

        <!-- Method Buttons -->
        <div class="columns is-centered has-text-centered">
          <p> <br>
            Here we show the effect of the components we introduce in our system →
          </p>
        </div>
        <div class="buttons has-addons is-centered">
          <button class="button" onclick="updateVideos2('base')">Without depth smoothing</button>
          <button class="button" onclick="updateVideos2('NoFm1')">Without frame matrix 1</button>
          <button class="button" onclick="updateVideos2('NoFm2')">Without frame matrix 2</button>
          <button class="button" onclick="updateVideos2('NoRI')">Without disocclusion boundary re-injection</button>
      </div>
      <div class="columns is-centered">
        <!-- <div class="column has-text-justified" style="flex: 5;  max-width: 50%">
          <p> </p>
        </div> -->
        <div class="column has-text-centered">
            <p style="background-color: #FFD700;"> Ours </p>
        </div>
        <div class="column has-text-centered">
            <p id="currentMethodDisplay2"> Without depth smoothing </p>
        </div>
      </div>
        
        
       
        <div class="columns is-centered ">

            <div class="column is-12">
                <div class="content has-text-justified">
                    <div class="item">

                        <div class="twentytwenty-container" id="ablationTwentyTwentyContainer" data-before-label="Our Method" data-after-label="Ablation" data-orientation="horizontal" ratio="0.5">

                            <div class="video">
                                <video id="methodVideo2" muted autoplay="autoplay" loop="loop" width="100%">
                                    <source src="./results1/ab/v1_depth_smooth.mp4" type="video/mp4">
                                </video> 
                            </div>

                            <div class="video">
                                <video id="gtVideo2" muted autoplay="autoplay" loop="loop" width="100%">
                                    <source src="./results1/ab/v1_depth_no_smooth.mp4" type="video/mp4">
                                </video> 
                            </div>
                        </div>
                        <p class="video-caption">Video ablations (right eye view)</p>
                    </div>
                </div>
            </div>

        </div>
    </div>
</section>

<section id="comparison" class="section">
    <div class="container ">
        <div class="columns is-centered has-text-centered">
            <h2 class="title is-3">Comparison with previous methods</h2>
        </div>
        

        <center>
          <p>
            Compared to previous methods, our approach provides a high level of temporal and semantic consistency.
          </p>
          </center><br>

        <!-- Slider for Data -->
        
        
        
        <!-- <div class="columns is-centered has-text-centered">
          <p>Slide to switch between different examples</p>
        </div> -->

        <div class="columns is-centered has-text-centered">
          <div class="column is-8">
            <div class="content has-text-justified">
                <div class="item">
                    <div class="video">
                        <video id="inputVideo" muted autoplay="autoplay" loop="loop" width="100%">
                            <source src="./results1/comparisons/v1_ours_corgi_left.mp4" type="video/mp4">
                        </video> 
                    </div>
                    <p class="video-caption">Left eye view</p>
                </div>
            </div>
            
          </div>
        </div>

        <!-- Method Buttons -->
        <div class="buttons has-addons is-centered">
          <button class="button" id="edvrButton" onclick="updateVideos1('base')">DynIBaR</button>
          <button class="button" onclick="updateVideos1('e2fgvi')">E2FGVI</button>
          <button class="button" onclick="updateVideos1('propainter')">ProPainter</button>
          <button class="button" onclick="updateVideos1('robdynerf')">RoDynRF</button>
      </div>
      
      <div class="columns is-centered">
          <div class="column has-text-centered">
            <p id="currentMethodDisplay"> DynIBaR </p>
          </div>
          <div class="column has-text-centered">
              <p  style="background-color: #FFD700; max-width: 50%"> Ours </p>
          </div>
          <!-- <div class="column has-text-centered" style="flex: 3;  max-width: 50%">
            <p> </p>
          </div> -->
      </div>

        <div class="columns is-centered ">

            <div class="column is-8">
                <div class="content has-text-justified">
                    <div class="item">

                        <div class="twentytwenty-container" data-orientation="horizontal" ratio="0.5" data-before-label="Previous method">

                            <div class="video">
                                <video id="methodVideo" muted autoplay="autoplay" loop="loop" width="100%">
                                    <source src="./results1/comparisons/v1_dynibar_corgi.mp4" type="video/mp4">
                                </video> 
                            </div>

                            <div class="video">
                                <video id="oursVideo" muted autoplay="autoplay" loop="loop" width="100%">
                                    <source src="./results1/comparisons/v1_ours_corgi.mp4" type="video/mp4">
                                </video> 
                            </div>
                        </div>
                        <p class="video-caption">Right eye view (generated)</p>
                    </div>
                </div>
            </div>



            <!-- <div class="column is-7">
                <div class="content has-text-justified">
                    <div class="item">
                        <div class="video">
                            <video id="gtVideo" muted autoplay="autoplay" loop="loop" width="100%">
                                <source src="./assets/videos/data1-gt.mp4" type="video/mp4">
                            </video> 
                        </div>
                        <p class="video-caption">GT</p>
                    </div>
                </div>
            </div> -->

        </div>
        <hr>
        <center>
          <p>Even in sequences with fast motion, our results show a high level of temporal consistency.</p>
        </center>
        
            <!-- 2nd -->
            <div class="columns is-centered has-text-centered">
              <div class="column is-8">
                <div class="content has-text-justified">
                    <div class="item">
                        <div class="video">
                            <video id="inputVideo_r2" muted autoplay="autoplay" loop="loop" width="100%">
                                <source src="./results1/comparisons/v1_ours_knight_left.mp4" type="video/mp4">
                            </video> 
                        </div>
                        <p class="video-caption">Left eye view</p>
                    </div>
                </div>
                
              </div>
            </div>
    
            <!-- Method Buttons -->
            <div class="buttons has-addons is-centered">
              <button class="button" id="edvrButton_r2" onclick="updateVideos_r2('base')">E2FGVI</button>
              <button class="button" onclick="updateVideos_r2('propainter')">ProPainter</button>
              <button class="button" onclick="updateVideos_r2('robdynerf')">RoDynRF</button>
              <button class="button" disabled>DynIBaR (fails)</button>
          </div>
          
          <div class="columns is-centered">
              <div class="column has-text-centered">
                <p id="currentMethodDisplay_r2"> E2FGVI </p>
              </div>
              <div class="column has-text-centered">
                  <p  style="background-color: #FFD700; max-width: 50%"> Ours </p>
              </div>
              <!-- <div class="column has-text-centered" style="flex: 3;  max-width: 50%">
                <p> </p>
              </div> -->
          </div>
    
            <div class="columns is-centered ">
    
                <div class="column is-8">
                    <div class="content has-text-justified">
                        <div class="item">
    
                            <div class="twentytwenty-container" data-orientation="horizontal" ratio="0.5" data-before-label="Previous method">
    
                                <div class="video">
                                    <video id="methodVideo_r2" muted autoplay="autoplay" loop="loop" width="100%">
                                        <source src="./results1/comparisons/v1_e2fgvi_knight.mp4" type="video/mp4">
                                    </video> 
                                </div>
    
                                <div class="video">
                                    <video id="oursVideo_r2" muted autoplay="autoplay" loop="loop" width="100%">
                                        <source src="./results1/comparisons/v1_ours_knight.mp4" type="video/mp4">
                                    </video> 
                                </div>
                            </div>
                            <p class="video-caption">Right eye view (generated)</p>
                        </div>
                    </div>
                </div>
        
            </div>
    </div>
</section>

<section id="traj_results" class="hero is-light is-small">
  <div class="hero-body">
    <div class="container">


    <div class="columns is-centered has-text-centered">
        <h2 class="title is-3">Generated Video Trajectories</h2>
    </div>    
    

    <div class="columns is-centered">
    <div class="column is-6">
      <div class="video">
        <video id="trajReferenceVideo1" muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/An_astronaut_in_full_space_suit_riding_a_horse_left.mp4" type="video/mp4">
        </video> 
        <center><p>Reference left view</p></center>
    </div>
    </div>
    </div>
  
       
    <div id="results-carousel" class="carousel results-carousel" data-slides-to-show="2">

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_ft_cv_case1_time1.mp4" type="video/mp4">
            </video> 
        </div>
      </div>

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_ft_cv_case1_time2.mp4" type="video/mp4">
            </video> 
        </div>
      </div>

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_fv_ct_case1_view1.mp4" type="video/mp4">
            </video> 
        </div>
      </div>

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_fv_ct_case1_view2.mp4" type="video/mp4">
            </video> 
        </div>
      </div>
      </div>
      <center>
        <p>Using our frame matrix method, we are able to generate video trajectories through
          <span style="background-color: #F1CCB1;">space</span> (first two sequences) and
          <span style="background-color: hwb(12 16% 15%);">time</span> (last two sequences), consistent with the <a href="#method-overview">Method Overview</a>.</p>
      </center>

      <!-- 2-->
      <div class="columns is-centered">
        <div class="column is-6">
          <div class="video">
            <video id="trajReferenceVideo2" muted autoplay="autoplay" loop="loop" width="100%">
                <source src="./results1/multi_trajectory/A_futuristic_plane_flying_over_the_desert_left_view.mp4" type="video/mp4">
            </video> 
            <center><p>Reference left view</p></center>
        </div>
        </div>
        </div>
      <div id="results-carousel-2" class="carousel results-carousel" data-slides-to-show="2">
      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_ft_cv_case2_time1.mp4" type="video/mp4">
            </video> 
        </div>
      </div>

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_ft_cv_case2_time2.mp4" type="video/mp4">
            </video> 
        </div>
      </div>

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_fv_ct_case2_view1.mp4" type="video/mp4">
            </video> 
        </div>
      </div>

      <div class="item">
        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/multi_trajectory/v1_fv_ct_case2_view2.mp4" type="video/mp4">
            </video> 
        </div>
      </div>


      </div>

      <center>
        <p>The frame matrix formulation is flexible and also allows us to extract different trajectories. The sequences still appear consistent, thanks to the row-and-column constraints.</p>
      </center>
    </div>
  </div>
</section>


<section class="section" id="3d-videos">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-24">
        <h2 class="title is-3 has-text-centered">3D stereoscopic videos</h2>


        <div class="column has-text-centered">
          <p> If you happen to have a VR headset, we encourage you to view these videos in stereo! </p>
        </div>

        <div class="video">
            <video muted autoplay="autoplay" loop="loop" width="100%">
                <source src="./results1/3D_stereoscopic_videos/v1_astronaut_horse.mp4" type="video/mp4">
            </video> 
        </div>

        <div class="video">
          <video muted autoplay="autoplay" loop="loop" width="100%">
              <source src="./results1/3D_stereoscopic_videos/v1_corgi_vlog.mp4" type="video/mp4">
          </video> 
      </div>

      <div class="video">
        <video muted autoplay="autoplay" loop="loop" width="100%">
            <source src="./results1/3D_stereoscopic_videos/v1_dragon.mp4" type="video/mp4">
        </video> 
    </div>

    <div class="video">
      <video muted autoplay="autoplay" loop="loop" width="100%">
          <source src="./results1/3D_stereoscopic_videos/v1_panda.mp4" type="video/mp4">
      </video> 
  </div>

  <div class="video">
    <video muted autoplay="autoplay" loop="loop" width="100%">
        <source src="./results1/3D_stereoscopic_videos/v1_teddy.mp4" type="video/mp4">
    </video> 
  </div>

    <div class="column has-text-centered">
      <p> Videos generated from a real monocular video </p>
    </div>

    <div class="video">
      <video muted autoplay="autoplay" loop="loop" width="100%">
          <source src="./results1/3D_stereoscopic_videos/car_drift.mp4" type="video/mp4">
      </video> 
    </div>

    <div class="video">
      <video muted autoplay="autoplay" loop="loop" width="100%">
          <source src="./results1/3D_stereoscopic_videos/obama.mp4" type="video/mp4">
      </video> 
    </div>

      </div>
    </div>
  </div>
</section>


<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">

    <!-- <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <div class="columns is-centered"><p>
            We thank <a
            href="https://github.com/nerfies/nerfies.github.io">Nerfies</a> and <a href="https://shangchenzhou.com/projects/upscale-a-video/">Upscale-A-Video</a> for its template.
          </p></div>
        </div>
      </div>
    </div> -->


    </div>
  </div>
</footer>

  <!-- custom js file  -->
  <!-- <script defer src="./assets/js/fontawesome.all.min.js"></script> -->
  <script src="./assets/js/bulma-carousel.min.js"></script>
  <script src="./assets/js/bulma-slider.min.js"></script>
  <script src="./assets/js/index.js"></script>
  <script src="./assets/js/jquery.event.move.js"></script>
  <script src="./assets/js/jquery.twentytwenty.js"></script>
  <script src="./assets/js/select.js"></script>
  <script>
  // Once the DOM is ready, turn every before/after comparison container on the
  // page into a TwentyTwenty slider using the plugin's default options.
  $(document).ready(function () {
    var comparisonContainers = $(".twentytwenty-container");
    comparisonContainers.twentytwenty();
  });
  </script>
  <script>
    // Keep the ablation TwentyTwenty container (and the videos inside it) the
    // same rendered size as a reference video, both when that video's metadata
    // arrives and whenever the window is resized.
    $(document).ready(function() {
        // NOTE(review): no element with id "inputVideo2" is present in this
        // page's markup, so the guard below currently turns this whole block
        // into a no-op. Confirm which video is meant to be the size reference.
        var referenceVideo = $('#inputVideo2');
        var specificTwentyTwentyContainer = $('#ablationTwentyTwentyContainer'); // targeting the specific container

        // Without a reference video or a container there is nothing to size,
        // so do not register any handlers.
        if (!referenceVideo.length || !specificTwentyTwentyContainer.length) {
            return;
        }

        // Copy the reference video's rendered width/height onto the container
        // and onto every <video> nested inside it.
        function adjustTwentyTwentySize() {
            var videoWidth = referenceVideo.width();
            var videoHeight = referenceVideo.height();

            specificTwentyTwentyContainer.width(videoWidth);
            specificTwentyTwentyContainer.height(videoHeight);

            // jQuery setters apply to every matched element.
            specificTwentyTwentyContainer.find('video').width(videoWidth).height(videoHeight);
        }

        referenceVideo.on('loadedmetadata', adjustTwentyTwentySize);

        // The metadata may already be loaded by the time this handler is
        // attached (e.g. cached video); in that case the event will not fire
        // again, so size the container right away. readyState >= 1 means
        // HAVE_METADATA or later.
        if (referenceVideo[0].readyState >= 1) {
            adjustTwentyTwentySize();
        }

        $(window).resize(adjustTwentyTwentySize);
    });
  </script>
  <script>
    /**
     * Sync the slider read-out and the #edvrButton state with #dataSlider.
     *
     * Reads the current value of the element with id "dataSlider", writes it
     * into the element with id "sliderValue", and disables the element with id
     * "edvrButton" when the value is greater than 2.
     *
     * Every element is optional: if the slider is not in the document the
     * function returns without doing anything, and a missing read-out or
     * button is skipped. Previously a missing slider caused a TypeError when
     * this function was called on page load.
     */
    function checkSliderValue() {
        var slider = document.getElementById('dataSlider');
        if (!slider) {
            // No slider on this page: nothing to read, nothing to update.
            return;
        }
        var sliderValue = slider.value;

        // Update the display of slider value
        var valueDisplay = document.getElementById('sliderValue');
        if (valueDisplay) {
            valueDisplay.textContent = sliderValue;
        }

        // Enable or disable the EDVR button based on slider value
        var edvrButton = document.getElementById('edvrButton');
        if (edvrButton) {
            edvrButton.disabled = sliderValue > 2;
        }
    }

    // Call the function initially to set the correct state of the button
    checkSliderValue();

  </script>
</body>
</html>
