<!DOCTYPE html>
<!-- lang declares the language of the page copy (English) for screen readers and search engines. -->
<html lang="en">
<head>
  <!-- Declare the encoding first so the parser never has to guess it. -->
  <meta charset="utf-8">
  <!-- Bulma's responsive breakpoints need the layout viewport to match the device width. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>StreamDiT Supplementary Materials</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="./static/js/google.js"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Math typesetting bundle (file name matches MathJax's tex-mml-chtml build; confirm). Loaded async. -->
  <script async src="./static/js/tex-mml-chtml.js"></script>

  <!-- Stylesheets: fonts, Bulma and its carousel/slider extensions, icon sets, then page overrides. -->
  <link rel="stylesheet" href="./static/css/fonts.css">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="./static/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">

  <!-- Script order is kept as authored: jQuery and the Bulma extensions load before index.js. -->
  <script src="./static/js/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>


<!-- Page header: hero banner holding the document's only h1. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">StreamDiT: Supplementary Materials</h1>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Real-Time Streaming Video Generation Videos Section -->
<!-- Layout: a two-column grid of four clips, then a single longer clip in its own row. -->
<!-- playsinline keeps each muted autoplaying clip inline on iPhone Safari instead of switching to fullscreen. -->
<section class="section">
  <div class="container is-max-desktop">
    <h2 class="title is-3 has-text-centered">Real-Time Streaming Video Generation</h2>
    <p class="subtitle is-6 has-text-centered">
      StreamDiT enables real-time text-to-video generation at 16 FPS on a single GPU (H100)
    </p>
    <p class="subtitle is-6 has-text-centered" style="margin-bottom: 2rem;">
      (1 minute long videos)
    </p>

    <div class="columns is-multiline is-centered">
      <!-- Demo Video 1 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/corgi.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 2 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/young_man_reading_book_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 3 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/rally_car_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 4 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/coral_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>

  <p class="subtitle is-6 has-text-centered" style="margin-top: 2rem; margin-bottom: 2rem;">
    (5 minute long video)
  </p>
  <!-- Long Cat Video Demo -->
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/cat_long_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- StreamDiT-30B Videos Section -->
<!-- Layout: a two-column grid of six clips. -->
<!-- playsinline keeps each muted autoplaying clip inline on iPhone Safari instead of switching to fullscreen. -->
<section class="section">
  <div class="container is-max-desktop">
    <h2 class="title is-3 has-text-centered">StreamDiT-30B</h2>
    <p class="subtitle is-6 has-text-centered">
      We applied our method to the 30B model to test its scalability (Note: StreamDiT-30B is not real-time on a single H100)
    </p>
    <div class="columns is-multiline is-centered">
      <!-- Demo Video 1 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/30B_couple_beach_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 2 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/30B_church_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 3 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/30B_california_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 4 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/30B_gallary_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 5 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/30B_porcupine_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>

      <!-- Demo Video 6 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/30B_guitar_compressed.mp4" type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>




<!-- Interactive Videos Section -->
<!-- Layout: pipeline figure beside its description, then a two-column grid of six captioned clips. -->
<!-- Each caption lists the sequence of prompts for its clip, separated by "->". -->
<!-- playsinline keeps each muted autoplaying clip inline on iPhone Safari instead of switching to fullscreen. -->
<section class="section">
  <div class="container is-max-desktop">
    <h2 class="title is-3 has-text-centered">Interactive Video Generation</h2>

    <!-- Inference Pipeline -->
    <div class="framework-section">
      <h3 class="title is-4">Inference Pipeline</h3>
      <div class="columns is-multiline">
        <div class="column is-full">
          <div class="columns is-vcentered">
            <div class="column is-half">
              <img src="./static/images/inference_pipeline.png"
                   alt="Interactive inference pipeline of StreamDiT"
                   class="framework-image">
            </div>
            <div class="column is-half">
              <div class="content has-text-justified">
                <p>
                  <b>Interactive inference pipeline of StreamDiT:</b> StreamDiT is specifically designed to achieve real-time responsiveness and interactivity, and its inference pipeline is structured accordingly. To decrease latency, the DiT denoiser, TAE (VAE) decoder, and text encoder run in separate processes. A prompt callback function operates continuously, listening for new user prompts in real time. When a user provides a new prompt, it is converted into a text embedding by the text encoders, and the embedding is sent to the DiT thread to update the existing embedding. Subsequent denoising steps then use this updated embedding through a cross-attention mechanism, dynamically adjusting the direction of text guidance. This design enables users to interactively influence and modify video content in real time through prompt inputs.
                </p>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="columns is-multiline is-centered">
      <!-- Demo Video 1 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/kid_biking_seasons_compressed.mp4" type="video/mp4">
          </video>
        </div>
        <p class="video-caption has-text-centered"><strong>A little boy riding his bike in a garden in spring. -> A little boy riding his bike in a garden in summer. -> A little boy riding his bike in a garden in fall. -> A little boy riding his bike in a garden in winter.</strong></p>
      </div>

      <!-- Demo Video 2 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/cat_to_tiger_compressed.mp4" type="video/mp4">
          </video>
        </div>
        <p class="video-caption has-text-centered"><strong>A cat is walking in a garden. -> A tiger is walking in a garden.</strong></p>
      </div>

      <!-- Demo Video 3 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/day2night_fireworks_compressed.mp4" type="video/mp4">
          </video>
        </div>
        <p class="video-caption has-text-centered"><strong>Serene nature with a calm lake and cloudy sky in daylight. -> Quiet lake at night under a glowing moon and fading twilight. -> Fireworks exploding over a lake.</strong></p>
      </div>

      <!-- Demo Video 4 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/desert_cyber_punk_compressed.mp4" type="video/mp4">
          </video>
        </div>
        <p class="video-caption has-text-centered"><strong>A man is walking in a desert. -> A man is walking in a cyberpunk city.</strong></p>
      </div>

      <!-- Demo Video 5 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/horse_to_cheetah_compressed.mp4" type="video/mp4">
          </video>
        </div>
        <p class="video-caption has-text-centered"><strong>A horse is running on a grassland. -> A cheetah is running on a grassland. -> A horse is running on a grassland.</strong></p>
      </div>

      <!-- Demo Video 6 -->
      <div class="column is-half">
        <div class="video-container">
          <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
            <source src="./static/videos/desert_to_beach_compressed.mp4" type="video/mp4">
          </video>
        </div>
        <p class="video-caption has-text-centered"><strong>A man is walking on a desert. -> A man is walking towards a beach. -> A man is walking on a beach.</strong></p>
      </div>
    </div>
  </div>
</section>

<!-- Results Section -->
<!-- Layout: a performance comparison figure, then two method-comparison groups. -->
<!-- Each group is four captioned clips in a two-column grid, followed by the shared prompt text. -->
<!-- playsinline keeps each muted autoplaying clip inline on iPhone Safari instead of switching to fullscreen. -->
<section class="section">
  <div class="container is-max-desktop">
    <h2 class="title is-3 has-text-centered">Results</h2>

    <!-- Performance Metrics -->
    <div class="block">
      <div class="columns is-centered has-text-centered">
        <div class="column is-full-width">
          <h3 class="title is-4">Performance Comparison</h3>
          <img src="./static/images/best_model_comparison_result.png"
               alt="Performance comparison showing StreamDiT's efficiency and quality."
               class="framework-image">
          <div class="content has-text-justified">
            <p>
              <b>StreamDiT-4B</b> achieves real-time performance at 16 FPS on a single GPU while maintaining competitive quality with existing methods. Our model generates 512p video streams with temporal consistency and high visual fidelity.
            </p>
          </div>
        </div>
      </div>
    </div>

    <!-- Comparisons with Existing Works -->
    <div class="block">
      <div class="container is-max-desktop">
        <h3 class="title is-4 has-text-centered" style="margin-top: 1rem;">Comparisons with Existing Works</h3>
        <p class="subtitle is-6 has-text-centered">
          We implemented the existing methods in our base 4B T2V model to perform apples-to-apples comparisons with StreamDiT
        </p>

        <div class="columns is-multiline is-centered">
          <!-- Reuse and Diffuse Green Dress Man -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/reuse_and_diffuse_green_dress_man_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>Reuse and Diffuse</strong></p>
          </div>

          <!-- FIFO Green Dress Man -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/FIFO_green_dress_man_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>FIFO-Diffusion</strong></p>
          </div>

          <!-- StreamDiT Teacher Model Green Dress Man -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/StreamDiT_teacher_model_green_dress_man_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>Ours (Teacher)</strong></p>
          </div>

          <!-- StreamDiT Distilled Model Green Dress Man -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/StreamDiT_distilled_model_green_dress_man_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>Ours (Distilled)</strong></p>
          </div>

          <!-- Prompt text sits in a full-width column so it gets the same gutter padding as the clips above. -->
          <div class="column is-full">
            <p style="margin-bottom: 1rem;">
              <b>Prompt: </b> An old man takes a pleasant stroll in Antarctica during a beautiful sunset. The old man wears a bright green dress that reaches down to his ankles, and a wide-brimmed sun hat that shields his face from the sun. The man's skin is weathered and wrinkled, with a kind face and a gentle smile. He walks slowly and deliberately, taking in the breathtaking scenery around him. The Antarctic landscape stretches out behind him, with snow-covered peaks and ice shelves glistening in the fading light. The sky above is a kaleidoscope of colors, with hues of pink, orange, and purple blending together in a beautiful sunset. The man's shadow stretches out across the snow as he walks, with the sun casting a warm glow over the entire scene. The lighting is soft and golden, with the sunset casting long shadows across the icy landscape. The video is shot in a cinematic style.
            </p>
          </div>

          <!-- Reuse and Diffuse NYC Submerged -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/reuse_and_diffuse_NYC_submerged_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>Reuse and Diffuse</strong></p>
          </div>

          <!-- FIFO NYC Submerged -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/FIFO_NYC_submerged_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>FIFO-Diffusion</strong></p>
          </div>

          <!-- StreamDiT Teacher Model NYC Submerged -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/StreamDiT_teacher_model_NYC_submerged_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>Ours (Teacher)</strong></p>
          </div>

          <!-- StreamDiT Distilled Model NYC Submerged -->
          <div class="column is-half">
            <div class="video-container">
              <video controls autoplay muted loop playsinline style="width: 100%; height: 100%;">
                <source src="./static/videos/StreamDiT_distilled_model_NYC_submerged_compressed.mp4" type="video/mp4">
              </video>
            </div>
            <p class="video-caption has-text-centered"><strong>Ours (Distilled)</strong></p>
          </div>

          <!-- Prompt text sits in a full-width column so it gets the same gutter padding as the clips above. -->
          <div class="column is-full">
            <p style="margin-bottom: 1rem;">
              <b>Prompt: </b> Camera tracking shot. New York City is submerged underwater like Atlantis. The city's skyscrapers and buildings are covered in coral and seaweed, with schools of fish darting in and out of the windows. A large whale swims down the middle of the street, its massive body gliding effortlessly through the water. Sea turtles and sharks of various species swim through the streets, some swimming alongside the whale. The Empire State Building and the Statue of Liberty are visible in the distance, covered in coral and anemones. The streetlights are still on, casting a warm glow over the scene. The water is a deep blue, with a few rays of sunlight filtering down from above. The fish and other sea creatures are swimming and playing in the streets, as if they have always lived there. The video is shot in a cinematic style.
            </p>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>


</body>
</html>
