<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <meta
      name="description"
      content="STR-Match: Matching SpatioTemporal Relevance Score for Training-Free Video Editing"
    />
    <meta name="keywords" content="STR_Match" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>
      STR-Match: Matching SpatioTemporal Relevance Score for Training-Free Video
      Editing
    </title>

    <link
      href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
      rel="stylesheet"
    />

    <link rel="stylesheet" href="./static/css/bulma.min.css" />
    <link rel="stylesheet" href="./static/css/bulma-carousel.min.css" />
    <link rel="stylesheet" href="./static/css/bulma-slider.min.css" />
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css" />
    <link
      rel="stylesheet"
      href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"
    />
    <link rel="stylesheet" href="./static/css/index.css" />

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script defer src="./static/js/fontawesome.all.min.js"></script>
    <script src="./static/js/bulma-carousel.min.js"></script>
    <script src="./static/js/bulma-slider.min.js"></script>
    <script src="./static/js/index.js"></script>
  </head>
  <body>
    <nav class="navbar" role="navigation" aria-label="main navigation">
      <div class="navbar-brand">
        <a
          role="button"
          class="navbar-burger"
          aria-label="menu"
          aria-expanded="false"
        >
          <span aria-hidden="true"></span>
          <span aria-hidden="true"></span>
          <span aria-hidden="true"></span>
        </a>
      </div>
    </nav>

    <section class="hero">
      <div class="hero-body">
        <div class="container is-max-desktop">
          <div class="columns is-centered">
            <div class="column has-text-centered">
              <h1 class="title is-2 publication-title">
                STR-Match: Matching SpatioTemporal Relevance Score for
                Training-Free Video Editing
              </h1>
            </div>
          </div>
        </div>
      </div>
    </section>
    <section class="comparison">
      <div
        id="loading"
        style="text-align: center; font-size: 20px; margin: 50px"
      >
        Loading videos...
      </div>
      <div id="content" style="display: none">
        <div class="container is-max-desktop">
          <div class="columns is-centered has-text-centered">
            <div class="column is-four-fifths">
              <div
                class="notification is-info is-light"
                style="margin-bottom: 30px; text-align: left"
              >
                <p>
                  <strong>Note:</strong> Due to the size limitations of
                  supplementary materials, we resized the result videos
                  generated by LaVie and Zeroscope to 256×256. The original
                  resolutions are 320×512 for LaVie and 320×480 for Zeroscope,
                  which can be verified in ./results/original.
                </p>
              </div>

              <h2
                class="title is-2 has-text-centered has-text-weight-bold has-text-primary"
                style="
                  margin-top: 20px;
                  margin-bottom: 40px;
                  letter-spacing: -1px;
                "
              >
                ✨ Qualitative Comparison (with LaVie) ✨
              </h2>

              <div
                class="navigation"
                style="
                  display: flex;
                  justify-content: center;
                  align-items: center;
                  gap: 15px;
                  margin-bottom: 20px;
                "
              >
                <button id="prevBtn" class="nav-btn">◀</button>
                <h2 id="folderTitle" class="title is-4" style="margin: 0"></h2>
                <button id="nextBtn" class="nav-btn">▶</button>
              </div>

              <div
                id="videoGrid"
                style="
                  display: grid;
                  grid-template-columns: repeat(5, 1fr);
                  grid-template-rows: repeat(2, auto);
                  gap: 15px;
                  justify-items: center;
                  align-items: center;
                "
              ></div>
            </div>
          </div>
        </div>
      </div>

      <style>
        /* Circular arrow button used by the previous/next controls.
           Equal width/height plus a 50% radius yields a circle; the flex
           rules centre the arrow glyph inside it. */
        .nav-btn {
          background-color: white;
          border: 2px solid black;
          border-radius: 50%;
          width: 40px;
          height: 40px;
          font-size: 20px;
          font-weight: bold;
          color: black;
          cursor: pointer;
          display: flex;
          align-items: center;
          justify-content: center;
        }

        /* Default weight for the caption <div> inside each comparison cell. */
        .video-container div {
          font-weight: normal;
        }

        /* Bold captions for the source clip and the two "ours" variants.
           The data-label attribute is set from script when the cell is built. */
        .video-container[data-label="source"] div,
        .video-container[data-label="ours_nomask"] div,
        .video-container[data-label="ours_mask"] div {
          font-weight: bold;
        }
      </style>

      <script>
        // Sub-directories of ./results/comparison/, one per editing example.
        // The array order is the order the previous/next buttons cycle through.
        const folders = [
          "cat_robotdog",
          "cat_dragon",
          "cat_basketball",
          "cat_giraffe",
          "fish_snake",
          "fish_potato",
          "fish_donut",
          "fish_clownfish",
          "baby_sleeping",
          "bird_cat",
          "cat_lion",
          "dog_cat",
          "fish_shark",
          "lotus_daisy",
          "zebra_horse",
        ];

        // Base names (without ".mp4") of the result clips inside each folder.
        // Order matters: prepareVideoElements places entry i at grid row
        // floor(i / 4) + 1, column (i % 4) + 2, next to the source clip.
        const videoNames = [
          "ours_nomask",
          "ours_mask",
          "dmt",
          "uniedit",
          "videograin",
          "flatten",
          "fatezero",
          "gav",
        ];

        // Folder key -> heading text shown between the arrow buttons.
        // Keys missing here fall back to the raw folder name.
        const folderDisplayMap = {
          cat_robotdog: "Cat → Dog Robot",
          cat_dragon: "Cat → Dragon",
          cat_basketball: "Cat → Basketball",
          cat_giraffe: "Cat → Giraffe",
          fish_snake: "Goldfish → Snake",
          fish_potato: "Goldfish → Sweet Potato",
          fish_donut: "Goldfish → Donuts",
          fish_clownfish: "Goldfish → Clownfish",
          baby_sleeping: "Baby → Sleeping Baby",
          bird_cat: "Bird → Cat",
          cat_lion: "Cat → Lion",
          dog_cat: "Dog → Cat",
          fish_shark: "Goldfish → Shark",
          lotus_daisy: "Lotus → Daisy",
          zebra_horse: "Zebra → Horse",
        };

        // Clip base name -> caption shown under the clip.
        // Names missing here fall back to the raw base name.
        const videoDisplayMap = {
          ours_nomask: "Ours (w/o Mask)",
          ours_mask: "Ours (w/ Mask)",
          dmt: "DMT",
          uniedit: "UniEdit",
          videograin: "VideoGrain",
          flatten: "FLATTEN",
          fatezero: "FateZero",
          gav: "Ground-A-Video",
        };

        // Index into `folders` of the example currently displayed.
        let currentIndex = 0;
        // { video, label } pairs for every grid cell, filled by
        // prepareVideoElements; element 0 is the source clip.
        let comparisonVideos = [];

        /**
         * Returns the heading text for a comparison folder.
         * When folderDisplayMap has no truthy entry for the key, the raw
         * folder key is returned unchanged.
         */
        function getFolderDisplayName(folder) {
          const pretty = folderDisplayMap[folder];
          return pretty ? pretty : folder;
        }

        /**
         * Empties `grid` and rebuilds the comparison cells: the source clip at
         * row 1 / column 1, followed by one cell per entry of `videoNames`
         * laid out four per row starting at column 2.
         *
         * Side effect: resets the module-level `comparisonVideos` array and
         * refills it with a { video, label } pair per cell, source first.
         * No `src` is assigned here; clips are loaded later.
         */
        function prepareVideoElements(grid) {
          grid.innerHTML = "";
          comparisonVideos = [];

          // Builds one cell (video + caption), attaches it to the grid and
          // records it in comparisonVideos.
          function appendCell(row, col, caption, key = null) {
            const cell = document.createElement("div");
            cell.className = "video-container";
            Object.assign(cell.style, {
              textAlign: "center",
              gridRow: String(row),
              gridColumn: String(col),
            });

            const player = document.createElement("video");
            if (key) {
              // data-label drives the bold-caption CSS rule; data-id tags the clip.
              cell.dataset.label = key;
              player.dataset.id = key;
            }
            Object.assign(player, {
              muted: true,
              playsInline: true,
              loop: true,
              autoplay: false,
              preload: "auto",
            });
            Object.assign(player.style, {
              width: "150px",
              height: "150px",
              objectFit: "fill",
            });
            player.controls = false;

            const captionEl = document.createElement("div");
            captionEl.textContent = caption;
            captionEl.style.marginTop = "2px";

            cell.append(player, captionEl);
            grid.appendChild(cell);

            comparisonVideos.push({ video: player, label: captionEl });
          }

          appendCell(1, 1, "Source Video", "source");

          // Captions start empty; they are filled in once each clip has loaded.
          videoNames.forEach((name, position) => {
            const row = Math.floor(position / 4) + 1;
            const col = (position % 4) + 2;
            appendCell(row, col, "", name);
          });
        }

        /**
         * Points `video` at `src` and resolves once the clip is ready.
         *
         * The promise never rejects. It resolves when the element fires
         * `canplaythrough`, when it fires `error` (a warning is logged), or
         * when `timeoutMs` elapses. The timeout matters because some browsers
         * defer media loading until playback starts, in which case
         * `canplaythrough` may never arrive and callers awaiting this promise
         * would otherwise hang.
         *
         * @param {HTMLVideoElement} video  Element to load into.
         * @param {string} src              URL of the clip.
         * @param {number} [timeoutMs=8000] Upper bound on the wait, in ms.
         * @returns {Promise<void>}
         */
        function loadVideo(video, src, timeoutMs = 8000) {
          return new Promise((resolve) => {
            let timer = null;

            // Single exit path: drop both listeners and the timer so nothing
            // fires again on later loops, seeks or reloads of this element.
            const finish = () => {
              clearTimeout(timer);
              video.removeEventListener("canplaythrough", finish);
              video.removeEventListener("error", onError);
              resolve();
            };
            const onError = () => {
              console.warn("Could not load", src);
              finish();
            };

            // Listeners are attached before the load starts.
            video.addEventListener("canplaythrough", finish);
            video.addEventListener("error", onError);
            timer = setTimeout(finish, timeoutMs);

            video.src = src;
            video.load();
          });
        }

        /**
         * Shows the example selected by `currentIndex`.
         *
         * Updates the heading immediately, then starts loading every clip of
         * the folder into the existing grid cells. Each caption is written
         * once its own load promise settles. After all loads have settled,
         * every clip is rewound to 0 and played so they start together.
         *
         * @returns {Promise<void>} settles after play() has been requested
         *   on every clip.
         */
        function renderCurrentFolder() {
          const activeFolder = folders[currentIndex];
          const heading = document.getElementById("folderTitle");
          heading.textContent = getFolderDisplayName(activeFolder);

          // First cell is the source clip; the rest line up with videoNames.
          const [first, ...others] = comparisonVideos;
          const jobs = [
            { cell: first, file: "source", caption: "Source Video" },
            ...others.map((cell, position) => {
              const key = videoNames[position];
              return { cell, file: key, caption: videoDisplayMap[key] || key };
            }),
          ];

          const pending = jobs.map(({ cell, file, caption }) =>
            loadVideo(
              cell.video,
              `./results/comparison/${activeFolder}/${file}.mp4`
            ).then(() => {
              cell.label.textContent = caption;
            })
          );

          return Promise.all(pending).then(() => {
            for (const { video } of comparisonVideos) {
              video.currentTime = 0;
              video.play().catch((e) => console.warn("Playback error:", e));
            }
          });
        }

        // Arrow buttons: move `currentIndex` one step backwards / forwards,
        // wrapping around at both ends of `folders`, then re-render.
        [
          ["prevBtn", -1],
          ["nextBtn", 1],
        ].forEach(([buttonId, delta]) => {
          document.getElementById(buttonId).addEventListener("click", () => {
            const total = folders.length;
            currentIndex = (currentIndex + delta + total) % total;
            renderCurrentFolder();
          });
        });

        // Page start-up for the comparison section: build the grid cells,
        // warm up every folder's source clip in a detached <video>, then swap
        // the "Loading videos..." placeholder for the real content.
        window.addEventListener("DOMContentLoaded", () => {
          // Upper bound on the warm-up wait per clip. Without it, a browser
          // that does not preload media never fires `canplaythrough` and the
          // page would stay on the loading placeholder indefinitely.
          const PRELOAD_TIMEOUT_MS = 5000;

          const grid = document.getElementById("videoGrid");
          prepareVideoElements(grid);

          const warmUps = folders.map(
            (folder) =>
              new Promise((resolve) => {
                const dummyVideo = document.createElement("video");
                dummyVideo.preload = "auto";

                const timer = setTimeout(resolve, PRELOAD_TIMEOUT_MS);
                const done = () => {
                  clearTimeout(timer);
                  resolve();
                };
                // Ready and failed clips both count as "done": a missing file
                // must not block the page. Handlers are set before `src`.
                dummyVideo.oncanplaythrough = done;
                dummyVideo.onerror = done;

                dummyVideo.src = `./results/comparison/${folder}/source.mp4`;
              })
          );

          Promise.all(warmUps).then(() => {
            document.getElementById("loading").style.display = "none";
            document.getElementById("content").style.display = "block";
            renderCurrentFolder();
          });
        });
      </script>
    </section>

    <div style="height: 70px"></div>

    <section class="hero teaser">
      <div class="container is-max-desktop">
        <div class="hero-body">
          <h2 class="title is-2 has-text-centered" style="margin-bottom: 30px">
            STR-Match Video Editing Results (with LaVie)
          </h2>

          <!-- Comparison Grid with navigation arrows on both sides -->
          <div
            style="
              display: flex;
              justify-content: center;
              align-items: center;
              gap: 20px;
            "
          >
            <button id="prevBtns" class="nav-btn">◀</button>

            <div
              id="comparisonGrid"
              class="video-grid"
              style="
                display: grid;
                grid-template-columns: repeat(4, 1fr);
                grid-template-rows: repeat(2, auto);
                gap: 20px;
                justify-items: center;
                align-items: center;
              "
            ></div>

            <button id="nextBtns" class="nav-btn">▶</button>
          </div>

          <script>
            // Teaser sets. Each set has one source clip plus six edits; clip
            // URLs are built as ./results/thumbnail/<basePath>/<file>.
            // renderCurrentSet lays the edits out three per row, so six
            // entries fill two rows beside the source clip.
            // NOTE(review): `title` is not read by the rendering code visible
            // here — confirm whether it is used elsewhere.
            const videoSets = [
              {
                title: "Set A",
                basePath: "cat",
                source: "cat_source.mp4",
                edits: [
                  { filename: "giraffe.mp4", label: "Cat → Giraffe" },
                  { filename: "tiger.mp4", label: "Cat → Tiger" },
                  { filename: "lion.mp4", label: "Cat → Lion" },
                  { filename: "basketball.mp4", label: "Cat → Basketball" },
                  { filename: "dragon.mp4", label: "Cat → Dragon" },
                  { filename: "robot_dog.mp4", label: "Cat → Dog Robot" },
                ],
              },
              {
                title: "Set B",
                basePath: "fish",
                source: "fish_source.mp4",
                edits: [
                  { filename: "shark.mp4", label: "Goldfish → Shark" },
                  { filename: "donut.mp4", label: "Goldfish → Donut" },
                  { filename: "potato.mp4", label: "Goldfish → Sweet Potato" },
                  { filename: "snake.mp4", label: "Goldfish → Snake" },
                  { filename: "clownfish.mp4", label: "Goldfish → Clownfish" },
                  { filename: "rocks.mp4", label: "Goldfish → Rocks" },
                ],
              },
            ];

            // Index into `videoSets` of the set currently shown.
            let currentSetIndex = 0;

            /**
             * Builds one teaser cell: a muted, looping, autoplaying 210x210
             * video with a caption underneath.
             *
             * Once metadata is available and the duration is positive, the
             * playback rate is set to duration / 2, i.e. one loop takes about
             * two seconds whatever the clip length.
             *
             * @param {{src: string, label: string, gridRow: string, gridColumn: string}} spec
             *   Clip URL, caption text and CSS grid placement of the cell.
             * @returns {HTMLDivElement} the detached wrapper element.
             */
            function createGridItem({ src, label, gridRow, gridColumn }) {
              const cell = document.createElement("div");
              Object.assign(cell.style, {
                gridRow,
                gridColumn,
                textAlign: "center",
              });

              const clip = document.createElement("video");
              Object.assign(clip, {
                src,
                muted: true,
                playsInline: true,
                loop: true,
                autoplay: true,
                preload: "auto",
              });
              Object.assign(clip.style, {
                width: "210px",
                height: "210px",
                objectFit: "cover",
                aspectRatio: "1 / 1",
              });

              clip.addEventListener("loadedmetadata", () => {
                // Skips NaN, zero and negative durations.
                if (!(clip.duration > 0)) return;
                clip.playbackRate = clip.duration / 2;
              });

              const captionEl = document.createElement("div");
              captionEl.textContent = label;
              captionEl.style.marginTop = "6px";

              cell.append(clip, captionEl);
              return cell;
            }

            /**
             * Redraws #comparisonGrid for the set at `currentSetIndex`.
             * The source clip occupies column 1 and spans both rows; the
             * edits follow in columns 2-4, three per row.
             */
            function renderCurrentSet() {
              const target = document.getElementById("comparisonGrid");
              target.innerHTML = "";

              const { basePath, source, edits } = videoSets[currentSetIndex];
              const urlFor = (file) => `./results/thumbnail/${basePath}/${file}`;

              target.appendChild(
                createGridItem({
                  src: urlFor(source),
                  label: "Source Video",
                  gridRow: "1 / span 2",
                  gridColumn: "1",
                })
              );

              for (const [position, { filename, label }] of edits.entries()) {
                target.appendChild(
                  createGridItem({
                    src: urlFor(filename),
                    label,
                    gridRow: String(Math.floor(position / 3) + 1),
                    gridColumn: String((position % 3) + 2),
                  })
                );
              }
            }

            // Once the DOM is parsed: hook up both teaser arrows (stepping
            // through `videoSets` with wrap-around) and draw the first set.
            document.addEventListener("DOMContentLoaded", () => {
              const bindStep = (buttonId, delta) => {
                const button = document.getElementById(buttonId);
                button.addEventListener("click", () => {
                  const total = videoSets.length;
                  currentSetIndex = (currentSetIndex + delta + total) % total;
                  renderCurrentSet();
                });
              };

              bindStep("prevBtns", -1);
              bindStep("nextBtns", 1);

              renderCurrentSet();
            });
          </script>

          <!-- Background Change Section -->
          <div
            id="customVideoRow"
            style="
              margin-top: 40px;
              display: flex;
              justify-content: center;
              align-items: flex-start;
              gap: 20px;
            "
          ></div>

          <script>
            // After the page has fully loaded, fill #customVideoRow with three
            // titled groups, each holding a source clip next to one edit.
            window.addEventListener("load", function () {
              const showcases = [
                {
                  group: "Large Motion",
                  videos: [
                    {
                      src: "./results/thumbnail/thirsty_dog/source.mp4",
                      caption: "Source Video",
                    },
                    {
                      src: "./results/thumbnail/thirsty_dog/ours_1_24.mp4",
                      caption: "Dog → Cat",
                    },
                  ],
                },
                {
                  group: "Multi-Object",
                  videos: [
                    {
                      src: "./results/thumbnail/fish_shark/source.mp4",
                      caption: "Source Video",
                    },
                    {
                      src: "./results/thumbnail/fish_shark/target_shark.mp4",
                      caption: "Goldfish → Shark",
                    },
                  ],
                },
                {
                  group: "Background Change",
                  videos: [
                    {
                      src: "./results/thumbnail/sky/kettle_source.mp4",
                      caption: "Source Video",
                    },
                    {
                      src: "./results/thumbnail/sky/kettle_output.mp4",
                      caption: "Orange →\nPerfectly Clear Blue",
                    },
                  ],
                },
              ];

              const host = document.getElementById("customVideoRow");

              // Creates a <div> carrying the given inline styles.
              const styledDiv = (styles) => {
                const el = document.createElement("div");
                Object.assign(el.style, styles);
                return el;
              };

              // One clip plus its caption, stacked vertically.
              const buildClip = ({ src, caption }) => {
                const cell = styledDiv({
                  display: "flex",
                  flexDirection: "column",
                  alignItems: "center",
                });

                const clip = document.createElement("video");
                Object.assign(clip, {
                  src,
                  muted: true,
                  autoplay: true,
                  loop: true,
                  playsInline: true,
                });
                Object.assign(clip.style, {
                  width: "180px",
                  height: "180px",
                  objectFit: "cover",
                  aspectRatio: "1 / 1",
                  display: "block",
                });

                // Rate = duration / 2, so one loop lasts about two seconds.
                clip.addEventListener("loadedmetadata", () => {
                  if (!(clip.duration > 0)) return;
                  clip.playbackRate = clip.duration / 2;
                });

                // pre-wrap keeps the "\n" inside a caption as a line break.
                const captionEl = styledDiv({
                  marginTop: "5px",
                  textAlign: "center",
                  whiteSpace: "pre-wrap",
                });
                captionEl.textContent = caption;

                cell.append(clip, captionEl);
                return cell;
              };

              for (const { group, videos } of showcases) {
                const column = styledDiv({
                  display: "flex",
                  flexDirection: "column",
                  alignItems: "center",
                  flex: "none",
                });

                const heading = styledDiv({
                  fontWeight: "bold",
                  fontSize: "1.3em",
                  marginBottom: "10px",
                });
                heading.textContent = group;
                column.appendChild(heading);

                const clipRow = styledDiv({
                  display: "flex",
                  justifyContent: "center",
                  gap: "10px",
                });
                for (const entry of videos) {
                  clipRow.appendChild(buildClip(entry));
                }

                column.appendChild(clipRow);
                host.appendChild(column);
              }
            });
          </script>
        </div>
      </div>
    </section>

    <section class="abstract">
      <div class="container is-max-desktop">
        <div class="columns is-centered has-text-centered">
          <div class="column is-four-fifths">
            <h2 class="title is-3">Abstract</h2>
            <div class="content has-text-justified">
              <p>
                Existing text-guided video editing methods often suffer from
                temporal inconsistency, motion distortion, and cross-domain
                transformation error. We attribute these limitations to
                insufficient modeling of spatio-temporal pixel relevance during
                the editing process.
              </p>
              <p>
                To address this, we propose STR-Match, a training-free video
                editing technique that produces visually appealing and
                temporally coherent videos through latent optimization guided by
                our novel STR score. The proposed score captures spatio-temporal
                pixel relevance across adjacent frames by leveraging 2D spatial
                attention and 1D temporal attention maps in text-to-video (T2V)
                diffusion models, without the overhead of computationally
                expensive full 3D attention.
              </p>
              <p>
                Integrated into a latent optimization framework with a latent
                mask, STR-Match generates high-fidelity videos with strong
                spatio-temporal consistency, preserving key visual attributes of
                the source video while remaining robust under significant domain
                shifts. Our extensive experiments demonstrate that STR-Match
                consistently outperforms existing methods in both visual quality
                and spatio-temporal consistency.
              </p>
            </div>
          </div>
        </div>
      </div>
    </section>

    <div style="height: 70px"></div>

    <section class="overall-framework">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column">
            <h2 class="title is-3">STR score</h2>
            <div class="columns is-centered">
              <div class="column content">
                <p>
                  The STR score captures spatiotemporal pixel relevance across
                  frames using 2D spatial and 1D temporal attention, enabling
                  flexible shape transformation while preserving key source
                  attributes.
                </p>
                <img src="./figures/approx_joint.png" alt="STR score illustration" />
              </div>
            </div>
          </div>
          <!-- Visual Effects. -->
          <div class="column">
            <div class="content">
              <h2 class="title is-3">Overall Framework</h2>
              <p>
                STR-Match first extracts STR score from a T2V model, then
                optimizes the target latent using these scores and (negative)
                cosine similarity. A binary mask can optionally be used to
                preserve specific regions.
              </p>
              <img src="./figures/stflow_figure.png" alt="STR-Match overall framework illustration" />
            </div>
          </div>
          <!--/ Visual Effects. -->

          <!-- STR Score -->
        </div>
      </div>
    </section>

    <div style="height: 20px"></div>

    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column is-full-width">
            <h2 class="title is-3">STR-Match with CogVideoX</h2>

            <div class="content has-text-justified">
              <p>
                We compare STR-Match with CogVideoX-V2V, demonstrating the
                effectiveness of our method across different video diffusion
                models. The following videos show source videos, our results
                without masks, and CogVideoX-V2V results.
              </p>
            </div>

            <div class="columns is-centered has-text-centered">
              <div class="column is-full-width">
                <div id="cogVideoXSection"></div>
              </div>
            </div>
          </div>
        </div>
      </div>

      <script>
        // Example ids for the CogVideoX comparison, in the order the arrows
        // cycle through them. Despite the name, each id is used as a file base
        // name: clips live at ./results/cogvideox/<variant>/<id>.mp4.
        const cogVideoXFolders = [
          "36",
          "66",
          "1",
          "6",
          "16",
          "19",
          "21",
          "51",
          "79",
        ];

        // Example id -> heading text. The keys are written as numbers, but
        // object keys are strings, so lookups with the string ids above match.
        const cogVideoXTransformations = {
          1: "Bus → Car",
          16: "Penguin → Puffin",
          19: "Kangaroo → Wallaby",
          21: "Elephant → Rhino",
          36: "Swan → Lamp",
          51: "Balloon → Cabinet",
          6: "Bear → Lion",
          66: "Book → Soccer ball",
          79: "Octopus → Guitar",
        };

        // Columns of the comparison, left to right. `name` is the variant
        // directory under ./results/cogvideox/; `label` returns the caption.
        // Each label function receives the example id but currently ignores it.
        const cogVideoOrder = [
          { name: "source", label: (folder) => "Source Video" },
          { name: "target", label: (folder) => "Ours (w/o Mask)" },
          { name: "v2v", label: (folder) => "CogVideoX-V2V" },
        ];

        /**
         * Builds one CogVideoX comparison cell: a muted, looping, autoplaying
         * 180x180 video with a caption below it.
         *
         * The clip URL is ./results/cogvideox/<videoName>/<folder>.mp4. Once
         * metadata is known and the duration is positive, the playback rate
         * is set to duration / 2 so one loop lasts about two seconds.
         *
         * @param {string} folder     Example id (file base name).
         * @param {string} videoName  Variant directory name.
         * @param {string} label      Caption text.
         * @returns {HTMLDivElement} the detached cell element.
         */
        function createCogVideoBlock(folder, videoName, label) {
          const cell = document.createElement("div");
          Object.assign(cell.style, {
            textAlign: "center",
            display: "flex",
            flexDirection: "column",
            alignItems: "center",
          });

          const clip = document.createElement("video");
          Object.assign(clip, {
            src: `./results/cogvideox/${videoName}/${folder}.mp4`,
            loop: true,
            muted: true,
            playsInline: true,
            preload: "auto",
            autoplay: true,
            controls: false,
          });
          Object.assign(clip.style, {
            width: "180px",
            height: "180px",
            objectFit: "fill",
          });

          clip.addEventListener("loadedmetadata", () => {
            // Skips NaN, zero and negative durations.
            if (!(clip.duration > 0)) return;
            clip.playbackRate = clip.duration / 2;
          });

          const captionEl = document.createElement("div");
          captionEl.textContent = label;
          Object.assign(captionEl.style, {
            marginTop: "6px",
            fontSize: "0.95em",
          });

          cell.append(clip, captionEl);
          return cell;
        }

        /**
         * Builds the whole CogVideoX comparison widget inside
         * #cogVideoXSection: a navigation bar (previous arrow, heading, next
         * arrow) above a three-column grid, then shows the first example.
         *
         * The selected example index lives in this function's closure, so it
         * restarts at 0 whenever the widget is rebuilt.
         */
        function renderCogVideoXAll() {
          const host = document.getElementById("cogVideoXSection");
          host.innerHTML = "";

          // --- navigation bar -------------------------------------------
          const makeArrow = (glyph, id) => {
            const btn = document.createElement("button");
            btn.textContent = glyph;
            btn.className = "nav-btn";
            btn.id = id;
            return btn;
          };

          const navBar = document.createElement("div");
          Object.assign(navBar.style, {
            display: "flex",
            justifyContent: "center",
            alignItems: "center",
            gap: "15px",
            marginBottom: "20px",
          });

          const heading = document.createElement("h3");
          heading.className = "title is-4";
          heading.style.margin = "0";
          heading.id = "cogVideoTitle";

          navBar.appendChild(makeArrow("◀", "cogVideoPrevBtn"));
          navBar.appendChild(heading);
          navBar.appendChild(makeArrow("▶", "cogVideoNextBtn"));
          host.appendChild(navBar);

          // --- clip grid ------------------------------------------------
          const clipGrid = document.createElement("div");
          clipGrid.id = "cogVideoGrid";
          Object.assign(clipGrid.style, {
            display: "grid",
            gridTemplateColumns: "repeat(3, 1fr)",
            gap: "10px",
            justifyItems: "center",
            alignItems: "center",
          });
          host.appendChild(clipGrid);

          // --- state and rendering --------------------------------------
          let activeIdx = 0;

          // Replaces the heading text and the three clips with those of the
          // example at activeIdx.
          const showActive = () => {
            const exampleId = cogVideoXFolders[activeIdx];
            const gridEl = document.getElementById("cogVideoGrid");
            const titleEl = document.getElementById("cogVideoTitle");

            titleEl.textContent = cogVideoXTransformations[exampleId];
            gridEl.innerHTML = "";

            for (const { name, label } of cogVideoOrder) {
              gridEl.appendChild(
                createCogVideoBlock(exampleId, name, label(exampleId))
              );
            }
          };

          // Returns a click handler that moves by `delta` with wrap-around.
          const stepBy = (delta) => () => {
            const total = cogVideoXFolders.length;
            activeIdx = (activeIdx + delta + total) % total;
            showActive();
          };

          document
            .getElementById("cogVideoPrevBtn")
            .addEventListener("click", stepBy(-1));
          document
            .getElementById("cogVideoNextBtn")
            .addEventListener("click", stepBy(1));

          showActive();
        }

        // Build the widget as soon as the DOM has been parsed.
        window.addEventListener("DOMContentLoaded", renderCogVideoXAll);
      </script>
    </section>

    <div style="height: 20px"></div>

    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column is-full-width">
            <h2 class="title is-3">Object Deletion/Addition</h2>

            <div class="content has-text-justified">
              <p>
                STR-Match supports flexible object manipulation including
                deletion and addition. The following videos demonstrate our
                method's capability to seamlessly remove or add objects while
                maintaining temporal consistency and visual quality.
              </p>
            </div>

            <div class="columns is-centered has-text-centered">
              <div class="column is-full-width">
                <div id="delAddSection"></div>
              </div>
            </div>
          </div>
        </div>
      </div>

      <script>
        // Clip ids for the object-deletion and object-addition examples.
        // createDelAddPair turns an id into
        // ./results/del_add/<type>/source_<id>.mp4 and .../target_<id>.mp4.
        const delVideos = ["0", "3", "4", "5", "6", "8"];
        const addVideos = ["0", "1", "4", "5"];

        // Deletion id -> caption. The keys match the ids in delVideos; the
        // values contain <strong> markup.
        // NOTE(review): presumably rendered as HTML rather than plain text —
        // confirm at the call site, which is outside this view.
        const delTransformations = {
          0: "Deleting <strong>[jellyfish]</strong>",
          3: "Deleting <strong>[ball]</strong>",
          4: "Deleting <strong>[bear]</strong>",
          5: "Deleting <strong>[duck]</strong>",
          6: "Deleting <strong>[bicycle]</strong>",
          // NOTE(review): "ginipig" looks like a misspelling of "guinea pig" —
          // confirm whether it must match an exact prompt token before fixing.
          8: "Deleting <strong>[ginipig]</strong>",
        };

        // Addition id -> caption. The keys match the ids in addVideos.
        const addTransformations = {
          0: "Adding <strong>[UFO]</strong>",
          1: "Adding <strong>[flowers]</strong>",
          4: "Adding <strong>[statue]</strong>",
          5: "Adding <strong>[building]</strong>",
        };

        /**
         * Build one source/result clip pair with a caption underneath.
         *
         * @param {string} type - Sub-folder of ./results/del_add/ ("del" or
         *   "add" in this file).
         * @param {string} videoId - Clip ID used in source_<id>.mp4 and
         *   target_<id>.mp4.
         * @param {string} transformation - Caption markup. It is inserted as
         *   HTML so that <strong> renders; pass trusted, static strings only.
         * @returns {HTMLDivElement} Detached column container holding the
         *   video row followed by the caption.
         */
        function createDelAddPair(type, videoId, transformation) {
          // Both clips are configured identically; only the file-name prefix
          // ("source" or "target") differs.
          const buildClip = (prefix) => {
            const clip = document.createElement("video");
            clip.src = `./results/del_add/${type}/${prefix}_${videoId}.mp4`;
            clip.loop = true;
            clip.muted = true;
            clip.playsInline = true;
            clip.preload = "auto";
            clip.autoplay = true;
            clip.controls = false;
            clip.style.width = "160px";
            clip.style.height = "160px";
            clip.style.objectFit = "fill";

            clip.addEventListener("loadedmetadata", () => {
              // duration / 2 makes one loop last about two seconds whatever
              // the clip length.
              const rate = clip.duration / 2;

              // NaN, Infinity and non-positive values are not usable rates;
              // assigning a non-finite number to playbackRate throws.
              if (!Number.isFinite(rate) || rate <= 0) {
                return;
              }

              try {
                clip.playbackRate = rate;
              } catch (_unsupported) {
                // The browser rejected the rate as unsupported. Keep the
                // current rate so the clip still plays.
              }
            });

            return clip;
          };

          const videoRow = document.createElement("div");
          videoRow.style.display = "flex";
          videoRow.style.gap = "10px";
          videoRow.style.alignItems = "center";
          videoRow.appendChild(buildClip("source"));
          videoRow.appendChild(buildClip("target"));

          // Caption below the pair. A missing caption renders as empty text
          // rather than the literal string "undefined".
          const caption = document.createElement("div");
          caption.innerHTML = transformation == null ? "" : transformation;
          caption.style.textAlign = "center";
          caption.style.fontSize = "0.9em";
          caption.style.fontWeight = "normal";

          const pairContainer = document.createElement("div");
          pairContainer.style.display = "flex";
          pairContainer.style.flexDirection = "column";
          pairContainer.style.alignItems = "center";
          pairContainer.style.gap = "5px";
          pairContainer.appendChild(videoRow);
          pairContainer.appendChild(caption);
          return pairContainer;
        }

        // Rebuilds #delAddSection from scratch: a heading and a two-column
        // grid of clip pairs for deletions, then the same for additions.
        function renderDelAddAll() {
          const host = document.getElementById("delAddSection");
          host.innerHTML = "";

          // One entry per gallery, in display order. Only the deletion grid
          // gets a bottom margin, separating it from the heading that follows.
          const galleries = [
            {
              type: "del",
              heading: "Object Deletion (with LaVie)",
              ids: delVideos,
              captions: delTransformations,
              bottomMargin: "40px",
            },
            {
              type: "add",
              heading: "Object Addition (with LaVie)",
              ids: addVideos,
              captions: addTransformations,
              bottomMargin: null,
            },
          ];

          for (const gallery of galleries) {
            const heading = document.createElement("h3");
            heading.className = "title is-4";
            heading.textContent = gallery.heading;
            heading.style.marginBottom = "20px";
            host.appendChild(heading);

            const grid = document.createElement("div");
            grid.style.display = "grid";
            grid.style.gridTemplateColumns = "repeat(2, 1fr)";
            grid.style.gap = "30px";
            grid.style.justifyItems = "center";
            if (gallery.bottomMargin !== null) {
              grid.style.marginBottom = gallery.bottomMargin;
            }

            for (const id of gallery.ids) {
              grid.appendChild(
                createDelAddPair(gallery.type, id, gallery.captions[id])
              );
            }

            host.appendChild(grid);
          }
        }

        // Build the deletion/addition galleries once the document has been
        // parsed.
        window.addEventListener("DOMContentLoaded", renderDelAddAll);
      </script>
    </section>

    <div style="height: 20px"></div>

    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column is-full-width">
            <h2 class="title is-3">Ablation</h2>

            <h3 class="title is-4">STR-Match (with LaVie) vs Baseline</h3>
            <div class="content has-text-justified">
              <p>
                We compare our method with the baseline method, which uses a
                concatenation of self- and temporal-attention instead of the
                STR score. The following videos show the results of our method and
                the baseline on three different examples. As observed, our
                method effectively changes the object’s shape in a stable
                manner, whereas the baseline fails to do so and exhibits
                flickering artifacts.
              </p>
            </div>

            <div class="columns is-centered has-text-centered">
              <div class="column is-three-fifths">
                <div id="ablationSection"></div>
              </div>
            </div>
          </div>
        </div>
      </div>

      <script>
        // Sub-folders of ./results/comparison/; each one becomes a gallery row.
        const ablationFolders = ["dog_cat_ab", "horse_zebra_ab", "shark_turtle_ab"];

        // Edit description per folder, used as the caption of the source clip.
        const transformationMap = {
          "dog_cat_ab": "Dog → Cat",
          "horse_zebra_ab": "Horse → Zebra",
          "shark_turtle_ab": "Shark → Turtle",
        };

        // Columns of a row, left to right. `name` is the .mp4 base name inside
        // the folder; `label` receives the folder and returns the caption.
        const videoOrder = [
          { name: "source", label: (folder) => `${transformationMap[folder]}` },
          { name: "ab", label: () => "Baseline" },
          { name: "ours_nomask", label: () => "Ours (w/o Mask)" },
        ];

        /**
         * Build one captioned, auto-playing clip for the ablation gallery.
         *
         * @param {string} folder - Sub-folder of ./results/comparison/.
         * @param {string} videoName - Base name of the .mp4 file in that folder.
         * @param {string} label - Caption shown under the clip (plain text).
         * @returns {HTMLDivElement} Detached container holding the video
         *   followed by its caption.
         */
        function createVideoBlock(folder, videoName, label) {
          const video = document.createElement("video");
          video.src = `./results/comparison/${folder}/${videoName}.mp4`;
          Object.assign(video, {
            loop: true,
            muted: true,
            playsInline: true,
            preload: "auto",
            autoplay: true,
            controls: false,
          });
          Object.assign(video.style, {
            width: "170px",
            height: "170px",
            objectFit: "fill",
          });

          video.addEventListener("loadedmetadata", () => {
            // duration / 2 makes one loop last about two seconds whatever the
            // clip length.
            const rate = video.duration / 2;

            // NaN, Infinity and non-positive values are not usable rates;
            // assigning a non-finite number to playbackRate throws.
            if (!Number.isFinite(rate) || rate <= 0) {
              return;
            }

            try {
              video.playbackRate = rate;
            } catch (_unsupported) {
              // The browser rejected the rate as unsupported. Keep the current
              // rate so the clip still plays.
            }
          });

          const caption = document.createElement("div");
          caption.textContent = label;
          caption.style.marginTop = "6px";

          const container = document.createElement("div");
          container.style.textAlign = "center";
          container.appendChild(video);
          container.appendChild(caption);
          return container;
        }

        // Rebuilds #ablationSection: one three-column grid row per entry of
        // ablationFolders, with one clip per entry of videoOrder.
        function renderAblationAll() {
          const host = document.getElementById("ablationSection");
          host.innerHTML = "";

          for (const folder of ablationFolders) {
            const row = document.createElement("div");
            Object.assign(row.style, {
              display: "grid",
              gridTemplateColumns: "repeat(3, 1fr)",
              gap: "10px",
              justifyItems: "center",
              marginBottom: "20px",
            });

            for (const { name, label } of videoOrder) {
              // The caption is computed per folder, so the same column can
              // show a different label on each row.
              row.appendChild(createVideoBlock(folder, name, label(folder)));
            }

            host.appendChild(row);
          }
        }

        // Build the ablation gallery once the document has been parsed.
        window.addEventListener("DOMContentLoaded", renderAblationAll);
      </script>
    </section>
    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column is-full-width">
            <h2 class="title is-3">STR-Match with Zeroscope</h2>

            <div class="content has-text-justified">
              <p>
                While STR-Match is demonstrated using LaVie in our main paper,
                it is compatible with any T2V model equipped with temporal
                modules, such as Zeroscope.
              </p>
            </div>

            <div class="columns is-centered has-text-centered">
              <div class="column is-full-width">
                <div id="ablationSection2"></div>
              </div>
            </div>
          </div>
        </div>
      </div>

      <script>
        // One entry per Zeroscope example: the ./results/zeroscope/ sub-folder
        // and the captions for its source and edited clips. Every source clip
        // carries the same caption, so only the folder and the edit description
        // are listed. The "\n" inside a caption forces a line break because
        // captions are rendered with white-space: pre-wrap.
        const videoPairs2 = [
          ["cat_dog", "Cat → Dog"],
          ["fish_clownfish", "Goldfish → Clownfish"],
          ["rose_tulip", "Red roses →\n Orange Tulips"],
        ].map(([folder, captionOurs]) => ({
          folder,
          captionSource: "Source Video",
          captionOurs,
        }));

        /**
         * Build one captioned, auto-playing clip for the Zeroscope gallery.
         *
         * @param {string} folder - Sub-folder of ./results/zeroscope/.
         * @param {string} filename - Base name of the .mp4 file in that folder.
         * @param {string} label - Caption text; a "\n" in it produces a line
         *   break because the caption uses white-space: pre-wrap.
         * @returns {HTMLDivElement} Detached fixed-height column container
         *   holding the video followed by its caption.
         */
        function createVideoBlock2(folder, filename, label) {
          const video = document.createElement("video");
          video.src = `./results/zeroscope/${folder}/${filename}.mp4`;
          Object.assign(video, {
            loop: true,
            muted: true,
            playsInline: true,
            preload: "auto",
            autoplay: true,
            controls: false,
          });
          Object.assign(video.style, {
            width: "150px",
            height: "150px",
            objectFit: "fill",
          });

          video.addEventListener("loadedmetadata", () => {
            // duration / 2 makes one loop last about two seconds whatever the
            // clip length.
            const rate = video.duration / 2;

            // NaN, Infinity and non-positive values are not usable rates;
            // assigning a non-finite number to playbackRate throws.
            if (!Number.isFinite(rate) || rate <= 0) {
              return;
            }

            try {
              video.playbackRate = rate;
            } catch (_unsupported) {
              // The browser rejected the rate as unsupported. Keep the current
              // rate so the clip still plays.
            }
          });

          const caption = document.createElement("div");
          caption.textContent = label;
          Object.assign(caption.style, {
            marginTop: "6px",
            fontSize: "0.95em",
            textAlign: "center",
            whiteSpace: "pre-wrap",
            maxWidth: "150px",
            lineHeight: "1.2",
          });

          const container = document.createElement("div");
          Object.assign(container.style, {
            textAlign: "center",
            display: "flex",
            flexDirection: "column",
            alignItems: "center",
            height: "240px", // unify total block height
          });
          container.appendChild(video);
          container.appendChild(caption);
          return container;
        }

        // Rebuilds #ablationSection2 as a single six-column grid row: for each
        // entry of videoPairs2, the source clip followed by the edited clip.
        function renderAblationAll2() {
          const host = document.getElementById("ablationSection2");
          host.innerHTML = "";

          const row = document.createElement("div");
          Object.assign(row.style, {
            display: "grid",
            gridTemplateColumns: "repeat(6, 1fr)",
            gap: "15px",
            justifyItems: "center",
            alignItems: "start",
          });

          for (const { folder, captionSource, captionOurs } of videoPairs2) {
            // Create both clips of the pair first, then attach them in
            // left-to-right order.
            const blocks = [
              createVideoBlock2(folder, "source", captionSource),
              createVideoBlock2(folder, "ours_zeroscope", captionOurs),
            ];
            for (const block of blocks) {
              row.appendChild(block);
            }
          }

          host.appendChild(row);
        }

        // Build the Zeroscope gallery once the document has been parsed.
        window.addEventListener("DOMContentLoaded", renderAblationAll2);
      </script>
    </section>
  </body>
</html>
