<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="utf-8" />
    <!-- Search-engine description and Open Graph card (used by link previews). -->
    <meta name="description" content="VITA: Vision-To-Action Flow Matching Policy" />
    <meta property="og:title" content="VITA: Vision-To-Action Flow Matching Policy" />
    <meta property="og:description" content="VITA: Vision-To-Action Flow Matching Policy" />
    <!-- NOTE(review): og:image / twitter:image are relative paths; link-preview
         crawlers generally expect absolute URLs. Confirm the deployed base URL
         and switch to an absolute address if previews show no image. -->
    <meta property="og:image" content="static/images/top.png" />
    <meta property="og:image:width" content="1200" />
    <meta property="og:image:height" content="630" />

    <!-- Twitter card metadata. -->
    <meta name="twitter:title" content="VITA: Vision-To-Action Flow Matching Policy" />
    <meta name="twitter:description" content="Flowing from Vision to Action" />
    <meta name="twitter:image" content="static/images/top.png" />
    <meta name="twitter:card" content="summary_large_image" />
    <meta name="keywords" content="Flow Matching, Imitation Learning, Robotics Manipulation, Policy Learning" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />

    <title>VITA: Vision-To-Action Flow Matching Policy</title>
    <link rel="icon" type="image/x-icon" href="static/images/favicon.ico" />
    <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet" />

    <!-- Stylesheets: Bulma and its plugins, icon fonts, then the project sheet
         (kept last so it can override the framework rules). -->
    <link rel="stylesheet" href="static/css/bulma.min.css" />
    <link rel="stylesheet" href="static/css/bulma-carousel.min.css" />
    <link rel="stylesheet" href="static/css/bulma-slider.min.css" />
    <link rel="stylesheet" href="static/css/fontawesome.all.min.css" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css" />
    <link rel="stylesheet" href="static/css/index.css" />

    <!-- Scripts load synchronously in document order (jQuery first); the order
         is left untouched because later scripts may depend on earlier ones. -->
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
    <script defer src="static/js/fontawesome.all.min.js"></script>
    <script src="static/js/bulma-carousel.min.js"></script>
    <script src="static/js/bulma-slider.min.js"></script>
    <script src="static/js/index.js"></script>

    <style>
        /*
         * Page title intro sequence. Timeline, measured from when the rules apply:
         *   0s   - 1.2s  ::before "water" blob shrinks, rotates and fades out
         *   1.2s - 2.0s  the title itself fades in (title-reveal)
         *   2.0s - 2.4s  ::after underline grows to 120px
         * The title text is painted by clipping the gradient background to the
         * glyphs (background-clip: text + transparent text fill).
         *
         * NOTE(review): "Inter" is not one of the families requested by the
         * Google Fonts <link> in this document (Google Sans, Noto Sans, Castoro);
         * it only renders if it is provided elsewhere or installed locally.
         */
        .publication-title {
            font-family: "Inter", "Google Sans", sans-serif !important;
            font-weight: 700 !important;
            font-size: 2.5rem !important;
            background: linear-gradient(135deg,
                    #1e293b 0%,
                    #6366f1 50%,
                    #8b5cf6 100%) !important;
            -webkit-background-clip: text !important;
            -webkit-text-fill-color: transparent !important;
            background-clip: text !important;
            position: relative;
            /* Hidden until title-reveal runs; "forwards" keeps the final state. */
            opacity: 0;
            animation: title-reveal 0.8s ease-out 1.2s forwards;
            margin-bottom: 1rem !important;
        }

        /* Accent underline, centred under the title; starts at zero width. */
        .publication-title::after {
            content: "";
            position: absolute;
            bottom: -10px;
            left: 50%;
            transform: translateX(-50%);
            width: 0;
            height: 3px;
            background: linear-gradient(135deg, #6366f1, #8b5cf6);
            border-radius: 2px;
            animation: title-underline-grow 0.4s ease-out 2s forwards;
        }

        /*
         * Decorative blob made of five radial gradients over one linear gradient.
         * NOTE(review): this pseudo-element is part of .publication-title, which
         * stays at opacity 0 for the first 1.2s - exactly the span of this
         * animation - so the effect may never actually be visible. Confirm in a
         * browser before relying on it.
         */
        .publication-title::before {
            content: "";
            position: absolute;
            top: 50%;
            left: 50%;
            width: 800px;
            height: 200px;
            transform: translate(-50%, -50%);
            background: radial-gradient(ellipse 100px 30px at 20% 30%,
                    rgba(99, 102, 241, 0.4) 0%,
                    transparent 70%),
                radial-gradient(ellipse 120px 25px at 60% 50%,
                    rgba(168, 85, 247, 0.4) 0%,
                    transparent 70%),
                radial-gradient(ellipse 80px 35px at 80% 20%,
                    rgba(236, 72, 153, 0.4) 0%,
                    transparent 70%),
                radial-gradient(ellipse 90px 20px at 40% 70%,
                    rgba(99, 102, 241, 0.4) 0%,
                    transparent 70%),
                radial-gradient(ellipse 110px 40px at 10% 60%,
                    rgba(168, 85, 247, 0.4) 0%,
                    transparent 70%),
                linear-gradient(45deg,
                    rgba(99, 102, 241, 0.15) 0%,
                    rgba(168, 85, 247, 0.15) 50%,
                    rgba(236, 72, 153, 0.15) 100%);
            background-size: 200px 100px, 180px 80px, 160px 90px, 190px 70px,
                210px 110px, 400px 200px;
            animation: water-flow-to-title 1.2s cubic-bezier(0.4, 0, 0.2, 1) 0s forwards;
            opacity: 1;
            /* Never intercept clicks or text selection on the title. */
            pointer-events: none;
            z-index: 5;
            border-radius: 50%;
        }

        /* Blob shrinks from 3x to 0.3x while rotating one full turn and fading. */
        @keyframes water-flow-to-title {
            0% {
                opacity: 0.8;
                transform: translate(-50%, -50%) scale(3) rotate(0deg);
                background-size: 300px 150px, 280px 130px, 260px 140px, 290px 120px,
                    310px 160px, 600px 300px;
                filter: blur(2px);
            }

            25% {
                opacity: 0.7;
                transform: translate(-50%, -50%) scale(2.2) rotate(90deg);
                background-size: 240px 120px, 220px 100px, 200px 110px, 230px 90px,
                    250px 130px, 500px 250px;
                filter: blur(1.5px);
            }

            50% {
                opacity: 0.6;
                transform: translate(-50%, -50%) scale(1.5) rotate(180deg);
                background-size: 180px 90px, 160px 70px, 140px 80px, 170px 60px,
                    190px 100px, 400px 200px;
                filter: blur(1px);
            }

            75% {
                opacity: 0.4;
                transform: translate(-50%, -50%) scale(0.8) rotate(270deg);
                background-size: 120px 60px, 100px 40px, 80px 50px, 110px 30px,
                    130px 70px, 300px 150px;
                filter: blur(0.5px);
            }

            100% {
                opacity: 0;
                transform: translate(-50%, -50%) scale(0.3) rotate(360deg);
                background-size: 60px 30px, 50px 20px, 40px 25px, 55px 15px, 65px 35px,
                    200px 100px;
                filter: blur(0px);
            }
        }

        /* Title fades in while sliding up 5px. */
        @keyframes title-reveal {
            0% {
                opacity: 0;
                transform: translateY(5px);
            }

            100% {
                opacity: 1;
                transform: translateY(0);
            }
        }

        /* Underline widens from 0 to 120px while fading in. */
        @keyframes title-underline-grow {
            0% {
                width: 0;
                opacity: 0;
            }

            100% {
                width: 120px;
                opacity: 1;
            }
        }

        /*
         * Reduced-motion fallback: skip the intro and show the final state at
         * once. Without this the title would wait 1.2s at opacity 0 and then
         * animate even for users who asked for less motion.
         */
        @media (prefers-reduced-motion: reduce) {
            .publication-title {
                animation: none;
                opacity: 1;
            }

            /* Its resting state is a fully opaque blob, so remove it entirely. */
            .publication-title::before {
                animation: none;
                display: none;
            }

            .publication-title::after {
                animation: none;
                width: 120px;
            }
        }

        /*
         * Call-out card shown under the hero. Most declarations carry !important
         * so they win over the framework stylesheet.
         */
        .highlight-box {
            background: linear-gradient(135deg,
                    rgba(99, 102, 241, 0.12) 0%,
                    rgba(168, 85, 247, 0.12) 50%,
                    rgba(236, 72, 153, 0.12) 100%) !important;
            backdrop-filter: blur(10px) !important;
            /* Thin border all round, then a thicker accent on the left edge. */
            border: 1px solid rgba(99, 102, 241, 0.3) !important;
            border-left: 4px solid #6366f1 !important;
            color: #1e293b !important;
            padding: 1.5rem 2rem !important;
            margin: 1.5rem auto !important;
            font-size: 1.15rem !important;
            font-weight: 500 !important;
            border-radius: 16px !important;
            box-shadow: 0 8px 32px rgba(99, 102, 241, 0.15),
                0 2px 8px rgba(0, 0, 0, 0.08) !important;
            max-width: 1000px !important;
            display: block !important;
            text-align: center !important;
            /* Clips the ::before sheen while it is parked outside the card. */
            overflow: hidden !important;
            transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
            position: relative;
            white-space: normal !important;
            line-height: 1.5 !important;
        }

        /*
         * Hover: lift the card and deepen its shadow. The shadow must be
         * !important as well - the resting shadow above is !important, and an
         * important declaration beats a normal one regardless of specificity,
         * so without the flag the hover shadow would never apply.
         */
        .highlight-box:hover {
            transform: translateY(-2px);
            box-shadow: 0 16px 48px rgba(99, 102, 241, 0.2),
                0 4px 16px rgba(0, 0, 0, 0.1) !important;
        }

        /* Light sheen parked just left of the card; sweeps across on hover. */
        .highlight-box::before {
            content: "";
            position: absolute;
            top: 0;
            left: -100%;
            width: 100%;
            height: 100%;
            background: linear-gradient(90deg,
                    transparent,
                    rgba(255, 255, 255, 0.2),
                    transparent);
            transition: left 0.8s ease;
            pointer-events: none;
        }

        .highlight-box:hover::before {
            left: 100%;
        }

        /* Gradient-filled icon inside the card, with a continuous glow pulse. */
        .highlight-box .icon {
            margin-right: 1rem !important;
            font-size: 1.5rem !important;
            background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
            -webkit-background-clip: text !important;
            -webkit-text-fill-color: transparent !important;
            background-clip: text !important;
            animation: pulse-glow 2s ease-in-out infinite;
            display: inline !important;
        }

        /* Glow strengthens and the icon scales to 1.05 at the midpoint. */
        @keyframes pulse-glow {

            0%,
            100% {
                filter: drop-shadow(0 0 4px rgba(99, 102, 241, 0.3));
                transform: scale(1);
            }

            50% {
                filter: drop-shadow(0 0 8px rgba(99, 102, 241, 0.5));
                transform: scale(1.05);
            }
        }

        /*
         * Inline keyword painted with a three-stop gradient that slides back and
         * forth (keyword-shimmer).
         *
         * The gradient is set through background-image rather than the
         * `background` shorthand. An !important shorthand also sets
         * background-size and background-position as !important; that overrode
         * the normal-priority `background-size: 200% 100%` below and, because
         * important author declarations outrank animations in the cascade, the
         * animated background-position too - so the shimmer never moved.
         */
        .cool-keyword {
            font-weight: 700 !important;
            background-image: linear-gradient(135deg,
                    #6366f1,
                    #8b5cf6,
                    #ec4899) !important;
            -webkit-background-clip: text !important;
            -webkit-text-fill-color: transparent !important;
            background-clip: text !important;
            /* Twice the element width so there is room for the position to move. */
            background-size: 200% 100%;
            animation: keyword-shimmer 3s ease-in-out infinite;
            display: inline !important;
            padding: 0 !important;
            margin: 0 !important;
            border: none !important;
            border-radius: 0 !important;
        }

        .cool-keyword:hover {
            filter: drop-shadow(0 0 4px rgba(99, 102, 241, 0.4));
        }

        /* Slides the oversized gradient from its left edge to its right edge and back. */
        @keyframes keyword-shimmer {

            0%,
            100% {
                background-position: 0% 50%;
            }

            50% {
                background-position: 100% 50%;
            }
        }

        /* Emphasised inline run: semi-bold text filled with a two-stop gradient. */
        .highlight-bold {
            display: inline !important;
            position: relative;
            margin: 0 !important;
            padding: 0 !important;
            border: none !important;
            border-radius: 0 !important;
            font-weight: 600 !important;
            /* The gradient becomes the glyph colour via clip + transparent fill. */
            background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
            -webkit-background-clip: text !important;
            background-clip: text !important;
            -webkit-text-fill-color: transparent !important;
            transition: all 0.3s ease !important;
        }

        /* Soft glow on pointer hover. */
        .highlight-bold:hover {
            filter: drop-shadow(0 0 4px rgba(99, 102, 241, 0.3));
        }

        /* Section headings (.title.is-3) use the same gradient-text technique. */
        .title.is-3 {
            position: relative;
            background: linear-gradient(135deg, #1e293b 0%, #6366f1 100%);
            -webkit-background-clip: text;
            background-clip: text;
            -webkit-text-fill-color: transparent;
        }

        /* VITA animation: outer frame, stage, headings and the pipeline row. */

        /* Frame that centres the stage horizontally and vertically. */
        .vita-animation-container {
            display: flex;
            justify-content: center;
            align-items: center;
            width: 100%;
            margin: 1rem auto;
            padding: 0.3rem;
            border: 1px solid rgba(99, 102, 241, 0.1);
            border-radius: 16px;
            background: rgba(255, 255, 255, 0.02);
        }

        /* Fixed-height canvas; absolutely positioned children are placed against it. */
        .vita-stage {
            position: relative;
            overflow: hidden;
            width: 100%;
            min-width: 600px;
            max-width: 900px;
            height: 280px;
            border: 1px solid rgba(255, 255, 255, 0.1);
            border-radius: 12px;
            background: rgba(255, 255, 255, 0.05);
            backdrop-filter: blur(10px);
        }

        /* Title and subtitle share horizontal centring and stacking level. */
        .vita-title,
        .vita-subtitle {
            position: absolute;
            left: 50%;
            transform: translateX(-50%);
            text-align: center;
            z-index: 100;
        }

        .vita-title {
            top: 10px;
            font-size: clamp(18px, 2.5vw, 24px);
            font-weight: 700;
            color: #1e293b;
        }

        .vita-subtitle {
            top: 40px;
            font-size: clamp(12px, 1.5vw, 16px);
            font-weight: 500;
            color: #64748b;
        }

        /* Horizontal row holding the five pipeline columns. */
        .vita-pipeline {
            position: absolute;
            top: 70px;
            right: 0;
            left: 0;
            height: 140px;
            padding: 0 0.5%;
            display: flex;
            justify-content: center;
            align-items: center;
            gap: 0.1%;
        }

        /* Base column: label stacked under its box, both centred. */
        .vita-section {
            position: relative;
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
            height: 100px;
            transition: all 0.3s ease;
        }

        /* Column widths: outer columns, latent columns, then the wide flow column. */
        .vita-image-section,
        .vita-action-section {
            width: 16%;
            min-width: 80px;
        }

        .vita-latent-image-section,
        .vita-latent-action-section {
            width: 12%;
            min-width: 60px;
        }

        .vita-flow-section {
            width: 28%;
            min-width: 180px;
        }

        /* Every column except the camera image starts hidden and slightly shrunk. */
        .vita-latent-image-section,
        .vita-flow-section,
        .vita-latent-action-section,
        .vita-action-section {
            opacity: 0;
            transform: scale(0.8);
        }

        /* Revealed state; the `visible` class is presumably added by the page script. */
        .vita-latent-image-section.visible,
        .vita-flow-section.visible,
        .vita-latent-action-section.visible,
        .vita-action-section.visible {
            opacity: 1;
            transform: scale(1);
        }

        /* Camera thumbnail frame; the image is centred and cropped to fill it. */
        .vita-robot-image {
            display: flex;
            justify-content: center;
            align-items: center;
            overflow: hidden;
            width: 70px;
            height: 70px;
            border: 2px solid rgba(99, 102, 241, 0.3);
            border-radius: 8px;
            background: #f8fafc;
            transition: all 0.3s ease;
        }

        .vita-robot-image img {
            width: 100%;
            height: 100%;
            border-radius: 6px;
            object-fit: cover;
        }

        .vita-robot-image:hover {
            transform: scale(1.05);
        }

        /* The two latent boxes share geometry; only the tint differs. */
        .vita-latent-container,
        .vita-latent-action-container {
            position: relative;
            overflow: hidden;
            width: 50px;
            height: 70px;
            border-radius: 8px;
        }

        /* Blue tint: latent images. */
        .vita-latent-container {
            border: 2px solid rgba(59, 130, 246, 0.4);
            background: linear-gradient(180deg,
                    rgba(59, 130, 246, 0.1) 0%,
                    rgba(59, 130, 246, 0.05) 100%);
        }

        /* Green tint: latent actions. */
        .vita-latent-action-container {
            border: 2px solid rgba(16, 185, 129, 0.4);
            background: linear-gradient(180deg,
                    rgba(16, 185, 129, 0.1) 0%,
                    rgba(16, 185, 129, 0.05) 100%);
        }

        /* Flow channel: open at both ends, with a slowly shifting background. */
        .vita-flow-pipeline {
            position: relative;
            overflow: hidden;
            width: 100%;
            height: 70px;
            border-top: 2px solid rgba(139, 92, 246, 0.4);
            border-bottom: 2px solid rgba(139, 92, 246, 0.4);
            border-radius: 8px;
            background: linear-gradient(90deg,
                    rgba(59, 130, 246, 0.15) 0%,
                    rgba(139, 92, 246, 0.15) 50%,
                    rgba(34, 197, 94, 0.15) 100%);
            background-size: 200% 200%;
            animation: vita-pipeline-shimmer 4s ease-in-out infinite;
        }

        /* Moves the oversized background to the right and back again. */
        @keyframes vita-pipeline-shimmer {

            0%,
            100% {
                background-position: 0% 50%;
            }

            50% {
                background-position: 100% 50%;
            }
        }

        /* Gradient rails along the top and bottom edges of the channel. */
        .vita-flow-pipeline::before,
        .vita-flow-pipeline::after {
            content: "";
            position: absolute;
            right: 0;
            left: 0;
            height: 2px;
            background: linear-gradient(90deg,
                    rgba(59, 130, 246, 0.6) 0%,
                    rgba(139, 92, 246, 0.8) 33%,
                    rgba(34, 197, 94, 0.8) 66%,
                    rgba(16, 185, 129, 0.6) 100%);
        }

        .vita-flow-pipeline::before {
            top: -2px;
        }

        .vita-flow-pipeline::after {
            bottom: -2px;
        }

        /* Red-tinted box that hosts the SVG action curve. */
        .vita-action-display {
            position: relative;
            overflow: hidden;
            width: 70px;
            height: 70px;
            border: 2px solid rgba(245, 101, 101, 0.3);
            border-radius: 8px;
            background: linear-gradient(180deg,
                    rgba(245, 101, 101, 0.1) 0%,
                    rgba(239, 68, 68, 0.05) 100%);
        }

        /* Base particle: a 4px dot, invisible until something raises its opacity. */
        .vita-particle {
            position: absolute;
            width: 4px;
            height: 4px;
            border-radius: 50%;
            opacity: 0;
            pointer-events: none;
            transition: all 0.6s ease-in-out;
        }

        /* Vision, latent-image and flow particles share the blue glow. */
        .vita-vision-particle,
        .vita-latent-image-particle,
        .vita-flow-particle {
            background: radial-gradient(circle, #60a5fa, #3b82f6);
            box-shadow: 0 0 5px rgba(96, 165, 250, 0.8);
        }

        /* Flow particles replace the base easing (declared after .vita-particle). */
        .vita-flow-particle {
            transition: all 0.6s cubic-bezier(0.4, 0, 0.2, 1);
        }

        /* Latent-action particles use the green glow. */
        .vita-latent-action-particle {
            background: radial-gradient(circle, #34d399, #10b981);
            box-shadow: 0 0 5px rgba(52, 211, 153, 0.8);
        }

        /* Caption under each pipeline box. */
        .vita-label {
            max-width: 100%;
            margin-top: 6px;
            font-size: clamp(11px, 1.2vw, 14px);
            font-weight: 700;
            text-align: center;
            word-wrap: break-word;
            color: #475569;
            opacity: 0.9;
        }

        /* SVG overlay that fills the action box and ignores pointer input. */
        .vita-action-path {
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            pointer-events: none;
        }

        /* Amber trajectory stroke; starts fully transparent. */
        .vita-action-curve {
            fill: none;
            stroke: #f59e0b;
            stroke-width: 2;
            stroke-dasharray: 0;
            opacity: 0;
            filter: drop-shadow(0 0 3px rgba(245, 158, 11, 0.6));
        }

        /* Amber dot; the radius is set from CSS via the `r` geometry property. */
        .vita-action-point {
            r: 2.5;
            fill: #f59e0b;
            opacity: 0;
            filter: drop-shadow(0 0 2px rgba(245, 158, 11, 0.8));
        }

        /* Restart button, centred along the bottom edge of the stage. */
        .vita-control-button {
            position: absolute;
            bottom: 10px;
            left: 50%;
            transform: translateX(-50%);
            z-index: 101;
            padding: 10px 20px;
            border: none;
            border-radius: 16px;
            background: linear-gradient(135deg, #6366f1, #8b5cf6);
            box-shadow: 0 2px 8px rgba(99, 102, 241, 0.3);
            color: white;
            font-size: 14px;
            font-weight: 600;
            cursor: pointer;
            transition: all 0.3s ease;
        }

        /* The hover lift repeats translateX(-50%) so the button stays centred. */
        .vita-control-button:hover {
            transform: translateX(-50%) translateY(-1px);
            box-shadow: 0 3px 12px rgba(99, 102, 241, 0.4);
        }

        /* Single-line status pill placed just above the button. */
        .vita-phase-indicator {
            position: absolute;
            bottom: 55px;
            left: 50%;
            transform: translateX(-50%);
            z-index: 100;
            padding: 4px 10px;
            border-radius: 8px;
            background-color: rgba(241, 245, 249, 0.7);
            color: #475569;
            font-size: 13px;
            font-weight: 600;
            text-align: center;
            white-space: nowrap;
            transition: all 0.5s ease-in-out;
        }

        /* Draws a stroke progressively by growing the dash from 0 to 250 units. */
        @keyframes vita-drawCurve {
            0% {
                stroke-dasharray: 0, 250;
            }

            100% {
                stroke-dasharray: 250, 0;
            }
        }

        /* Fade-ins; the start value comes from the element's own opacity. */
        @keyframes vita-pointAppear {
            100% {
                opacity: 1;
            }
        }

        @keyframes vita-curveAppear {
            100% {
                opacity: 1;
            }
        }
    </style>
</head>

<body>
    <!-- Hero: page title (animated via .publication-title) and author line. -->
    <section class="hero">
        <div class="hero-body">
            <div class="container is-fullhd">
                <div class="columns is-centered">
                    <div class="column has-text-centered">
                        <h1 class="title is-1 publication-title">VITA: Vision-To-Action Flow Matching Policy</h1>
                        <div class="is-size-5 publication-authors">
                            <span class="author-block">Anonymous Authors</span>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </section>

    <!-- One-sentence summary card; the bolt icon is decorative, so it is hidden
         from assistive technology. -->
    <div class="highlight-box">
        <span class="icon" aria-hidden="true"><i class="fas fa-bolt"></i></span>
        VITA is an efficient and performant policy
        that directly flows from <span class="cool-keyword">latent images</span> to
        <span class="cool-keyword">latent actions</span>—<br />without sampling
        from <span class="cool-keyword">Gaussian noise</span> or relying on
        <span class="cool-keyword">conditioning modules</span>.
    </div>

    <!--
        VITA animation section.
        Five pipeline columns, left to right: camera image, latent images, flow
        matching, latent actions, action sequence. The empty containers and the
        ids on them are presumably populated and driven by the page script; the
        Restart button calls startVitaAnimation(), which is defined outside this
        markup.
    -->
    <section class="section">
        <div class="container is-max-widescreen">
            <div class="columns is-centered">
                <div class="column is-full-width">
                    <div class="vita-animation-container">
                        <div class="vita-stage">
                            <div class="vita-title">
                                VITA: Vision-to-Action Flow Matching
                            </div>
                            <div class="vita-subtitle">
                                Noise-Free, Conditioning-Free Policy Learning
                            </div>

                            <div class="vita-pipeline">
                                <div class="vita-section vita-image-section">
                                    <div class="vita-robot-image">
                                        <img src="static/images/cam_image.png" alt="Camera Image" />
                                    </div>
                                    <div class="vita-label">Camera Image</div>
                                </div>

                                <div class="vita-section vita-latent-image-section" id="vitaLatentImageSection">
                                    <div class="vita-latent-container" id="vitaLatentImages"></div>
                                    <div class="vita-label">Latent Images</div>
                                </div>

                                <div class="vita-section vita-flow-section" id="vitaFlowSection">
                                    <div class="vita-flow-pipeline" id="vitaFlowPipeline"></div>
                                    <div class="vita-label">Flow Matching</div>
                                </div>

                                <div class="vita-section vita-latent-action-section" id="vitaLatentActionSection">
                                    <div class="vita-latent-action-container" id="vitaLatentActions"></div>
                                    <div class="vita-label">Latent Actions</div>
                                </div>

                                <div class="vita-section vita-action-section" id="vitaActionSection">
                                    <div class="vita-action-display" id="vitaActionDisplay">
                                        <!-- Decorative trajectory; the adjacent label names it. -->
                                        <svg class="vita-action-path" viewBox="0 0 100 100" aria-hidden="true">
                                            <path class="vita-action-curve" id="vitaActionCurve"
                                                d="M 15,80 Q 30,25 50,55 Q 70,85 85,35" />
                                        </svg>
                                    </div>
                                    <div class="vita-label">Action Sequence</div>
                                </div>
                            </div>

                            <div class="vita-phase-indicator" id="vitaPhaseText">
                                Ready to start VITA flow
                            </div>
                            <!-- type="button": an in-page action that must never submit a form. -->
                            <button class="vita-control-button" type="button" onclick="startVitaAnimation()">
                                Restart
                            </button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </section>

    <!-- Abstract: "What is VITA?" heading and a single justified paragraph. The
         two .highlight-bold spans mark the key claims; Bulma's is-light gives
         the section its tinted background. -->
    <section class="section hero is-light">
        <div class="container is-max-desktop">
            <div class="columns is-centered has-text-centered">
                <div class="column is-four-fifths">
                    <h2 class="title is-3">What is VITA?</h2>
                    <div class="content has-text-justified">
                        <p>
                            We present VITA, a VIsion-To-Action flow matching policy that
                            evolves latent visual representations into latent actions via
                            flow matching for visuomotor control. Conventional flow matching
                            and diffusion policies face a fundamental inefficiency: they
                            sample from standard source distributions (e.g., Gaussian noise)
                            and then require additional conditioning mechanisms, such as
                            cross-attention, to repeatedly inject visual inputs at each generation step,
                            incurring time and space overheads.
                            <span class="highlight-bold">We propose VITA, a novel paradigm that treats latent images
                                as the source of the flow, and learns an inherent mapping from
                                vision to action. Because the source of the flow is visually grounded,
                                VITA eliminates the need for repeated conditioning during generation.
                            </span>
                            We evaluate VITA on 9 simulation and 5 real-world tasks from ALOHA and Robomimic.
                            Despite its simplicity, VITA outperforms or matches
                            state-of-the-art policies, while speeding up inference
                            by 1.5x to 2x. VITA inherently enables simpler architectures such as MLPs. To our knowledge,
                            VITA is the first <span class="highlight-bold">MLP-only flow matching policy capable of solving complex
                                bi-manual manipulation tasks like those in ALOHA benchmarks.</span>
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </section>

    <!-- System diagram: framework figure followed by a short caption paragraph. -->
    <section class="section hero">
        <div class="container is-max-widescreen">
            <div class="columns is-centered has-text-centered">
                <div class="column is-full-width">
                    <h2 class="title is-3">VITA Framework</h2>

                    <!-- alt gives the figure a text alternative for screen readers
                         and for when the image fails to load. -->
                    <img src="static/images/top.png" class="interpolation-image"
                        alt="Overview of the VITA framework: a flow from latent visual representations to latent actions" />
                    <br />
                    <br />

                    <div class="content has-text-justified">
                        <p>
                            VITA learns a continuous flow from latent visual representations
                            to latent actions. Because the source of the flow is visually grounded,
                            VITA eliminates the need for repeated conditioning during generation.
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </section>

    <section class="section hero is-light">
        <div class="container is-max-widescreen">
            <div class="columns is-centered has-text-centered">
                <div class="column is-full-width">
                    <h2 class="title is-3">VITA Denoising</h2>

                    <div class="content has-text-justified">
                        <p>
                            Comparison of the denoising process between conventional flow matching and VITA.
                            Conventional flow matching denoises random Gaussian noise into actions, whereas VITA
                            flows from latent images to latent actions.
                            <b>We found that through VITA learning, latent images manifest action semantics.
                                The latent image can be decoded into a smooth trajectory and progressively refined by the ODE process.</b>
                        </p>
                    </div>

                    <!-- alt added so the figure has a text alternative. -->
                    <img src="static/images/denoising.png" class="interpolation-image"
                        alt="Denoising process of conventional flow matching compared with VITA" />
                    <br />
                    <br />
                </div>
            </div>
        </div>
    </section>

    <section class="section hero is-light">
        <div class="container is-max-widescreen">
            <div class="columns is-centered has-text-centered">
                <div class="column is-full-width">
                    <h2 class="title is-3">Addressing Latent Collapse of E2E Latent Flow Matching Training</h2>

                    <div class="content has-text-justified">
                        <p>
                            Unlike latent diffusion for image generation, where the target latent space can be trained on abundant image data,
                            action data is sparse and limited, so the target latent space is hard to pre-train well and freeze as the target for flow matching.
                            Naively training flow matching end-to-end together with the target latent space leads to latent collapse (Figure (a)).
                            We are the first to identify the cause of this issue as the training/test-time gap between encoder-based latents and ODE-generated latents.
                            We propose <b>flow latent decoding (FLD)</b>, which backpropagates through the flow ODE solver during training and closes the gap by anchoring latent representations to ground-truth targets.
                        </p>
                    </div>

                    <!-- alt added so the figure has a text alternative.
                         NOTE(review): the file name is spelled "collapase"; it is left
                         unchanged because the asset on disk presumably uses the same
                         spelling. Confirm before renaming. -->
                    <img src="static/images/latent_collapase.png" class="interpolation-image"
                        alt="Figure illustrating latent collapse in end-to-end latent flow matching training" />
                    <br />
                    <br />
                </div>
            </div>
        </div>
    </section>

    <section class="section hero is-light">
        <div class="container is-max-widescreen">
            <div class="columns is-centered has-text-centered">
                <div class="column is-full-width">
                    <h2 class="title is-3">Efficiency</h2>
                    <img src="static/images/efficiency.png" class="interpolation-image" />
                    <br />
                    <br />

                    <div class="content has-text-justified">
                        <p>
                            The table compares the inference latency and peak memory usage of different flow matching policies
                            when using vector-based (Vector) representations or grid-based representations (Grid) for visual features.
                            VITA achieves 1.5x to 2x faster inference speed and reduces memory usage by 18.6% to 28.7%.
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </section>


    <section class="section hero is-light">
        <div class="container is-max-widescreen">
            <div class="columns is-centered has-text-centered">
                <div class="column is-full-width">
                    <h2 class="title is-3">Success Rates</h2>
                    <img src="static/images/perf.png" class="interpolation-image" />
                    <br />
                    <br />

                    <div class="content has-text-justified">
                        <p>
                            We evaluate VITA on challenging bimanual and single-arm manipulation tasks,
                            including 9 simulation and 5 real-world tasks on ALOHA, AV-ALOHA, and Robomimic.
                            The <b>MLP-only VITA</b> consistently outperforms or matches state-of-the-art 
                            policies (including transformer-based conventional flow matching
                            policy), while being significantly more efficient.
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </section>


    <!-- Real-World Tasks -->
    <section class="hero is-small">
        <div class="hero-body">
            <div class="container">
                <h2 class="title is-3">
                    VITA Demos: Real-World Tasks
                </h2>

                <!-- Phone Camera Views (First Row) -->
                    <div class="columns is-centered">
                        <div class="column is-half">
                        <div class="content has-text-justified">
                            <h2 class="title is-3" align="left">
                                VITA demonstrates robustness to online perturbations.
                            </h2>
                        </div>
                            <video poster="" autoplay controls muted loop height="100%">
                                <source src="static/videos/online_perturb.mp4" type="video/mp4" />
                            </video>
                            <h4 class="subtitle has-text-centered">Online Perturbations</h4>
                        </div>
                        <div class="column is-half">
                    <div class="content has-text-justified">
                        <h2 class="title is-3" align="left">
                            VITA demonstrates generalization to unseen objects.
                        </h2>
                    </div>
                            <video poster="" autoplay controls muted loop height="100%">
                                <source src="static/videos/unseen_object.mp4" type="video/mp4" />
                            </video>
                            <h4 class="subtitle has-text-centered">Unseen Objects</h4>
                        </div>
                    </div>





                <h2 class="title is-3">Bimanual Tasks with Active Vision</h2>
                    <p>
                        Two challenging bimanual manipulation tasks on <a href="https://soltanilara.github.io/av-aloha/" target="_blank" rel="noopener noreferrer">AV-ALOHA</a> with
                        an additional 7-DoF arm carrying an active vision camera. The robot must predict and reach the
                        best viewpoint to avoid occlusions and increase precision.
                    </p>
                <div class="columns is-centered">
                    <div class="column is-half">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/hidden_pick_active.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Hidden Pick</h4>
                    </div>
                    <div class="column is-half">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/transfer_from_box_active.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Transfer From Box</h4>
                    </div>
                </div>
                </div>
        </div>
    </section>

    <!-- Tasks -->
    <section class="hero is-small is-light">
        <div class="hero-body">
            <div class="container">
                <h2 class="title is-3">
                    VITA Demos: Real-World and Simulation Tasks
                </h2>

                <!-- Third Row of Videos -->
                <div class="columns is-centered">
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/hidden_pick.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Hidden Pick</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/transfer_from_box.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Transfer From Box</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/toothbrush.mp4" type="video/mp4" />
                        </video>
                    </div>
                </div>

                <!-- Fourth Row of Videos -->
                <div class="columns is-centered">
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/pick_ball.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Pick Ball</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/store_drawer.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Store Drawer</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop width="100%"
                            style="aspect-ratio: 16/9; object-fit: cover">
                            <source src="static/videos/thread_needle.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Thread Needle</h4>
                    </div>
                </div>

                <!-- First Row of Simulation Videos -->
                <div class="columns is-centered">
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop width="100%"
                            style="aspect-ratio: 16/9; object-fit: cover">
                            <source src="static/videos/pour_test_tube.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Pour Test Tube</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop width="100%"
                            style="aspect-ratio: 16/9; object-fit: cover">
                            <source src="static/videos/hook_package.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Hook Package</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop width="100%"
                            style="aspect-ratio: 16/9; object-fit: cover">
                            <source src="static/videos/slot_insertion.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Slot Insertion</h4>
                    </div>
                </div>

                <!-- Second Row of Simulation Videos -->
                <div class="columns is-centered">
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop width="100%"
                            style="aspect-ratio: 16/9; object-fit: cover">
                            <source src="static/videos/transfer_cube.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Transfer Cube</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop height="100%">
                            <source src="static/videos/square.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">Square</h4>
                    </div>
                    <div class="column is-one-third">
                        <video poster="" autoplay controls muted loop width="100%"
                            style="aspect-ratio: 16/9; object-fit: cover">
                            <source src="static/videos/pusht.mp4" type="video/mp4" />
                        </video>
                        <h4 class="subtitle has-text-centered">PushT</h4>
                    </div>
                </div>

            </div>
        </div>
    </section>

    <section class="section hero is-light">
        <div class="container is-max-widescreen">
            <div class="columns is-centered has-text-centered">
                <div class="column is-full-width">
                    <h2 class="title is-3">Training Efficiency</h2>
                    <img src="static/images/action_mse.png" class="interpolation-image" />
                    <br />
                    <br />

                    <div class="content has-text-justified">
                        <p>
                            VITA converges faster than other policies. We compare the action MSE curves of VITA, FM, DP, and ACT on three real-world tasks. VITA consistently converges faster and to lower errors.
                            ACT plateaus early; DP and FM converge more slowly.
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </section>

    <script>
        // VITA Animation JavaScript
        // Shared state for the pipeline animation driven by startVitaAnimation().
        // True while startVitaAnimation() is running; it returns early when this is set.
        let vitaIsAnimating = false;
        // Every particle element created by vitaCreateParticle(), in creation order.
        let vitaAllParticles = [];
        // Handle of the most recently scheduled setTimeout; vitaClearAll() cancels it.
        let vitaAnimationTimeout = null;
        // Particle {x, y} positions saved at the end of vitaAnimateFlowMatching() and
        // reused by vitaMoveParticlesToContainer() when preserveLayout is true.
        let lastPositions = [];

        /**
         * Evaluate the action path at parameter t.
         *
         * The path is made of two quadratic Bézier segments that meet at (50, 55):
         * t <= 0.5 is mapped onto the first segment, anything else onto the second.
         *
         * @param {number} t - Position along the whole path (0 = start, 1 = end).
         * @returns {{x: number, y: number}} The point on the path.
         */
        function vitaGetPointOnCurve(t) {
            // One coordinate of a quadratic Bézier: (1-u)^2*p0 + 2(1-u)u*p1 + u^2*p2.
            const bezier = (u, p0, p1, p2) =>
                Math.pow(1 - u, 2) * p0 + 2 * (1 - u) * u * p1 + Math.pow(u, 2) * p2;

            const onFirstSegment = t <= 0.5;
            // Rescale t so that each half of the path runs over [0, 1].
            const u = onFirstSegment ? t * 2 : (t - 0.5) * 2;
            const [xControls, yControls] = onFirstSegment
                ? [[15, 30, 50], [80, 25, 55]]
                : [[50, 70, 85], [55, 85, 35]];

            return {
                x: bezier(u, ...xControls),
                y: bezier(u, ...yControls),
            };
        }

        /**
         * Create a particle <div>, append it to `container`, and record it in
         * vitaAllParticles.
         *
         * @param {string} className - Class added after the base "vita-particle" class.
         * @param {Element} container - Element the particle is appended to.
         * @returns {HTMLDivElement} The newly created particle.
         */
        function vitaCreateParticle(className, container) {
            const dot = document.createElement("div");
            dot.className = "vita-particle " + className;
            // appendChild returns the node it appended, so this registers `dot`.
            vitaAllParticles.push(container.appendChild(dot));
            return dot;
        }

        /**
         * Reset the animation to its initial state: cancel the pending timer,
         * remove all particles and action points, hide the curve, and remove the
         * "visible" class from every pipeline section.
         */
        function vitaClearAll() {
            clearTimeout(vitaAnimationTimeout);

            for (const particle of vitaAllParticles) {
                particle.remove();
            }
            vitaAllParticles = [];
            lastPositions = [];

            const curve = document.getElementById("vitaActionCurve");
            const stalePoints = document.querySelectorAll(".vita-action-point");
            if (curve) {
                curve.style.animation = "none";
                curve.style.opacity = "0";
            }
            for (const point of stalePoints) {
                point.remove();
            }

            // Sections revealed step by step during a run; hide them all again.
            const sectionIds = [
                "vitaLatentImageSection",
                "vitaFlowSection",
                "vitaLatentActionSection",
                "vitaActionSection",
            ];
            for (const id of sectionIds) {
                document.getElementById(id).classList.remove("visible");
            }
        }

        /**
         * Set the phase label text and position it horizontally.
         *
         * @param {string} text - Text shown in the #vitaPhaseText element.
         * @param {Element|null} [sectionElement=null] - When given, the label's
         *   `left` is set to the section's horizontal center measured from the left
         *   edge of `.vita-stage`; otherwise `left` is set to "50%".
         */
        function vitaUpdatePhase(text, sectionElement = null) {
            const label = document.getElementById("vitaPhaseText");
            label.textContent = text;

            if (!sectionElement) {
                label.style.left = "50%";
                return;
            }

            const stageBox = document
                .querySelector(".vita-stage")
                .getBoundingClientRect();
            const sectionBox = sectionElement.getBoundingClientRect();
            const centerX = sectionBox.left + sectionBox.width / 2 - stageBox.left;
            label.style.left = `${centerX}px`;
        }

        /**
         * Add the "visible" class to the element with the given id.
         *
         * @param {string} sectionId - Id of the section element to reveal.
         */
        function vitaShowSection(sectionId) {
            const section = document.getElementById(sectionId);
            section.classList.add("visible");
        }

        /**
         * Pick `count` random points inside `container`, inset 6px from each edge,
         * expressed relative to the top-left corner of `.vita-stage`.
         *
         * @param {Element} container - Element whose bounding box is sampled.
         * @param {number} count - Number of positions to generate.
         * @returns {Array<{x: number, y: number}>} Stage-relative positions.
         */
        function vitaGetRandomPositions(container, count) {
            const box = container.getBoundingClientRect();
            const stageBox = document
                .querySelector(".vita-stage")
                .getBoundingClientRect();

            // Top-left corner of the sampling area, in stage coordinates.
            const originX = box.left - stageBox.left + 6;
            const originY = box.top - stageBox.top + 6;
            const spanX = box.width - 12;
            const spanY = box.height - 12;

            const spots = [];
            while (spots.length < count) {
                const x = originX + Math.random() * spanX;
                const y = originY + Math.random() * spanY;
                spots.push({ x, y });
            }
            return spots;
        }

        /**
         * Spawn 10 vision particles at random positions over `.vita-robot-image`
         * and fade them in one after another, 50ms apart.
         */
        function vitaCreateVisionParticles() {
            const source = document.querySelector(".vita-robot-image");
            const spots = vitaGetRandomPositions(source, 10);

            for (const [index, spot] of spots.entries()) {
                const dot = vitaCreateParticle(
                    "vita-vision-particle",
                    document.querySelector(".vita-stage")
                );
                dot.style.left = `${spot.x}px`;
                dot.style.top = `${spot.y}px`;

                // Stagger the fade-in; only the last handle stays in vitaAnimationTimeout.
                vitaAnimationTimeout = setTimeout(() => {
                    dot.style.opacity = "1";
                }, index * 50);
            }
        }

        /**
         * Move every particle into `targetContainer` and switch its class.
         *
         * @param {Element} targetContainer - Element the particles move into.
         * @param {string} newClass - Class applied after the base "vita-particle" class.
         * @param {boolean} [preserveLayout=false] - When true and lastPositions is
         *   non-empty, the saved layout is translated so its centroid lands on the
         *   container's center; otherwise fresh random positions are used.
         * @returns {Promise<void>} Resolves 600ms after the new positions are applied.
         */
        function vitaMoveParticlesToContainer(
            targetContainer,
            newClass,
            preserveLayout = false
        ) {
            return new Promise((resolve) => {
                const targetBox = targetContainer.getBoundingClientRect();
                const stageBox = document
                    .querySelector(".vita-stage")
                    .getBoundingClientRect();

                let destinations;

                if (!preserveLayout || lastPositions.length === 0) {
                    destinations = vitaGetRandomPositions(
                        targetContainer,
                        vitaAllParticles.length
                    );
                } else {
                    // Centroid of the saved particle cloud.
                    let sumX = 0;
                    let sumY = 0;
                    for (const saved of lastPositions) {
                        sumX += saved.x;
                        sumY += saved.y;
                    }
                    const centroidX = sumX / lastPositions.length;
                    const centroidY = sumY / lastPositions.length;

                    // Center of the target container in stage coordinates.
                    const centerX = targetBox.left - stageBox.left + targetBox.width / 2;
                    const centerY = targetBox.top - stageBox.top + targetBox.height / 2;

                    const shiftX = centerX - centroidX;
                    const shiftY = centerY - centroidY;

                    destinations = lastPositions.map((saved) => ({
                        x: saved.x + shiftX,
                        y: saved.y + shiftY,
                    }));
                }

                for (const [index, particle] of vitaAllParticles.entries()) {
                    const destination = destinations[index];
                    particle.style.left = `${destination.x}px`;
                    particle.style.top = `${destination.y}px`;
                    particle.className = "vita-particle " + newClass;
                }

                vitaAnimationTimeout = setTimeout(resolve, 600);
            });
        }

        /**
         * Animate the particles through #vitaFlowPipeline in 3 steps, 600ms apart.
         *
         * At each step every particle advances from its `data-initial-x` value
         * toward the pipeline's right edge with random jitter, and its color is
         * blended from rgb(96, 165, 250) toward rgb(52, 211, 153). Once the steps
         * are done, the final positions are saved to lastPositions.
         *
         * Each particle's dataset.initialX must be set before this is called.
         *
         * @returns {Promise<void>} Resolves after the last step's delay has elapsed.
         */
        function vitaAnimateFlowMatching() {
            return new Promise((resolve) => {
                const STEP_COUNT = 3;
                const STEP_DELAY_MS = 600;

                const pipeBox = document
                    .getElementById("vitaFlowPipeline")
                    .getBoundingClientRect();
                const stageBox = document
                    .querySelector(".vita-stage")
                    .getBoundingClientRect();

                // Pipeline bounds in stage coordinates.
                const leftEdge = pipeBox.left - stageBox.left;
                const rightEdge = pipeBox.right - stageBox.left;
                const midY = pipeBox.top + pipeBox.height / 2 - stageBox.top;

                const clamp = (value, low, high) =>
                    Math.max(low, Math.min(value, high));

                const runStep = (step) => {
                    if (step > STEP_COUNT) {
                        // Finished: remember where the particles ended up.
                        lastPositions = vitaAllParticles.map((particle) => ({
                            x: parseFloat(particle.style.left),
                            y: parseFloat(particle.style.top),
                        }));
                        resolve();
                        return;
                    }

                    const progress = step / STEP_COUNT;

                    // Linear blend of one color channel between start and end colors.
                    const blend = (from, to) =>
                        Math.round(from * (1 - progress) + to * progress);
                    const r = blend(96, 52);
                    const g = blend(165, 211);
                    const b = blend(250, 153);
                    const innerColor = `rgb(${r}, ${g}, ${b})`;
                    const outerColor = `rgb(${Math.round(r * 0.8)}, ${Math.round(
                        g * 0.8
                    )}, ${Math.round(b * 0.8)})`;

                    for (const particle of vitaAllParticles) {
                        const initialX = parseFloat(particle.dataset.initialX);
                        const travel = rightEdge - 20 - initialX;
                        const baseX = initialX + travel * progress;

                        // Horizontal jitter widens as the flow progresses.
                        const jitteredX = baseX + (Math.random() - 0.5) * 60 * progress;
                        const x = clamp(jitteredX, leftEdge + 5, rightEdge - 5);

                        const jitteredY =
                            midY + (Math.random() - 0.5) * (pipeBox.height - 12);
                        const y = clamp(
                            jitteredY,
                            pipeBox.top - stageBox.top + 5,
                            pipeBox.bottom - stageBox.top - 5
                        );

                        particle.style.left = `${x}px`;
                        particle.style.top = `${y}px`;
                        particle.style.background = `radial-gradient(circle, ${innerColor}, ${outerColor})`;
                        particle.style.boxShadow = `0 0 5px rgba(${r}, ${g}, ${b}, 0.8)`;
                    }

                    vitaAnimationTimeout = setTimeout(
                        () => runStep(step + 1),
                        STEP_DELAY_MS
                    );
                };

                runStep(1);
            });
        }

        /**
         * Remove every particle carrying the "vita-latent-action-particle" class
         * from the DOM and drop it from vitaAllParticles; other particles are kept.
         */
        function vitaClearFlowParticles() {
            const survivors = [];
            for (const particle of vitaAllParticles) {
                if (particle.classList.contains("vita-latent-action-particle")) {
                    particle.remove();
                } else {
                    survivors.push(particle);
                }
            }
            vitaAllParticles = survivors;
        }

        /**
         * Draw the decoded action sequence: add 7 SVG circles along the curve
         * returned by vitaGetPointOnCurve(), reveal the points 100ms apart, then
         * start the curve's appear/draw animations 300ms after the last point.
         */
        function vitaCreateActionSequence() {
            const SVG_NS = "http://www.w3.org/2000/svg";
            const POINT_COUNT = 7;

            const svg = document.querySelector(".vita-action-path");
            const curve = document.getElementById("vitaActionCurve");

            for (let k = 0; k < POINT_COUNT; k++) {
                const { x, y } = vitaGetPointOnCurve(k / (POINT_COUNT - 1));

                const dot = document.createElementNS(SVG_NS, "circle");
                dot.setAttribute("class", "vita-action-point");
                dot.setAttribute("cx", x);
                dot.setAttribute("cy", y);
                svg.appendChild(dot);
            }

            // Selects every action point in the document, not only those just added.
            const allPoints = document.querySelectorAll(".vita-action-point");

            let delay = 0;
            for (const point of allPoints) {
                vitaAnimationTimeout = setTimeout(() => {
                    point.style.animation = "vita-pointAppear 300ms ease-out forwards";
                }, delay);
                delay += 100;
            }

            vitaAnimationTimeout = setTimeout(() => {
                curve.style.animation =
                    "vita-curveAppear 400ms ease-out forwards, vita-drawCurve 800ms ease-out forwards";
            }, allPoints.length * 100 + 300);
        }

        /**
         * Play the full VITA pipeline animation once:
         * encode images -> latent images -> flow -> latent actions -> action sequence.
         *
         * Does nothing if a run is already in progress. The control button shows
         * "Running..." during the run and "Restart" afterwards. The running flag
         * and the button are restored in a `finally` block, so a failure part-way
         * through (e.g. a missing element) no longer leaves the animation locked.
         *
         * @returns {Promise<void>} Resolves when the run finishes; rejects if any
         *   stage throws.
         */
        async function startVitaAnimation() {
            if (vitaIsAnimating) return;

            const controlButton = document.querySelector(".vita-control-button");

            // Wait `ms` milliseconds, keeping the timer handle in vitaAnimationTimeout
            // so vitaClearAll() can cancel it.
            const pause = (ms) =>
                new Promise((r) => (vitaAnimationTimeout = setTimeout(r, ms)));

            vitaIsAnimating = true;
            if (controlButton) controlButton.textContent = "Running...";

            try {
                vitaClearAll();

                // Stage 1: vision particles appear over the camera image.
                const imageSection = document.querySelector(".vita-image-section");
                vitaUpdatePhase("Encoding camera images", imageSection);
                vitaCreateVisionParticles();
                await pause(800);

                // Stage 2: particles move into the latent-image container.
                const latentImageSection = document.getElementById(
                    "vitaLatentImageSection"
                );
                vitaUpdatePhase("Latent images", latentImageSection);
                vitaShowSection("vitaLatentImageSection");

                const latentImageContainer =
                    document.getElementById("vitaLatentImages");
                await vitaMoveParticlesToContainer(
                    latentImageContainer,
                    "vita-latent-image-particle"
                );

                await pause(600);

                // --- Start: translate particles to the flow start ---
                const flowSection = document.getElementById("vitaFlowSection");
                vitaUpdatePhase("Flowing from latent images to latent action...", flowSection);
                vitaShowSection("vitaFlowSection");
                const flowPipeline = document.getElementById("vitaFlowPipeline");
                const pipelineRect = flowPipeline.getBoundingClientRect();
                const latentImageRect = latentImageContainer.getBoundingClientRect();

                // Horizontal shift applied to every particle, derived from the
                // pipeline's left edge and width and the latent-image container's
                // left edge and width.
                const xOffset =
                    pipelineRect.left -
                    latentImageRect.left +
                    (pipelineRect.width * 0.1 - latentImageRect.width / 2);

                vitaAllParticles.forEach((particle) => {
                    const currentX = particle.offsetLeft;
                    const currentY = particle.offsetTop;
                    const targetX = currentX + xOffset;

                    // vitaAnimateFlowMatching() reads this as each particle's start X.
                    particle.dataset.initialX = targetX;
                    particle.style.left = targetX + "px";
                    particle.style.top = currentY + "px"; // Keep Y the same
                    particle.className = "vita-particle vita-flow-particle";
                });
                await pause(800);
                // --- End: translate particles to the flow start ---

                // Stage 3: flow matching through the pipeline.
                await vitaAnimateFlowMatching();

                // Stage 4: particles settle into the latent-action container.
                const latentActionSection = document.getElementById(
                    "vitaLatentActionSection"
                );
                vitaUpdatePhase("Latent actions", latentActionSection);
                vitaShowSection("vitaLatentActionSection");

                await pause(400);

                await vitaMoveParticlesToContainer(
                    document.getElementById("vitaLatentActions"),
                    "vita-latent-action-particle",
                    true // Preserve layout
                );

                await pause(600);

                // Stage 5: decode into the action curve.
                const actionSection = document.getElementById("vitaActionSection");
                vitaUpdatePhase("Decoding to action sequence", actionSection);
                vitaShowSection("vitaActionSection");
                await pause(400);

                vitaClearFlowParticles();
                vitaCreateActionSequence();
                await pause(2000);

                vitaUpdatePhase("Actions Generated", null);
            } finally {
                // Always unlock, even on error, so the user can restart.
                if (controlButton) controlButton.textContent = "Restart";
                vitaIsAnimating = false;
            }
        }

        // Auto-start the animation 1.5s after the DOM is ready. The handle is kept
        // in vitaAnimationTimeout, which vitaClearAll() cancels.
        document.addEventListener("DOMContentLoaded", function () {
            vitaAnimationTimeout = setTimeout(startVitaAnimation, 1500);
        });
    </script>
</body>

</html>