
<!DOCTYPE html>
<!-- lang tells screen readers, translation tools and search engines that the page is in English. -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="a bimodal motion-language model that treats human motion as a second modality, decoupling motion modeling via separate model parameters and enabling both effective cross-modal interaction and efficient multimodal scaling training.">
  <meta name="keywords" content="Motion generation, multi-modal understanding and generation">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>MotionGPT3: Human Motion as a Second Modality</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- MathJax 2 for TeX/MathML rendering (type attribute omitted: JavaScript is the default). -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML"></script>

  <!-- Web fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <!-- Stylesheets: Bulma and its carousel/slider extensions, icon fonts, then the page's own rules. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <!-- Scripts: jQuery is loaded before the Bulma extensions and index.js; the original order is kept. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>


  <!-- Top navigation bar: a burger toggle plus a centred "More Research" dropdown. -->
  <nav class="navbar" role="navigation" aria-label="main navigation">
    <div class="navbar-brand">
      <!-- Burger toggle; its three empty spans are hidden from assistive technology. -->
      <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
      </a>
    </div>

    <div class="navbar-menu">
      <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
        <div class="navbar-item has-dropdown is-hoverable">
          <a class="navbar-link" style="font-size: 1.3rem;">
            More Research
          </a>
          <!-- Dropdown body: currently holds no entries. -->
          <div class="navbar-dropdown">
          </div>
        </div>
      </div>
    </div>
  </nav>
  

<!-- Title hero: paper title, author line, and the Paper / Supp. / Video buttons. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">MotionGPT3: Human Motion as a Second Modality</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <!-- No href on purpose: there is no author page to link to, and an
                   empty href would turn this into a link back to the current page. -->
              <a>Anonymous authors</a></span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <span class="link-block">
                <a href="./static/motiongpt3_paper.pdf"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
              <!-- Supplementary material link. -->
              <span class="link-block">
                <a href="./static/motiongpt3_supp.pdf"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Supp.</span>
                </a>
              </span>
              <!-- Video Link: jumps to the element with id="video" further down this page. -->
              <span class="link-block">
                <a href="#video"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser: autoplaying overview clip with a one-sentence summary underneath. -->
<section class="hero">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <div class="columns is-centered">
      <!-- <figure id="teaser" autoplay muted loop playsinline width="100%"> -->
        <!-- <img src="./static/figures/teaser.png" width="1000px" > -->
        <!-- <img src="./static/figures/teaser.png"> -->
        <!-- Hides the native control bar (a WebKit/Blink-only pseudo-element) on
             autoplaying clips. The selector is limited to video[autoplay] so that
             the paper video further down, which has no autoplay attribute, keeps
             its controls. The rule sits next to the video rather than inside it,
             because a style element is not valid content for a video element.
             NOTE(review): ./static/css/index.css would be a better home for it. -->
        <style>
          video[autoplay]::-webkit-media-controls-enclosure {
            display: none;
          }
        </style>
        <video id="teaser" autoplay controls muted playsinline height="100%">
          <source src="./static/videos/teaser.mp4"
                  type="video/mp4">
        </video>
        <!-- height="200px" -->
        <!-- <a href="./teaser_image.png"><img src = "./teaser_image.png" height="300px"></img></href></a><br> -->
      <!-- </figure> -->
      </div>
      <h2 class="subtitle has-text-centered">
        <span class="mgpt3">MotionGPT3</span> is a hybrid motion-language model designed to process arbitrary input sequences of motion or language and generate outputs in either modality.
      </h2>
    </div>
  </div>
</section>



<!-- text-to-motion results here -->
<section class="hero is-light is-small">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <!-- Tab selector: one tab per results carousel. Each data-target value
           equals the id of a .carousel-wrapper element in this section
           ("text-to-motion" and "motion-to-text"); the Text-to-Motion tab is
           marked active in the markup.
           NOTE(review): the switching logic is not in this file; presumably
           ./static/js/index.js reads data-target. Confirm before renaming. -->
      <div class="tabs is-toggle is-centered">
        <ul>
          <li class="is-active" data-target="text-to-motion">
            <a>Text-to-Motion</a>
          </li>
          <li data-target="motion-to-text">
            <a>Motion-to-Text</a>
          </li>
        </ul>
      </div>

      <!-- <h2 class="title is-4 has-text-centered" >Text-to-Motion</h2> -->
    <!-- Text-to-Motion results: each carousel item pairs a text prompt (caption)
         with a looping clip from ./static/videos/t2m/.
         Video ids are unchanged except for the last item, which previously
         reused id="toby"; ids must be unique within a document. -->
    <div class="carousel-wrapper" id="text-to-motion">
      <h2 class="title is-4">Text-to-Motion</h2>
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item item-3 has-text-centered">
          <div class="caption">A person is crouched down and walking around sneakily.</div>
          <video id="steve" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/3_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-4 has-text-centered">
          <div class="caption">A man swimming using both hands together while kicking feet in unison.</div>
          <video id="chair-tp" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/4_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-5 has-text-centered">
          <div class="caption">Person sits on the ledge of something then gets off and walks away.</div>
          <video id="shiba" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/5_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-23 has-text-centered">
          <div class="caption">A person stands still with both arms raised at shoulder height.</div>
          <video id="fullbody" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/23_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-24 has-text-centered">
          <div class="caption">The person jumps over something and lands on feet.</div>
          <video id="blueshirt" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/24_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-26 has-text-centered">
          <div class="caption">A figure steps backward cockily, swinging their arms.</div>
          <video id="mask" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/26_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-27 has-text-centered">
          <div class="caption">He is running in straight line then jumps over something and continues running.</div>
          <video id="coffee" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/27_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-28 has-text-centered">
          <div class="caption">a person walks forward with their right leg limping.</div>
          <video id="toby" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/28_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
        <div class="item item-29 has-text-centered">
          <div class="caption">A person walks forward using left hand to steady themselves on an object.</div>
          <video id="t2m-29" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/t2m/29_out_mesh.mp4" type="video/mp4">
          </video>
        </div>
      </div>
    </div>
    <!-- </div>
  </div>
</section> -->


<!-- motion-to-text results here -->
<!-- <section class="hero is-light is-small">
  <div class="hero-body">
    <div class="container"> -->
      <!-- <h2 class="title is-4 has-text-centered" >Text-to-Motion</h2> -->
    <!-- Motion-to-Text results: each carousel item pairs a looping clip from
         ./static/videos/m2t/ with a quoted caption shown underneath.
         Every id in this carousel used to repeat an id that already occurs
         earlier in the document; they are now unique and follow the clip file
         names (example0 ... example11). The carousel keeps its "carousel" and
         "results-carousel" classes.
         NOTE(review): the quoted captions look like raw model output, so their
         wording is left untouched. -->
    <div class="carousel-wrapper" id="motion-to-text">
      <h2 class="title is-4">Motion-to-Text</h2>
      <div id="results-carousel-m2t" class="carousel results-carousel">
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example0" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example0.mp4" type="video/mp4">
          </video>
          <div class="caption1">"The person is walking forward and loses their balance on one foot but they continue to walk behind."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example1" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example1.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person walks to the right, then walks to the left, then returns to their starting position."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example2" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example2.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person brings the ball back to his throwing motion and throws again."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example3" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example3.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person takes a few steps forward, then does a cartwheel diagonally to the right then takes a step back."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example4" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example4.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person is shaking his hand."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example5" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example5.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person turns around, picks up what is moving and leans down to pick it up."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example6" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example6.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person holds their arms in front of them and does four squats."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example7" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example7.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person raises their arms around themselves and it and then motions them violently."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example8" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example8.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person lifts something, tilting the other person and places it back down."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example9" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example9.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person walks up steps, starts upstairs and then the stairs back down."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example10" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example10.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A figure walks forward, picks something up and puts it on the table."</div>
        </div>
        <div class="item has-text-centered">
          <video class="no-border" id="m2t-example11" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/m2t/example11.mp4" type="video/mp4">
          </video>
          <div class="caption1">"A person lifts something, tilting the other person or places it back down."</div>
        </div>
      </div>
    </div>
  </div>
  </div>
</section>

<!-- Abstract section: three paragraphs of paper abstract in a centred four-fifths column. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <!-- <div class="column is-full"> -->
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
        <p>
          With the rapid progress of large language models (LLMs), multimodal frameworks that unify understanding and generation have become promising, yet they face increasing complexity as the number of modalities and tasks grows.
          We observe that motion quantization introduces approximation errors that cap motion quality, and that unifying discrete text and continuous motion within a single-stream backbone amplifies cross-modal interference.
        </p>
        <p>
          Motivated by recent multi-branch Transformer designs that separate signals from different modalities, we propose MotionGPT3, a bimodal motion-language model for both understanding and generation.
          MotionGPT3 encodes raw motion into a continuous latent space using a variational autoencoder (VAE), thereby avoiding quantization-induced artifacts, while leveraging the semantic prior of pretrained language models.
          A dual-stream Transformer with shared attention preserves modality-specific routes while enabling controlled, bidirectional information flow, which reduces interference, stabilizes optimization, and empirically accelerates convergence without degrading fidelity.
          For multimodal joint training, a generate-then-align three-stage schedule further improves stability and limits cross-task interference.
        </p>
        <p>
          Experiments show that MotionGPT3 achieves 2× faster convergence in training loss and up to 4× faster convergence in validation, while maintaining state-of-the-art performance on standard motion understanding and motion generation benchmarks.
        </p>
<!-- 
        <p>
            Though recent advances in multimodal models have demonstrated strong capabilities and opportunities in unified understanding and generation, the development of unified motion-language models remains underexplored. To enable such models with high-fidelity human motion, two core challenges must be addressed. The first is the reconstruction gap between the continuous motion modality and discrete representation in an autoregressive manner, and the second is the degradation of language intelligence during unified training.
        </p>
            Inspired by the mixture of experts, we propose MotionGPT3, a bimodal motion-language model that treats human motion as a second modality, decoupling motion modeling via separate model parameters and enabling both effective cross-modal interaction and efficient multimodal scaling training.
            To preserve language intelligence, the text branch retains the original structure and parameters of the pretrained language model, while a new motion branch is integrated via a shared attention mechanism, enabling bidirectional information flow between two modalities.
        </p>
        <p>
            We first employ a motion Variational Autoencoder (VAE) to encode raw human motion into latent representations.
            Based on this continuous latent space, the motion branch predicts motion latents directly from intermediate hidden states using a diffusion head, bypassing discrete tokenization.
            Extensive experiments show that our approach achieves competitive performance on both motion understanding and generation tasks while preserving strong language capabilities, establishing a unified bimodal motion diffusion framework within an autoregressive manner.
        </p> -->
        </div>
      </div>
    </div>
    <!--/ Abstract. -->

  </div>
</section>


<!-- Method section: training-scheme figure followed by a short description of the three stages. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h2 class="title is-3 publication-title">Our method</h2>
          <!-- <img src="./static/figures/pipeline.png" width="800px" >
          <p>
            We propose a bimodal motion-language framework to process language and motion with their owm expert 
            and represent motion in an VAE latent space, supervise the predicted motion latents with a diffusion head. 
          </p>
        </div>
        <div class="column has-text-centered">
          <h1 class="title is-3 publication-title">Training Scheme</h1> -->
          <!-- The alt text summarises the figure using the paragraph below.
               NOTE(review): adjust it if the figure shows more than the three stages. -->
          <img src="./static/figures/training.png"
               alt="Diagram of the three-stage training scheme: motion generation, motion-language alignment, and joint fine-tuning."
               style="display:block; margin:0 auto; max-width:90%;">
          <p>
            We introduce a three-stage alignment for our hybrid motion-language model.
            First, the model learns to generate motion properly.
            Then we further align the motion branch with language by introducing motion reasoning.
            Finally, we fine-tune the model by joint training with unfrozen text modules.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>



<!-- Experiments section: key findings, main result figures, and architecture ablations.
     Headings are one level below the page's single h1; their visual size still
     comes from the "title is-3" / "title is-4" classes. -->
<section class="section" id="experiments">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">

        <h2 class="title is-3 has-text-centered">Experiments</h2>

        <!-- Key findings -->
        <div class="notification is-light" role="note" aria-label="Key findings in experiments">
          <ul style="margin-left:2rem; list-style: disc;">
            <li><strong>2× faster</strong> training convergence vs. discrete &amp; unified baselines while maintaining or improving quality.</li>
            <!-- role="img" lets the arrow carry an accessible name; aria-label is not reliable on a plain span. -->
            <li>State-of-the-art <code>R@1</code>/<code>R@3</code> on HumanML3D text-to-motion; lower <code>MMDist</code><span role="img" aria-label="lower is better">↓</span>.</li>
            <li>Cross-Modal Attention helps in a non-monotonic pattern, with last <code>L</code> layers enabled.</li>
            <li>Competitive results with a <strong>smaller motion branch</strong> (~51M params) and modest text backbones (124M).</li>
          </ul>
        </div>

        <!-- Main results -->
        <div class="columns is-multiline">
          <!-- Training speed -->
          <div class="column is-12-tablet">
            <h3 class="title is-4">Training Speed</h3>
            <figure class="image" aria-describedby="exp-train-desc">
              <!-- The file name contains a space, which is percent-encoded so the URL is valid. -->
              <img src="./static/figures/training%20speed.png" alt="Line charts comparing training speed and quality; our method converges about 2× faster with better R@k and lower MMDist."
              style="display:block; margin:0 auto; max-width:90%;">
              <figcaption id="exp-train-desc" class="has-text-grey is-size-6">
                Continuous motion + bimodal architecture accelerates training by ~2× vs. discrete/unified variants while improving quality.
              </figcaption>
            </figure>
          </div>

          <!-- T2M comparison -->
          <div class="column is-12-tablet">
            <h3 class="title is-4">Text-to-Motion Comparison</h3>
            <figure class="image has-text-centered" aria-describedby="exp-t2m-desc">
              <img src="./static/figures/all_unified_t2m.png" alt="Grouped bar charts of R@1, R@3, and MMDist across recent unified methods, with our model leading."
              style="display:block; margin:0 auto; max-width:80%;">
              <figcaption id="exp-t2m-desc" class="has-text-grey is-size-6">
                <!-- The arrow is decorative here because the same meaning follows in words. -->
                Evaluated on HumanML3D test split. Metrics: <code>R@1</code>/<code>R@3</code> (higher is better) and <code>MMDist</code><span aria-hidden="true">↓</span> (lower is better).
              </figcaption>
            </figure>
          </div>

          <!-- Ablations (collapsible) -->
          <div class="column is-12">
            <h3 class="title is-4">Ablations on Bimodal Architecture</h3>

            <details open>
              <summary class="is-size-5 has-text-weight-semibold">a) Cross-Modal Attention</summary>
              <figure class="image" aria-describedby="exp-cma-desc" style="margin-top:0.75rem;">
                <img src="./static/figures/ablation_lastL_curve.png" alt="Curves of R@1/R@3 and MMDist as the number of last layers with cross-modal attention increases." style="width:78%; margin:auto;">
                <figcaption id="exp-cma-desc" class="has-text-grey is-size-6">
                  CMA enabled in the last <code>L</code> layers (<code>L ∈ {1,…,6}</code>). Performance improves up to <code>L=5</code>, then slightly degrades at <code>L=6</code>, indicating a <em>non-monotonic</em> pattern.
                </figcaption>
              </figure>
            </details>

            <details open>
              <summary class="is-size-5 has-text-weight-semibold">b) Model Params</summary>
              <figure class="image" aria-describedby="exp-param-desc" style="margin-top:0.75rem;">
                <img src="./static/figures/mot_model_params.png" alt="Scatter/line charts relating text and motion branch parameters to R@k and MMDist on motion generation." style="width:90%; margin:auto;">
                <figcaption id="exp-param-desc" class="has-text-grey is-size-6">
                  All models trained for 200K iterations. A 124M text branch is competitive with 355M/774M, and the motion branch achieves strong results with ~51M params (halved vs. larger variants).
                </figcaption>
              </figure>
            </details>

          </div>
        </div>

      </div>
    </div>
  </div>
</section>




<!-- Video section: the full paper video. The "Video" button in the title hero
     links to the element with id="video" below. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <!-- Paper video. -->
        &nbsp;
        <h2 class="title is-3 has-text-centered">Video</h2>
        <div class="columns is-centered has-text-centered">
          <!-- <div class="column is-four-fifths"> -->
          <div class="column is-full">
            <!-- <h2 class="title is-3">Video</h2> -->
            <div class="publication-video" id="video">
              <!-- <iframe src="static/videos/MotionGPT3_Video.mp4"
                  frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe> -->
              <video controls muted poster="static/figures/video_cover.png">
                <source src="static/videos/MotionGPT3_Video.mp4" type="video/mp4">
              </video>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- BibTeX citation section.
     NOTE(review): the code element below holds only whitespace, so no citation
     entry is shown yet. Whitespace inside pre is rendered as written. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>
    </code></pre>
  </div>
</section>


<!-- Page footer: a single icon link to the paper PDF. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <!-- The link shows only an icon, so aria-label supplies its accessible name.
           The closing tag matters: without it the parser re-opens this link
           inside the elements that follow. -->
      <a class="icon-link"
         href="./static/motiongpt3_paper.pdf"
         aria-label="Paper (PDF)">
        <i class="fas fa-file-pdf"></i>
      </a>
    </div>
  </div>
</footer>

  
</body>
</html>
