<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Emotion Control TTS Demo</title>
  <style>
    /* Page shell: pale lavender backdrop with padding around the content. */
    body {
      padding: 20px;
      background-color: #f2f0fc;
      font-family: sans-serif;
    }

    /* Accent colour for second-level headings. */
    h2 {
      color: #4b3ca3;
    }

    /* Comparison tables: full width, white surface, shared cell borders. */
    table {
      border-collapse: collapse;
      width: 100%;
      background-color: #fff;
    }

    /* Every cell (header or data) is centred, padded and outlined. */
    th,
    td {
      padding: 12px;
      border: 1px solid #ccc;
      text-align: center;
    }

    /* Header cells only add a tinted background on top of the shared cell rule. */
    th {
      background-color: #e8e4f8;
    }

    /* Keep each player narrow so a full row of players fits across the table. */
    audio {
      width: 120px;
    }
  </style>
</head>
<body>

<h1>T-VecTTS: Adding time-varying-emotion control to flow-matching-based TTS</h1>
<h2>Submission number: 12680</h2>
<h1>EMO-Change</h1>
<p>A reference audio was constructed by concatenating two speech samples, each expressing a different emotion, to explicitly include multiple emotional cues within a single utterance.</p>
<table>
    <thead>
      <!-- Two-row header: the first three columns span both rows, and
           "Generated audio" sits above the five per-system columns.
           Explicit scope attributes let assistive technology associate each
           data cell with its column headers instead of relying on heuristics. -->
      <tr>
        <th rowspan="2" scope="col">Emotion</th>
        <th rowspan="2" scope="col">Index</th>
        <th rowspan="2" scope="col">Audio&nbsp;prompt</th>
        <th colspan="5" scope="col">Generated&nbsp;audio</th>
      </tr>
      <tr>
        <th scope="col">Voicebox</th>
        <th scope="col">ELaTE</th>
        <th scope="col">EmoCtrl-TTS</th>
        <th scope="col">F5-TTS</th>
        <th scope="col">Ours</th>
      </tr>
    </thead>
    <tbody>
      <!-- Cell order in every row: index, audio prompt, then the generated
           clips for Voicebox, ELaTE, EmoCtrl-TTS, F5-TTS and Ours.
           Each emotion transition has two rows, (a) and (b), built from
           different reference clips; the transition label spans both. -->

      <!-- Transition: Angry → Calm -->
      <tr>
        <td rowspan="2">Angry → Calm</td>
        <td>(a)</td>
        <td><audio src="audio/emochange_ref/F_spk_02-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/F_spk_02-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/F_spk_02-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/F_spk_02-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/F_spk_02-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/F_spk_02-angrycalm.wav" controls></audio></td>
      </tr>
      <tr>
        <td>(b)</td>
        <td><audio src="audio/emochange_ref/M_spk_01-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/M_spk_01-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/M_spk_01-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/M_spk_01-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/M_spk_01-angrycalm.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/M_spk_01-angrycalm.wav" controls></audio></td>
      </tr>

      <!-- Transition: Sad → Surprised -->
      <tr>
        <td rowspan="2">Sad → Surprised</td>
        <td>(a)</td>
        <td><audio src="audio/emochange_ref/F_spk_20-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/F_spk_20-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/F_spk_20-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/F_spk_20-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/F_spk_20-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/F_spk_20-sadsurprised.wav" controls></audio></td>
      </tr>
      <tr>
        <td>(b)</td>
        <td><audio src="audio/emochange_ref/M_spk_01-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/M_spk_01-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/M_spk_01-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/M_spk_01-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/M_spk_01-sadsurprised.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/M_spk_01-sadsurprised.wav" controls></audio></td>
      </tr>

      <!-- Transition: Happy → Disgusted -->
      <tr>
        <td rowspan="2">Happy → Disgusted</td>
        <td>(a)</td>
        <td><audio src="audio/emochange_ref/F_spk_06-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/F_spk_06-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/F_spk_06-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/F_spk_06-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/F_spk_06-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/F_spk_06-happydisgusted.wav" controls></audio></td>
      </tr>
      <tr>
        <td>(b)</td>
        <td><audio src="audio/emochange_ref/M_spk_03-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/M_spk_03-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/M_spk_03-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/M_spk_03-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/M_spk_03-happydisgusted.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/M_spk_03-happydisgusted.wav" controls></audio></td>
      </tr>

      <!-- Transition: Calm → Fearful -->
      <tr>
        <td rowspan="2">Calm → Fearful</td>
        <td>(a)</td>
        <td><audio src="audio/emochange_ref/F_spk_22-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/F_spk_22-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/F_spk_22-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/F_spk_22-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/F_spk_22-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/F_spk_22-calmfearful.wav" controls></audio></td>
      </tr>
      <tr>
        <td>(b)</td>
        <td><audio src="audio/emochange_ref/M_spk_11-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/voicebox/M_spk_11-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/elate/M_spk_11-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/emoctrl/M_spk_11-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/f5-tts/M_spk_11-calmfearful.wav" controls></audio></td>
        <td><audio src="audio/emochange_results/ours/M_spk_11-calmfearful.wav" controls></audio></td>
      </tr>
    </tbody>
  </table>


  <h1>JVNV S2ST</h1>
  <p>Japanese-to-English speech-to-speech translation</p>
  
  <table id="audio-table">
    <thead>
      <!-- Two-row header: the first three columns span both rows, and
           "Translated audio (English)" sits above the six per-system columns.
           Explicit scope attributes let assistive technology associate each
           data cell with its column headers instead of relying on heuristics. -->
      <tr>
        <th rowspan="2" scope="col">Emotion</th>
        <th rowspan="2" scope="col">Index</th>
        <th rowspan="2" scope="col">Source audio (Japanese)</th>
        <th colspan="6" scope="col">Translated audio (English)</th>
      </tr>
      <tr>
        <th scope="col">SeamlessExpressive</th>
        <th scope="col">Voicebox(*)</th>
        <th scope="col">ELaTE(*)</th>
        <th scope="col">EmoCtrl-TTS(*)</th>
        <th scope="col">F5-TTS(**)</th>
        <th scope="col">Ours(**)</th>
      </tr>
    </thead>
    <!-- Left empty in the markup: the inline script on this page looks this
         element up by id and appends one row per emotion/speaker pair. -->
    <tbody id="audio-body"></tbody>
  </table>
  
  <p>(*): They share the same backbone model (Voicebox)</p>
  <p>(**): They share the same backbone model (F5-TTS)</p>
  
  <script>
  // Populates the JVNV S2ST table body with one row per (emotion, speaker)
  // pair. Each row holds the speaker index, the source clip, and one
  // translated clip per entry in `models`.
  const emotions = ['happy', 'sad', 'angry', 'surprised', 'disgusted', 'fearful'];
  const speakers = ['F1', 'M1'];
  // Display label for each speaker, matched to `speakers` by position.
  const speakerIndex = ['(a)', '(b)'];
  // Directory names under audio/jvnv_results/; the order here is the order
  // in which the translated-audio cells are appended to each row.
  const models = ['seamless', 'voicebox', 'elate', 'emoctrl', 'f5-tts', 'ours'];

  const tbody = document.getElementById('audio-body');

  // Builds a <td> wrapping an <audio> player with controls for the given URL.
  // Shared by the source-audio cell and every translated-audio cell.
  const createAudioCell = (src) => {
    const cell = document.createElement('td');
    const player = document.createElement('audio');
    player.controls = true;
    player.src = src;
    cell.appendChild(player);
    return cell;
  };

  emotions.forEach(emotion => {
    speakers.forEach((spk, idx) => {
      const row = document.createElement('tr');

      // The capitalised emotion label is emitted only on the first speaker's
      // row and spans all rows of the group, so its rowSpan follows the
      // number of speakers rather than a hard-coded count.
      if (idx === 0) {
        const emotionCell = document.createElement('td');
        emotionCell.textContent = emotion.charAt(0).toUpperCase() + emotion.slice(1);
        emotionCell.rowSpan = speakers.length;
        row.appendChild(emotionCell);
      }

      // Index (a) or (b)
      const indexCell = document.createElement('td');
      indexCell.textContent = speakerIndex[idx];
      row.appendChild(indexCell);

      // Source audio
      row.appendChild(createAudioCell(`audio/jvnv_ref/JVNV_${spk}_${emotion}.wav`));

      // Translated audio, one cell per model
      models.forEach(model => {
        row.appendChild(createAudioCell(`audio/jvnv_results/${model}/JVNV_${spk}_${emotion}.wav`));
      });

      tbody.appendChild(row);
    });
  });
  </script>

</body>
</html>
