<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Audio samples of linguistic-speech regularization</title>
<style>
    /* Fixed player width applied to every <audio> element on the page. */
    audio { width: 200px; }
</style>

</head>
<body>
<!-- The whole page content is wrapped in <center>; the wrapper is closed just before </body>. -->
<center>
<br>
<!-- Page title (same text as the document <title>). -->
<h2>Audio samples of linguistic-speech regularization</h2>
<br>

<h3>1. List of speech samples with filled pauses (FPs)</h3>

<!-- The two kinds of FP-included input text; the FP in each example sentence is coloured blue. -->
<table border="1" width="700">
    <thead>
    <tr>
        <th scope="col">Method</th>
        <th scope="col">Explanation of FP-included speech samples</th>
        <th scope="col">Example</th>
    </tr>
    </thead>
    <tbody>
    <tr align="center">
        <td>TrueFP</td>
        <td>Synthesized from text with ground-truth FPs (i.e., actually used)</td>
        <td style="padding-left:10px">I explain <span style="color:#0000FF">uh</span> a theory.</td>
    </tr>
    <tr align="center">
        <td>PredFP</td>
        <td>Synthesized from text with predicted FPs</td>
        <td style="padding-left:10px">I <span style="color:#0000FF">uh</span> explain a theory.</td>
    </tr>
    </tbody>
</table>

<br><br>

<h3>2. List of models</h3>

<!-- The compared models and the α / β values listed for each. -->
<table border="1" width="600">
    <thead>
    <tr>
        <th scope="col">Model</th>
        <th scope="col">Explanation</th>
        <th scope="col">α</th>
        <th scope="col">β</th>
    </tr>
    </thead>
    <tbody>
    <tr align="center">
        <td>Baseline</td>
        <td>Trained without regularization</td>
        <td>0.0</td>
        <td>--</td>
    </tr>
    <tr align="center">
        <td>Proposed</td>
        <td>Trained with regularization for probabilistically sampled FPs</td>
        <td>1.0</td>
        <td>4.0</td>
    </tr>
    </tbody>
</table>

<br><br>

<h3>3. Audio samples</h3>

<!-- One pair of rows (TrueFP, PredFP) per utterance; utterances are grouped by speaker. -->
<table border="1" align="center" width="750">
<thead>
<tr>
    <th scope="col">Speaker</th>
    <th scope="col">Utterance</th>
    <th scope="col">Sample</th>
    <th scope="col">Ground-truth<br>(Natural speech)</th>
    <th scope="col">Baseline</th>
    <th scope="col">Proposed</th>
</tr>
</thead>

<!-- Speaker A: three utterances, each with a TrueFP row followed by a PredFP row. -->
<tr align="center">
    <td rowspan="6">A</td>
    <td rowspan="2">Sample1</td>
    <td>TrueFP</td>
    <td><audio controls><source src="./wav/ground_truth/utokyo_lecture_1-115.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/baseline/truefp/utokyo_lecture_1-115.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/truefp/utokyo_lecture_1-115.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls><source src="./wav/baseline/predfp/utokyo_lecture_1-115.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/predfp/utokyo_lecture_1-115.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <!-- Speaker column is filled by the rowspan="6" cell in the first row of this speaker. -->
    <td rowspan="2">Sample2</td>
    <td>TrueFP</td>
    <td><audio controls><source src="./wav/ground_truth/utokyo_lecture_3-52.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/baseline/truefp/utokyo_lecture_3-52.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/truefp/utokyo_lecture_3-52.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls><source src="./wav/baseline/predfp/utokyo_lecture_3-52.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/predfp/utokyo_lecture_3-52.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <!-- Speaker column is filled by the rowspan="6" cell in the first row of this speaker. -->
    <td rowspan="2">Sample3</td>
    <td>TrueFP</td>
    <td><audio controls><source src="./wav/ground_truth/utokyo_lecture_1-154.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/baseline/truefp/utokyo_lecture_1-154.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/truefp/utokyo_lecture_1-154.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls><source src="./wav/baseline/predfp/utokyo_lecture_1-154.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/predfp/utokyo_lecture_1-154.wav" type="audio/wav">x</audio></td>
</tr>

<!-- <tr align=center>
    <td rowspan="10">B</td>
    <td rowspan="2">Sample1</td>
    <td>TrueFP</td>
    <td><audio controls="controls" ><source src="./wav/ground_truth/utokyo_lecture_5-392.wav" autoplay/>x</audio></td>
    <td><audio controls="controls" ><source src="./wav/baseline/truefp/utokyo_lecture_5-392.wav" autoplay/>x</audio></td>
    <td><audio controls="controls" ><source src="./wav/proposed/truefp/utokyo_lecture_5-392.wav" autoplay/>x</audio></td>
</tr>

<tr align=center>
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls="controls"><source src="./wav/baseline/predfp/utokyo_lecture_5-392.wav" autoplay/>x</audio></td>
    <td><audio controls="controls"><source src="./wav/proposed/predfp/utokyo_lecture_5-392.wav" autoplay/>x</audio></td>
</tr> -->

<!-- Speaker B, first utterance: TrueFP row followed by PredFP row. -->
<tr align="center">
    <td rowspan="6">B</td>
    <td rowspan="2">Sample1</td>
    <td>TrueFP</td>
    <td><audio controls><source src="./wav/ground_truth/utokyo_lecture_6-663.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/baseline/truefp/utokyo_lecture_6-663.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/truefp/utokyo_lecture_6-663.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls><source src="./wav/baseline/predfp/utokyo_lecture_6-663.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/predfp/utokyo_lecture_6-663.wav" type="audio/wav">x</audio></td>
</tr>

<!-- <tr align=center>
    <td>B</td>
    <td rowspan="2">Sample3</td>
    <td>TrueFP</td>
    <td><audio controls="controls" ><source src="./wav/ground_truth/utokyo_lecture_4-265.wav" autoplay/>x</audio></td>
    <td><audio controls="controls" ><source src="./wav/baseline/truefp/utokyo_lecture_4-265.wav" autoplay/>x</audio></td>
    <td><audio controls="controls" ><source src="./wav/proposed/truefp/utokyo_lecture_4-265.wav" autoplay/>x</audio></td>
</tr>

<tr align=center>
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls="controls"><source src="./wav/baseline/predfp/utokyo_lecture_4-265.wav" autoplay/>x</audio></td>
    <td><audio controls="controls"><source src="./wav/proposed/predfp/utokyo_lecture_4-265.wav" autoplay/>x</audio></td>
</tr> -->

<!-- Speaker B, second and third utterances: each a TrueFP row followed by a PredFP row. -->
<tr align="center">
    <!-- Speaker column is filled by the rowspan="6" cell in the first row of this speaker. -->
    <td rowspan="2">Sample2</td>
    <td>TrueFP</td>
    <td><audio controls><source src="./wav/ground_truth/utokyo_lecture_5-409.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/baseline/truefp/utokyo_lecture_5-409.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/truefp/utokyo_lecture_5-409.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls><source src="./wav/baseline/predfp/utokyo_lecture_5-409.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/predfp/utokyo_lecture_5-409.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <!-- Speaker column is filled by the rowspan="6" cell in the first row of this speaker. -->
    <td rowspan="2">Sample3</td>
    <td>TrueFP</td>
    <td><audio controls><source src="./wav/ground_truth/utokyo_lecture_5-432.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/baseline/truefp/utokyo_lecture_5-432.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/truefp/utokyo_lecture_5-432.wav" type="audio/wav">x</audio></td>
</tr>

<tr align="center">
    <td>PredFP</td>
    <td>--</td>
    <td><audio controls><source src="./wav/baseline/predfp/utokyo_lecture_5-432.wav" type="audio/wav">x</audio></td>
    <td><audio controls><source src="./wav/proposed/predfp/utokyo_lecture_5-432.wav" type="audio/wav">x</audio></td>
</tr>

</table>

<br><br>

</center>
</body>
</html>
