Update README.md
Browse files
README.md
CHANGED
|
@@ -122,6 +122,75 @@ VocalNet-8B was evaluated on [OpenAudioBench](https://huggingface.co/datasets/ba
|
|
| 122 |
<th style="padding: 10px; border: 1px solid #ddd;">Web Questions</th>
|
| 123 |
</tr>
|
| 124 |
</thead>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
<tbody>
|
| 126 |
<tr>
|
| 127 |
<td colspan="7" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
|
@@ -298,6 +367,61 @@ VocalNet-8B was evaluated on [OpenAudioBench](https://huggingface.co/datasets/ba
|
|
| 298 |
<td style="padding: 10px; border: 1px solid #ddd;">WER</td>
|
| 299 |
<td style="padding: 10px; border: 1px solid #ddd;">UTMOS</td>
|
| 300 |
</tr>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
<tr>
|
| 302 |
<td colspan="11" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
| 303 |
</tr>
|
|
|
|
| 122 |
<th style="padding: 10px; border: 1px solid #ddd;">Web Questions</th>
|
| 123 |
</tr>
|
| 124 |
</thead>
|
| 125 |
+
<tbody>
|
| 126 |
+
<tr>
|
| 127 |
+
<td colspan="7" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Tiny Models</td>
|
| 128 |
+
</tr>
|
| 129 |
+
<tr>
|
| 130 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">Mini-Omni</td>
|
| 131 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">0.5B</td>
|
| 132 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
| 133 |
+
<td style="padding: 10px; border: 1px solid #ddd;">1.84</td>
|
| 134 |
+
<td style="padding: 10px; border: 1px solid #ddd;">2.7</td>
|
| 135 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.12</td>
|
| 136 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.22</td>
|
| 137 |
+
</tr>
|
| 138 |
+
<tr>
|
| 139 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
| 140 |
+
<td style="padding: 10px; border: 1px solid #ddd;">1.80</td>
|
| 141 |
+
<td style="padding: 10px; border: 1px solid #ddd;">2.7</td>
|
| 142 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.08</td>
|
| 143 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.20</td>
|
| 144 |
+
</tr>
|
| 145 |
+
<tr>
|
| 146 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">SLAM-Omni</td>
|
| 147 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">0.5B</td>
|
| 148 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
| 149 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.50</td>
|
| 150 |
+
<td style="padding: 10px; border: 1px solid #ddd;">29.4</td>
|
| 151 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.39</td>
|
| 152 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.84</td>
|
| 153 |
+
</tr>
|
| 154 |
+
<tr>
|
| 155 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
| 156 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.01</td>
|
| 157 |
+
<td style="padding: 10px; border: 1px solid #ddd;">26.7</td>
|
| 158 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.34</td>
|
| 159 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.69</td>
|
| 160 |
+
</tr>
|
| 161 |
+
<tr>
|
| 162 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B (VA)</td>
|
| 163 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">1B</td>
|
| 164 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
| 165 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.38</td>
|
| 166 |
+
<td style="padding: 10px; border: 1px solid #ddd;">70.3</td>
|
| 167 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.38</td>
|
| 168 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.93</td>
|
| 169 |
+
</tr>
|
| 170 |
+
<tr>
|
| 171 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
| 172 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.83</td>
|
| 173 |
+
<td style="padding: 10px; border: 1px solid #ddd;">61.0</td>
|
| 174 |
+
<td style="padding: 10px; border: 1px solid #ddd;">2.78</td>
|
| 175 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.47</td>
|
| 176 |
+
</tr>
|
| 177 |
+
<tr>
|
| 178 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B</td>
|
| 179 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">1B</td>
|
| 180 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
| 181 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.79</b></td>
|
| 182 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>71.7</b></td>
|
| 183 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.60</b></td>
|
| 184 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.16</b></td>
|
| 185 |
+
</tr>
|
| 186 |
+
<tr>
|
| 187 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
| 188 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.03</b></td>
|
| 189 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>63.7</b></td>
|
| 190 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.06</b></td>
|
| 191 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.68</b></td>
|
| 192 |
+
</tr>
|
| 193 |
+
</tbody>
|
| 194 |
<tbody>
|
| 195 |
<tr>
|
| 196 |
<td colspan="7" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
|
|
|
| 367 |
<td style="padding: 10px; border: 1px solid #ddd;">WER</td>
|
| 368 |
<td style="padding: 10px; border: 1px solid #ddd;">UTMOS</td>
|
| 369 |
</tr>
|
| 370 |
+
<tr>
|
| 371 |
+
<td colspan="11" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Tiny Models</td>
|
| 372 |
+
</tr>
|
| 373 |
+
<tr>
|
| 374 |
+
<td style="padding: 10px; border: 1px solid #ddd;">Mini-Omni</td>
|
| 375 |
+
<td style="padding: 10px; border: 1px solid #ddd;">20.78</td>
|
| 376 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.429</td>
|
| 377 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.20</td>
|
| 378 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.428</td>
|
| 379 |
+
<td style="padding: 10px; border: 1px solid #ddd;">7.43</td>
|
| 380 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.428</td>
|
| 381 |
+
<td style="padding: 10px; border: 1px solid #ddd;">8.51</td>
|
| 382 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.433</td>
|
| 383 |
+
<td style="padding: 10px; border: 1px solid #ddd;">8.66</td>
|
| 384 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.430</td>
|
| 385 |
+
</tr>
|
| 386 |
+
<tr>
|
| 387 |
+
<td style="padding: 10px; border: 1px solid #ddd;">SLAM-Omni</td>
|
| 388 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.52</td>
|
| 389 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.439</td>
|
| 390 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.55</td>
|
| 391 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.467</td>
|
| 392 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.16</td>
|
| 393 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.470</td>
|
| 394 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.50</td>
|
| 395 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.461</td>
|
| 396 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.17</td>
|
| 397 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.464</td>
|
| 398 |
+
</tr>
|
| 399 |
+
<tr>
|
| 400 |
+
<td style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B (VA)</td>
|
| 401 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.43</b></td>
|
| 402 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.495</b></td>
|
| 403 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.65</td>
|
| 404 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.498</b></td>
|
| 405 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.97</b></td>
|
| 406 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.499</b></td>
|
| 407 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.40</td>
|
| 408 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.489</td>
|
| 409 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.66</td>
|
| 410 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.495</b></td>
|
| 411 |
+
</tr>
|
| 412 |
+
<tr>
|
| 413 |
+
<td style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B</td>
|
| 414 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.43</b></td>
|
| 415 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.491</td>
|
| 416 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.27</b></td>
|
| 417 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.497</td>
|
| 418 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.73</td>
|
| 419 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.486</td>
|
| 420 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.88</b></td>
|
| 421 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.493</b></td>
|
| 422 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.31</b></td>
|
| 423 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.491</td>
|
| 424 |
+
</tr>
|
| 425 |
<tr>
|
| 426 |
<td colspan="11" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
| 427 |
</tr>
|