Update README.md
README.md
<details>
<summary>Live ASR decoding from a browser using Gradio</summary>

This is a simple, hacky demo of live ASR in the browser using Gradio's live microphone streaming feature.

Note that browsers may refuse to stream microphone audio over an insecure connection, unless it is localhost. If you are running this on a remote server, you can use SSH port forwarding to expose the remote's port on your machine.
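
For example, to forward the demo's default port from a remote host to your machine (the hostname here is a placeholder):

`ssh -L 9431:localhost:9431 user@remote-host`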
Run using:

`python3 gradio-asr.py --model-source speechbrain/asr-streaming-conformer-librispeech --ip=localhost --device=cpu`
```python
from argparse import ArgumentParser
from dataclasses import dataclass
import logging

parser = ArgumentParser()
parser.add_argument("--model-source", required=True)
parser.add_argument("--device", default="cpu")
parser.add_argument("--ip", default="127.0.0.1")
parser.add_argument("--port", default=9431, type=int)
parser.add_argument("--chunk-size", default=24, type=int)
parser.add_argument("--left-context-chunks", default=4, type=int)
parser.add_argument("--num-threads", default=None, type=int)
parser.add_argument("--verbose", "-v", default=False, action="store_true")
args = parser.parse_args()

if args.verbose:
    logging.getLogger().setLevel(logging.INFO)

logging.info("Loading libraries")
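
# Because the heavy imports below happen only after argument parsing,
# `--help` responds without having to wait for them to load.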
from speechbrain.inference.ASR import StreamingASR, ASRStreamingContext
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig
import torch
import gradio as gr
import torchaudio
import numpy as np

device = args.device

if args.num_threads is not None:
    torch.set_num_threads(args.num_threads)

logging.info(f"Loading model from \"{args.model_source}\" onto device {device}")

asr = StreamingASR.from_hparams(args.model_source, run_opts={"device": device})
config = DynChunkTrainConfig(args.chunk_size, args.left_context_chunks)
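
# Gradio hands this object back to `transcribe` as `stream` on every call;
# it carries the model's streaming context plus any audio samples that have
# not been consumed yet.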
@dataclass
class GradioStreamingContext:
    context: ASRStreamingContext
    chunk_size: int
    waveform_buffer: torch.Tensor
    decoded_text: str

def transcribe(stream, new_chunk):
    # Gradio streaming audio arrives as a (sample rate, numpy array) tuple
    sr, y = new_chunk

    y = y.astype(np.float32)
    y = torch.tensor(y, dtype=torch.float32, device=device)
    y /= max(1, torch.max(torch.abs(y)).item())  # norm by max abs() within chunk & avoid NaN
    if len(y.shape) > 1:
        y = torch.mean(y, dim=1)  # downmix to mono

    # HACK: we are making poor use of the resampler across chunk boundaries,
    # which may degrade accuracy.
    # NOTE: we should also absolutely avoid recreating a resampler every time
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=asr.audio_normalizer.sample_rate)
    y = resampler(y)  # janky resample (probably to 16kHz)

    if stream is None:
        stream = GradioStreamingContext(
            context=asr.make_streaming_context(config),
            chunk_size=asr.get_chunk_size_frames(config),
            waveform_buffer=y,
            decoded_text="",
        )
    else:
        stream.waveform_buffer = torch.concat((stream.waveform_buffer, y))

    # feed the model full chunks only; any leftover samples stay buffered
    # until the next call
    while stream.waveform_buffer.size(0) > stream.chunk_size:
        chunk = stream.waveform_buffer[:stream.chunk_size]
        stream.waveform_buffer = stream.waveform_buffer[stream.chunk_size:]

        # fake batch dim
        chunk = chunk.unsqueeze(0)

        # list of transcribed strings, of size 1 because the batch size is 1
        with torch.no_grad():
            transcribed = asr.transcribe_chunk(stream.context, chunk)
        stream.decoded_text += transcribed[0]

    return stream, stream.decoded_text

# NOTE: latency seems relatively high, which may be due to this:
# https://github.com/gradio-app/gradio/issues/6526
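
# The "state" entries in the inputs and outputs below wire the value
# returned by `transcribe` back into its `stream` argument on the next call.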
demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", "text"],
    live=True,
)

demo.launch(server_name=args.ip, server_port=args.port)
```
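
Once it is running, open `http://localhost:9431` in your browser (substituting whatever `--ip` and `--port` you passed) and allow microphone access.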

</details>

### Inference on GPU
To perform inference on the GPU, add `run_opts={"device":"cuda"}` when calling the `from_hparams` method.
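
For example, a minimal sketch using the same streaming model as above:

```python
from speechbrain.inference.ASR import StreamingASR

asr = StreamingASR.from_hparams(
    "speechbrain/asr-streaming-conformer-librispeech",
    run_opts={"device": "cuda"},  # moves the model and inference to the GPU
)
```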