metadata
language:
- en
- zh
- de
- es
- ru
- ko
- fr
- ja
- pt
- tr
- pl
- ca
- nl
- ar
- sv
- it
- id
- hi
- fi
- vi
- he
- uk
- el
- ms
- cs
- ro
- da
- hu
- ta
- 'no'
- th
- ur
- hr
- bg
- lt
- la
- mi
- ml
- cy
- sk
- te
- fa
- lv
- bn
- sr
- az
- sl
- kn
- et
- mk
- br
- eu
- is
- hy
- ne
- mn
- bs
- kk
- sq
- sw
- gl
- mr
- pa
- si
- km
- sn
- yo
- so
- af
- oc
- ka
- be
- tg
- sd
- gu
- am
- yi
- lo
- uz
- fo
- ht
- ps
- tk
- nn
- mt
- sa
- lb
- my
- bo
- tl
- mg
- as
- tt
- haw
- ln
- ha
- ba
- jw
- su
tags:
- audio
- automatic-speech-recognition
- hf-asr-leaderboard
pipeline_tag: automatic-speech-recognition
license: apache-2.0
license_link: https://choosealicense.com/licenses/apache-2.0/
whisper-large-v3-fp16-ov
- Model creator: OpenAI
- Original model: whisper-large-v3
Description
This is whisper-large-v3 model converted to the OpenVINO™ IR (Intermediate Representation) format with weights compressed to FP16.
Compatibility
The provided OpenVINO™ IR model is compatible with:
- OpenVINO version 2025.2.0 and higher
- Optimum Intel 1.23.0 and higher
optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3 --weight-format fp16 whisper-large-v3-fp16-ov
```python
#!/usr/bin/env python3
import time
import requests
import openvino_genai
import librosa
from pathlib import Path
from huggingface_hub import snapshot_download
def download_model(model_id="FluidInference/whisper-large-v3-turbo-int8-ov-npu"):
    """Download an OpenVINO model snapshot from the Hugging Face Hub.

    Args:
        model_id: Hub repo id; the text after the final "/" becomes the
            local directory name under ./models.

    Returns:
        str: path to the local model directory (reused when already populated).
    """
    local_dir = Path("models") / model_id.split("/")[-1]
    # Skip the download if a previous run already populated the directory.
    if local_dir.exists() and any(local_dir.iterdir()):
        return str(local_dir)
    print("Downloading model...")
    # NOTE: local_dir_use_symlinks is deprecated (and ignored) in recent
    # huggingface_hub releases — real files land in local_dir by default —
    # so the argument is no longer passed.
    snapshot_download(
        repo_id=model_id,
        local_dir=str(local_dir),
    )
    return str(local_dir)
def download_hf_audio_samples():
    """Download demo speech clips from the Hugging Face CDN.

    Files already present in ./sample_audios are reused without re-download.

    Returns:
        list[str]: paths of every sample available locally after the call.
    """
    samples_dir = Path("sample_audios")
    samples_dir.mkdir(exist_ok=True)
    downloaded = []
    whisper_samples = [
        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
    ]
    for url, filename in whisper_samples:
        filepath = samples_dir / filename
        if filepath.exists():
            downloaded.append(str(filepath))
            continue
        try:
            # Some CDNs reject requests without a browser-like User-Agent.
            # Timeout added so a stalled connection cannot hang forever.
            response = requests.get(
                url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30
            )
            response.raise_for_status()
            filepath.write_bytes(response.content)
            downloaded.append(str(filepath))
        except Exception as e:
            # Bug fix: report which file failed instead of "(unknown)".
            print(f"Error downloading {filename}: {e}")
    return downloaded
def read_audio(filepath):
    """Load an audio file resampled to mono 16 kHz.

    Returns the samples as a plain list of floats, or None when the file
    cannot be decoded.
    """
    try:
        samples, _ = librosa.load(filepath, sr=16000)
    except Exception as err:
        print(f"Error reading {filepath}: {err}")
        return None
    return samples.tolist()
def test_whisper_on_file(pipe, filepath):
    """Transcribe one audio file and collect timing statistics.

    Args:
        pipe: a WhisperPipeline-compatible object (openvino_genai).
        filepath: path to the audio file to transcribe.

    Returns:
        dict with "file", "duration", "inference_time", "rtf" (real-time
        factor) and "transcription" keys, or None when the audio cannot
        be read.
    """
    gen_config = pipe.get_generation_config()
    gen_config.language = "<|en|>"
    gen_config.task = "transcribe"
    gen_config.return_timestamps = True
    gen_config.max_new_tokens = 448

    samples = read_audio(filepath)
    if samples is None:
        return None

    audio_seconds = len(samples) / 16000  # mono samples at 16 kHz
    t0 = time.time()
    transcription = pipe.generate(samples, gen_config)
    elapsed = time.time() - t0

    return {
        "file": filepath,
        "duration": audio_seconds,
        "inference_time": elapsed,
        "rtf": elapsed / audio_seconds,
        "transcription": str(transcription),
    }
def main():
    """Benchmark the Whisper model on the NPU across a set of audio files."""
    model_path = download_model()

    # Time pipeline construction separately: NPU model compilation is slow.
    print("\nInitializing NPU...")
    t0 = time.time()
    pipe = openvino_genai.WhisperPipeline(model_path, "NPU")
    init_time = time.time() - t0

    # Gather candidate audio: local .wav files, the C sample dir, HF demos.
    test_files = list(Path(".").glob("*.wav"))
    sample_dir = Path("samples/c/whisper_speech_recognition")
    if sample_dir.exists():
        test_files.extend(sample_dir.glob("*.wav"))
    test_files.extend(Path(f) for f in download_hf_audio_samples())

    print(f"\nTesting {len(test_files)} files...")
    results = []
    for audio_file in test_files:
        outcome = test_whisper_on_file(pipe, str(audio_file))
        if outcome:
            results.append(outcome)
            print(f"[OK] {Path(outcome['file']).name}: RTF={outcome['rtf']:.2f}x")

    if not results:
        return

    # Aggregate throughput: RTF < 1 means faster than real time.
    total_duration = sum(r["duration"] for r in results)
    total_inference = sum(r["inference_time"] for r in results)
    avg_rtf = total_inference / total_duration
    verdict = "[Faster than real-time]" if avg_rtf < 1 else "[Slower than real-time]"
    print(f"\n{'='*50}")
    print("NPU Performance Summary")
    print(f"{'='*50}")
    print(f"Model load time: {init_time:.1f}s")
    print(f"Files tested: {len(results)}")
    print(f"Total audio: {total_duration:.1f}s")
    print(f"Total inference: {total_inference:.1f}s")
    print(f"Average RTF: {avg_rtf:.2f}x {verdict}")
    print("\nResults:")
    for r in results:
        transcript = r["transcription"].strip()
        if len(transcript) > 60:
            transcript = transcript[:57] + "..."
        print(f"- {Path(r['file']).name}: \"{transcript}\"")


if __name__ == "__main__":
    main()