FluidInference
/

whisper-large-v3-turbo-int8-ov-npu

@@ -125,4 +125,159 @@ The provided OpenVINO™ IR model is compatible with:
 ```bash
 optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int8 --disable-stateful whisper-large-v3-turbo-int8-ov
 ```

 ```bash
 optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int8 --disable-stateful whisper-large-v3-turbo-int8-ov
+```
+```python
+#!/usr/bin/env python3
+import time
+import requests
+import openvino_genai
+import librosa
+from pathlib import Path
+from huggingface_hub import snapshot_download
+def download_model(model_id="FluidInference/whisper-large-v3-turbo-int8-ov-npu"):
+    """Download model from HuggingFace Hub"""
+    local_dir = Path("models") / model_id.split("/")[-1]
+    if local_dir.exists() and any(local_dir.iterdir()):
+        return str(local_dir)
+    print(f"Downloading model...")
+    snapshot_download(
+        repo_id=model_id,
+        local_dir=str(local_dir),
+        local_dir_use_symlinks=False
+    )
+    return str(local_dir)
+def download_hf_audio_samples():
+    """Download audio samples from Hugging Face"""
+    samples_dir = Path("sample_audios")
+    samples_dir.mkdir(exist_ok=True)
+    downloaded = []
+    whisper_samples = [
+        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
+        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
+    ]
+    for url, filename in whisper_samples:
+        filepath = samples_dir / filename
+        if filepath.exists():
+            downloaded.append(str(filepath))
+            continue
+        try:
+            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+            response.raise_for_status()
+            with open(filepath, 'wb') as f:
+                f.write(response.content)
+            downloaded.append(str(filepath))
+        except Exception as e:
+            print(f"Error downloading {filename}: {e}")
+    return downloaded
+def read_audio(filepath):
+    """Read audio file and convert to 16kHz"""
+    try:
+        raw_speech, _ = librosa.load(filepath, sr=16000)
+        return raw_speech.tolist()
+    except Exception as e:
+        print(f"Error reading {filepath}: {e}")
+        return None
+def test_whisper_on_file(pipe, filepath):
+    """Test Whisper on a single audio file"""
+    config = pipe.get_generation_config()
+    config.language = "<|en|>"
+    config.task = "transcribe"
+    config.return_timestamps = True
+    config.max_new_tokens = 448
+    raw_speech = read_audio(filepath)
+    if raw_speech is None:
+        return None
+    duration = len(raw_speech) / 16000
+    start_time = time.time()
+    result = pipe.generate(raw_speech, config)
+    inference_time = time.time() - start_time
+    return {
+        "file": filepath,
+        "duration": duration,
+        "inference_time": inference_time,
+        "rtf": inference_time/duration,
+        "transcription": str(result)
+    }
+def main():
+    # Download model
+    model_path = download_model()
+    # Initialize pipeline on NPU
+    print(f"\nInitializing NPU...")
+    start_time = time.time()
+    pipe = openvino_genai.WhisperPipeline(model_path, "NPU")
+    init_time = time.time() - start_time
+    results = []
+    # Collect test files
+    test_files = []
+    test_files.extend(Path(".").glob("*.wav"))
+    if Path("samples/c/whisper_speech_recognition").exists():
+        test_files.extend(Path("samples/c/whisper_speech_recognition").glob("*.wav"))
+    # Download HF samples
+    hf_samples = download_hf_audio_samples()
+    test_files.extend([Path(f) for f in hf_samples])
+    # Test all files
+    print(f"\nTesting {len(test_files)} files...")
+    for audio_file in test_files:
+        result = test_whisper_on_file(pipe, str(audio_file))
+        if result:
+            results.append(result)
+            print(f"[OK] {Path(result['file']).name}: RTF={result['rtf']:.2f}x")
+    # Print summary
+    if results:
+        total_duration = sum(r["duration"] for r in results)
+        total_inference = sum(r["inference_time"] for r in results)
+        avg_rtf = total_inference / total_duration
+        print(f"\n{'='*50}")
+        print(f"NPU Performance Summary")
+        print(f"{'='*50}")
+        print(f"Model load time: {init_time:.1f}s")
+        print(f"Files tested: {len(results)}")
+        print(f"Total audio: {total_duration:.1f}s")
+        print(f"Total inference: {total_inference:.1f}s")
+        print(f"Average RTF: {avg_rtf:.2f}x {'[Faster than real-time]' if avg_rtf < 1 else '[Slower than real-time]'}")
+        print(f"\nResults:")
+        for r in results:
+            trans = r['transcription'].strip()
+            if len(trans) > 60:
+                trans = trans[:57] + "..."
+            print(f"- {Path(r['file']).name}: \"{trans}\"")
+# Run the benchmark only when this file is executed directly as a script.
+if __name__ == "__main__":
+    main()
 ```