---
language:
  - en
  - zh
  - de
  - es
  - ru
  - ko
  - fr
  - ja
  - pt
  - tr
  - pl
  - ca
  - nl
  - ar
  - sv
  - it
  - id
  - hi
  - fi
  - vi
  - he
  - uk
  - el
  - ms
  - cs
  - ro
  - da
  - hu
  - ta
  - 'no'
  - th
  - ur
  - hr
  - bg
  - lt
  - la
  - mi
  - ml
  - cy
  - sk
  - te
  - fa
  - lv
  - bn
  - sr
  - az
  - sl
  - kn
  - et
  - mk
  - br
  - eu
  - is
  - hy
  - ne
  - mn
  - bs
  - kk
  - sq
  - sw
  - gl
  - mr
  - pa
  - si
  - km
  - sn
  - yo
  - so
  - af
  - oc
  - ka
  - be
  - tg
  - sd
  - gu
  - am
  - yi
  - lo
  - uz
  - fo
  - ht
  - ps
  - tk
  - nn
  - mt
  - sa
  - lb
  - my
  - bo
  - tl
  - mg
  - as
  - tt
  - haw
  - ln
  - ha
  - ba
  - jw
  - su
tags:
  - audio
  - automatic-speech-recognition
  - hf-asr-leaderboard
pipeline_tag: automatic-speech-recognition
license: apache-2.0
license_link: https://choosealicense.com/licenses/apache-2.0/
base_model:
  - openai/whisper-large-v3-turbo
---

# whisper-large-v3-turbo-int4-ov-npu

## Description

This is the [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) model converted to the OpenVINO™ IR (Intermediate Representation) format, with weights compressed to INT4 for NPU deployment.

## Compatibility

The provided OpenVINO™ IR model is compatible with:

- OpenVINO version 2025.2.0 and higher
- Optimum Intel 1.23.0 and higher
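
## Inference with Optimum Intel

Since the IR loads through Optimum Intel, inference can also go through the standard `transformers` API. A minimal sketch, assuming the repo ships the processor/tokenizer files alongside the IR and that a 16 kHz-decodable `sample.wav` exists locally (`pip install optimum[openvino] librosa`):

```python
import librosa
from optimum.intel import OVModelForSpeechSeq2Seq
from transformers import AutoProcessor

model_id = "FluidInference/whisper-large-v3-turbo-int4-ov-npu"
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForSpeechSeq2Seq.from_pretrained(model_id)

# Whisper expects 16 kHz mono input; librosa resamples on load.
speech, _ = librosa.load("sample.wav", sr=16000)
inputs = processor(speech, sampling_rate=16000, return_tensors="pt")

generated = model.generate(inputs.input_features)
print(processor.batch_decode(generated, skip_special_tokens=True)[0])
```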
## Conversion

The IR above was produced from the original checkpoint with `optimum-cli`:

```sh
optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int4 --disable-stateful whisper-large-v3-turbo-int4-ov
```
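Here `--weight-format int4` applies 4-bit weight compression at export time, and `--disable-stateful` exports the stateless model variant that NPU inference with OpenVINO GenAI generally requires.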
## Inference on NPU with OpenVINO GenAI

The script below (requires `openvino-genai`, `librosa`, `requests`, and `huggingface_hub`) downloads the model and a couple of public speech samples, transcribes every audio file it finds on the NPU via `openvino_genai.WhisperPipeline`, and reports the real-time factor (RTF) per file:

```python
#!/usr/bin/env python3
import time
import requests
import openvino_genai
import librosa
from pathlib import Path
from huggingface_hub import snapshot_download


def download_model(model_id="FluidInference/whisper-large-v3-turbo-int4-ov-npu"):
    """Download model from HuggingFace Hub"""
    local_dir = Path("models") / model_id.split("/")[-1]

    if local_dir.exists() and any(local_dir.iterdir()):
        return str(local_dir)

    print(f"Downloading model...")
    snapshot_download(
        repo_id=model_id,
        local_dir=str(local_dir),
        local_dir_use_symlinks=False
    )
    return str(local_dir)


def download_hf_audio_samples():
    """Download audio samples from Hugging Face"""
    samples_dir = Path("sample_audios")
    samples_dir.mkdir(exist_ok=True)

    downloaded = []
    whisper_samples = [
        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
    ]

    for url, filename in whisper_samples:
        filepath = samples_dir / filename
        if filepath.exists():
            downloaded.append(str(filepath))
            continue

        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                f.write(response.content)

            downloaded.append(str(filepath))
        except Exception as e:
            print(f"Error downloading {filename}: {e}")

    return downloaded


def read_audio(filepath):
    """Read audio file and convert to 16kHz"""
    try:
        raw_speech, _ = librosa.load(filepath, sr=16000)
        return raw_speech.tolist()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None


def test_whisper_on_file(pipe, filepath):
    """Test Whisper on a single audio file"""
    config = pipe.get_generation_config()
    config.language = "<|en|>"
    config.task = "transcribe"
    config.return_timestamps = True
    config.max_new_tokens = 448

    raw_speech = read_audio(filepath)
    if raw_speech is None:
        return None

    duration = len(raw_speech) / 16000

    start_time = time.time()
    result = pipe.generate(raw_speech, config)
    inference_time = time.time() - start_time

    return {
        "file": filepath,
        "duration": duration,
        "inference_time": inference_time,
        "rtf": inference_time/duration,
        "transcription": str(result)
    }


def main():
    # Download model
    model_path = download_model()

    # Initialize pipeline on NPU
    print(f"\nInitializing NPU...")
    start_time = time.time()
    pipe = openvino_genai.WhisperPipeline(model_path, "NPU")
    init_time = time.time() - start_time

    results = []

    # Collect test files
    test_files = []
    test_files.extend(Path(".").glob("*.wav"))

    if Path("samples/c/whisper_speech_recognition").exists():
        test_files.extend(Path("samples/c/whisper_speech_recognition").glob("*.wav"))

    # Download HF samples
    hf_samples = download_hf_audio_samples()
    test_files.extend([Path(f) for f in hf_samples])

    # Test all files
    print(f"\nTesting {len(test_files)} files...")
    for audio_file in test_files:
        result = test_whisper_on_file(pipe, str(audio_file))
        if result:
            results.append(result)
            print(f"[OK] {Path(result['file']).name}: RTF={result['rtf']:.2f}x")

    # Print summary
    if results:
        total_duration = sum(r["duration"] for r in results)
        total_inference = sum(r["inference_time"] for r in results)
        avg_rtf = total_inference / total_duration

        print(f"\n{'='*50}")
        print(f"NPU Performance Summary")
        print(f"{'='*50}")
        print(f"Model load time: {init_time:.1f}s")
        print(f"Files tested: {len(results)}")
        print(f"Total audio: {total_duration:.1f}s")
        print(f"Total inference: {total_inference:.1f}s")
        print(f"Average RTF: {avg_rtf:.2f}x {'[Faster than real-time]' if avg_rtf < 1 else '[Slower than real-time]'}")

        print(f"\nResults:")
        for r in results:
            trans = r['transcription'].strip()
            if len(trans) > 60:
                trans = trans[:57] + "..."
            print(f"- {Path(r['file']).name}: \"{trans}\"")


if __name__ == "__main__":
    main()
```
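
The second argument to `WhisperPipeline` is a standard OpenVINO device name, so swapping `"NPU"` for `"CPU"` or `"GPU"` runs the same script on another device. Keep in mind that this stateless INT4 export is prepared specifically with the NPU in mind; other devices may be better served by the stock stateful export.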