Update README.md
README.md CHANGED
````diff
@@ -239,7 +239,6 @@ pipe = pipeline(
     model_kwargs=model_kwargs
 )
 
-
 # load sample audio & downsample to 16kHz
 dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
 
````
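The quick-start section touched by this first hunk ends with the ReazonSpeech test split loaded; as a minimal sketch (not part of the commit), a single sample could then be transcribed with the `pipe` object named in the hunk header. The `generate_kwargs` values here are an assumption, mirroring the evaluation example below.

```python
# Sketch only: transcribe the first test sample with the pipeline built above.
# The language/task prompts are assumed to match the evaluation example below.
sample = dataset[0]["audio"]
result = pipe(sample, generate_kwargs={"language": "japanese", "task": "transcribe"})
print(result["text"])
```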
````diff
@@ -296,60 +295,45 @@ pip install --upgrade transformers datasets[audio] evaluate jiwer
 Evaluation can then be run end-to-end with the following example:
 
 ```python
-from
+from tqdm import tqdm
+
+import torch
+from transformers import pipeline
 from datasets import load_dataset, Audio
 from evaluate import load
-import torch
-from tqdm import tqdm
 
-# config
+# model config
 model_id = "kotoba-tech/kotoba-whisper-v1.0"
-dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
+generate_kwargs = {"language": "japanese", "task": "transcribe"}
+
+# data config
+generate_kwargs = {"language": "japanese", "task": "transcribe"}
+dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
 audio_column = 'audio'
 text_column = 'transcription'
-batch_size = 16
 
 # load model
-
-
-
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model_id,
+    torch_dtype=torch_dtype,
+    device=device,
+    model_kwargs=model_kwargs,
+    batch_size=16
+)
 
 # load the dataset and sample the audio with 16kHz
 dataset = load_dataset(dataset_name, split="test")
-
-
-
-
-def inference(batch):
-    # 1. Pre-process the audio data to log-mel spectrogram inputs
-    audio = [sample["array"] for sample in batch["audio"]]
-    input_features = processor(audio, sampling_rate=batch["audio"][0]["sampling_rate"], return_tensors="pt").input_features
-    input_features = input_features.to(device, dtype=torch_dtype)
-    # 2. Auto-regressively generate the predicted token ids
-    pred_ids = model.generate(input_features, language="ja", max_new_tokens=128)
-    # 3. Decode the token ids to the final transcription
-    batch["transcription"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
-    batch["reference"] = batch[text_column]
-    return batch
-
-dataset = dataset.map(function=inference, batched=True, batch_size=batch_size)
-
-# iterate over the dataset and run inference
-all_transcriptions = []
-all_references = []
-for result in tqdm(dataset, desc="Evaluating..."):
-    all_transcriptions.append(result["transcription"])
-    all_references.append(result["reference"])
-
-# normalize predictions and references
-all_transcriptions = [transcription.replace(" ", "") for transcription in all_transcriptions]
-all_references = [reference.replace(" ", "") for reference in all_references]
+transcriptions = pipe(dataset['audio'])
+transcriptions = [i['text'].replace(" ", "") for i in transcriptions]
+references = [i.replace(" ", "") for i in dataset['transcription']]
 
 # compute the CER metric
 cer_metric = load("cer")
-cer = 100 * cer_metric.compute(predictions=all_transcriptions, references=all_references)
+cer = 100 * cer_metric.compute(predictions=transcriptions, references=references)
 print(cer)
 ```
 
````
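In the new evaluation snippet, `Audio` is imported but no explicit resampling appears in the added lines, and `generate_kwargs` is defined without being visibly passed to the pipeline call. A minimal sketch of how both could be wired in, assuming the dataset is not guaranteed to be stored at 16 kHz (an illustration, not part of the commit):

```python
from datasets import Audio

# Sketch only: decode the audio column at 16 kHz, Whisper's expected sampling rate,
# in case the dataset is stored at a different rate.
dataset = dataset.cast_column(audio_column, Audio(sampling_rate=16000))

# Pass the language/task prompts defined above explicitly when running inference.
transcriptions = pipe(dataset[audio_column], generate_kwargs=generate_kwargs)
```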