# Setup:
#   pip install "pyannote.audio>=3.1"
# Requirement: submit an access request for the following gated models
# (and authenticate, e.g. via `huggingface-cli login` or the `use_auth_token`
# argument of `Pipeline.from_pretrained`):
#   https://huggingface.co/pyannote/speaker-diarization-3.1
#   https://huggingface.co/pyannote/segmentation-3.0
from typing import Dict, List, Optional, Union

import numpy as np
import soundfile as sf
import torch
from pyannote.audio import Pipeline


class SpeakerDiarization:
    def __init__(self, model_id: str):
        self.pipeline = Pipeline.from_pretrained(model_id)

    def __call__(self,
                 audio: Union[str, torch.Tensor, np.ndarray],
                 sampling_rate: Optional[int] = None) -> Dict[str, List[List[float]]]:
        if isinstance(audio, (torch.Tensor, np.ndarray)):
            if sampling_rate is None:
                raise ValueError("sampling_rate must be provided when audio is an array")
            audio = torch.as_tensor(audio, dtype=torch.float32)
            if audio.ndim == 1:
                # mono: (time,) -> (channel, time)
                audio = audio.unsqueeze(0)
            elif audio.ndim > 2:
                raise ValueError("audio shape must be (channel, time)")
            audio = {"waveform": audio, "sample_rate": sampling_rate}
        output = self.pipeline(audio)
        # dictionary: {speaker_id: [[start, end], ...]}
        return {s: [[seg.start, seg.end] for seg in output.label_timeline(s)]
                for s in output.labels()}


pipeline = SpeakerDiarization("pyannote/speaker-diarization-3.1")

root_dir = "/Users/asahiu/Desktop"
sample_audio_files = ["speaker_diariazation_sample_1.wav",
                      "speaker_diariazation_sample_2.wav"]

for sample_audio_file in sample_audio_files:
    print(sample_audio_file)
    # soundfile returns (time,) for mono or (time, channels) for multi-channel
    # audio, so transpose multi-channel input to the (channel, time) layout
    # expected by the pipeline.
    a, sr = sf.read(f"{root_dir}/{sample_audio_file}")
    if a.ndim == 2:
        a = a.T
    output = pipeline(a, sampling_rate=sr)
    print(output)
    output = pipeline(f"{root_dir}/{sample_audio_file}")
    print(output)
    print()
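
# Example of consuming the {speaker_id: [[start, end], ...]} dictionary that
# SpeakerDiarization.__call__ returns. A minimal sketch using only the plain
# Python structure above; `total_talk_time` is a hypothetical helper for
# illustration, not part of pyannote.audio.
def total_talk_time(diarization: Dict[str, List[List[float]]]) -> Dict[str, float]:
    """Sum the duration of each speaker's segments, in seconds."""
    return {speaker: sum(end - start for start, end in segments)
            for speaker, segments in diarization.items()}

# Usage, e.g. on the last `output` from the loop above:
#   print(total_talk_time(output))  # e.g. {'SPEAKER_00': 12.3, 'SPEAKER_01': 8.7}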