BUT-FIT
/

wav2vec2-base_bart-base_voxpopuli-en

@@ -135,127 +135,49 @@ For detailed training logs, metrics, and visualizations, please refer to the Wei
 ## How to Use
-You can use this model for inference with the Hugging Face `transformers` library. Make sure you have `torchaudio` and `librosa` (or `soundfile`) installed for audio processing.
 ```python
 from transformers import SpeechEncoderDecoderModel, AutoProcessor
 import torch
-import soundfile as sf
-model_id = "matejhornik/wav2vec2-base_bart-base_voxpopuli-en"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load the processor (feature extractor and tokenizer)
-processor = AutoProcessor.from_pretrained(model_id)
-# Load the model
-model = SpeechEncoderDecoderModel.from_pretrained(model_id).to(device)
-def transcribe_audio(audio_path):
-    """Loads audio, processes it, and transcribes it."""
-    speech_array, sampling_rate = sf.read(audio_path)
-    # Ensure audio is 16kHz as expected by the model
-    if sampling_rate != processor.feature_extractor.sampling_rate:
-        raise ValueError(f"Audio sampling rate {sampling_rate} does not match model's required {processor.feature_extractor.sampling_rate}Hz. Please resample.")
-    # Preprocess the audio
-    inputs = processor(speech_array, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True)
-    input_features = inputs.input_features.to(device)
-    attention_mask = inputs.attention_mask.to(device)
-    # Generate transcription
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features, attention_mask=attention_mask, max_length=128)
-    # Decode the transcription
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription[0]
-# Example usage:
-audio_file_path = "path/to/your/audio.wav"
-try:
-   transcription = transcribe_audio(audio_file_path)
-   print(f"Transcription: {transcription}")
-except ValueError as e:
-   print(e)
-except FileNotFoundError:
-   print(f"Audio file not found at: {audio_file_path}. Please provide a valid path.")
-```
-## Reproducing Evaluation on VoxPopuli
-To reproduce the evaluation results on the VoxPopuli test set:
-```python
 from datasets import load_dataset
-from transformers import SpeechEncoderDecoderModel, AutoProcessor
-import torch
-from jiwer import wer
-from tqdm import tqdm
-model_id = "matejhornik/wav2vec2-base_bart-base_voxpopuli-en"
-dataset_name = "facebook/voxpopuli"
-dataset_config = "en"
-split = "test" # or "validation"
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load processor and model
-processor = AutoProcessor.from_pretrained(model_id)
-model = SpeechEncoderDecoderModel.from_pretrained(model_id).to(device)
-model.eval() # Set model to evaluation mode
-# Load dataset
-# Note: You might need to authenticate with Hugging Face if the dataset requires it
-# from huggingface_hub import login
-voxpopuli_test = load_dataset(dataset_name, dataset_config, split=split, streaming=False) # Set streaming=True for large datasets if memory is an issue
-# Preprocessing function
-def map_to_pred(batch):
-    # Ensure audio is in the correct format (array, 16kHz)
-    audio_data = batch["audio"]["array"]
-    sampling_rate = batch["audio"]["sampling_rate"]
-    if sampling_rate != processor.feature_extractor.sampling_rate:
-        print(f"Warning: Resampling needed or sample skipped for audio with rate {sampling_rate}")
-        # Dummy processing for now if rate mismatch
-        input_features = torch.zeros((1,1000)) # Placeholder
-    else:
-        inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-        input_features = inputs.input_features.to(device)
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features, max_length=128)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    batch["prediction"] = transcription[0]
-    batch["reference"] = batch["normalized_text"]
-    return batch
-predictions = []
-references = []
-for sample in tqdm(voxpopuli_test):
-    try:
-        processed_sample = map_to_pred(sample)
-        predictions.append(processed_sample["prediction"])
-        references.append(processed_sample["reference"])
-    except Exception as e:
-        print(f"Error processing sample: {e}")
-# Calculate WER
-if predictions and references:
-    current_wer = wer(references, predictions)
-    print(f"WER on {split} set: {current_wer:.4f}")
-else:
-    print("No samples processed or an error occurred.")
-# Expected WER on test set: 0.0885
-# Expected WER on validation set: 0.0855
 ```
 ### Framework Versions
 This model was trained using:
@@ -268,6 +190,8 @@ This model was trained using:
 - Evaluate: `^0.4.3`
 - WandB: `^0.19.7`
 ## Citation
 Citation
 If you use this model or findings from the thesis, please cite:
@@ -295,4 +219,6 @@ If you use this model or findings from the thesis, please cite:
 For questions, feedback, or collaboration opportunities related to this thesis or anything else, feel free to reach out:
 - **Email:** [email protected] / [email protected]
-- **GitHub:** [hornikmatej](https://github.com/hornikmatej)

 ## How to Use
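+You can use this model for inference with the Hugging Face `transformers` library. The example below also requires the `datasets` library, which it uses to stream a single sample from VoxPopuli.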
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hornikmatej/thesis_mit/blob/main/graphs/colab_ntb.ipynb)
 ```python
 from transformers import SpeechEncoderDecoderModel, AutoProcessor
 import torch
 from datasets import load_dataset
+MODEL_ID = "matejhornik/wav2vec2-base_bart-base_voxpopuli-en"
+DATASET_ID = "facebook/voxpopuli"
+DATASET_CONFIG = "en"
+DATASET_SPLIT = "test"  # or "validation"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+model = SpeechEncoderDecoderModel.from_pretrained(MODEL_ID).to(device)
+print(f"Using device: {device}\nStreaming one sample from '{DATASET_ID}' "
+f"(config: '{DATASET_CONFIG}', split: '{DATASET_SPLIT}')...")
+streamed_dataset = load_dataset(
+    DATASET_ID,
+    DATASET_CONFIG,
+    split=DATASET_SPLIT,
+    streaming=True,
+)
+sample = next(iter(streamed_dataset))
+audio_input = sample["audio"]["array"]
+input_sampling_rate = sample["audio"]["sampling_rate"]
+inputs = processor(audio_input, sampling_rate=input_sampling_rate, return_tensors="pt", padding=True)
+input_features = inputs.input_values.to(device)
+with torch.no_grad():
+    predicted_ids = model.generate(input_features, max_length=128)
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+print(f"\nOriginal: {sample['normalized_text']}")
+print(f"Transcribed: {transcription}")
 ```
 ### Framework Versions
 This model was trained using:
 - Evaluate: `^0.4.3`
 - WandB: `^0.19.7`
+See the [pyproject.toml](https://github.com/hornikmatej/thesis_mit/blob/main/pyproject.toml) file for a complete list of dependencies.
 ## Citation
 Citation
 If you use this model or findings from the thesis, please cite:
 For questions, feedback, or collaboration opportunities related to this thesis or anything else, feel free to reach out:
 - **Email:** [email protected] / [email protected]
+- **GitHub:** [hornikmatej](https://github.com/hornikmatej)