---
license: apache-2.0
datasets:
- mozilla-foundation/common_voice_17_0
language:
- uz
metrics:
- wer
base_model: facebook/wav2vec2-base-960h
pipeline_tag: automatic-speech-recognition
library_name: adapter-transformers
---

# Author

Mamayusupov Rifat.

# Usage

```python
# Usage example: load the fine-tuned CTC model and transcribe one audio file.
import torch
from transformers import (
    SeamlessM4TFeatureExtractor,
    Wav2Vec2BertForCTC,
    Wav2Vec2BertProcessor,
    Wav2Vec2CTCTokenizer,
    pipeline,
)

# Location of the fine-tuned checkpoint (local directory or Hub repo id).
# NOTE(review): the original snippet read this from an undefined
# `args.pretrained_model`; it is assumed here to be the same directory the
# tokenizer is loaded from -- change it if your checkpoint lives elsewhere.
pretrained_model = "/home/rifat/asr"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/home/rifat/asr",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Initialize feature extractor
feature_extractor = SeamlessM4TFeatureExtractor(
    feature_size=80,
    num_mel_bins=80,
    sampling_rate=16000,
    padding_value=0.0,
)

# Initialize processor (bundles the feature extractor and the tokenizer)
processor = Wav2Vec2BertProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Initialize model
model = Wav2Vec2BertForCTC.from_pretrained(
    pretrained_model,
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.0,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    add_adapter=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    ignore_mismatched_sizes=True,
)

model.config.ctc_zero_infinity = True
model.to(device)

# Perform inference
# Initialize the pipeline; the device is passed explicitly so the pipeline
# places its inputs on the same device as the model.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=feature_extractor,
    device=device,
)

# Replace with the path to the audio file you want to transcribe.
input_audio = "path/to/audio.wav"

# The speech-recognition pipeline returns a dict; the transcription is stored
# under the "text" key.
print(pipe(input_audio)["text"])
```