kotoba-tech
/

kotoba-whisper-v1.0-faster

Automatic Speech Recognition

Model card · Files and versions

kotoba-whisper-v1.0-faster / benchmark.sh

asahi417's picture

Update benchmark.sh

7ec3fd2 verified over 1 year ago

2.21 kB

	# Benchmark kotoba-whisper-v1.0-faster on the kotoba-whisper-eval audio files.
	#
	# Steps: fetch the evaluation dataset, convert each mp3 to 16 kHz mono PCM wav,
	# warm the model cache, then time one transcription run per file and print a
	# summary of the elapsed wall-clock seconds.
	#
	# Requires: bash (uses the SECONDS variable), git, ffmpeg, and a python
	# interpreter with faster_whisper installed.
	# Results are left in TIME_INTERVIEW, TIME_MANZAI1, TIME_MANZAI2, TIME_MANZAI3.

	# Abort on the first failing step so a broken clone/conversion/transcription
	# cannot produce a timing that looks valid.
	set -euo pipefail

	MODEL_ID="kotoba-tech/kotoba-whisper-v1.0-faster"
	DATASET_DIR="kotoba-whisper-eval"
	AUDIO_DIR="${DATASET_DIR}/audio"

	# clone dataset (skipped when it is already present so the script can be rerun)
	if [[ ! -d "${DATASET_DIR}" ]]; then
	  git clone https://huggingface.co/datasets/kotoba-tech/kotoba-whisper-eval
	fi

	# convert to 16khz mono 16-bit PCM; -y overwrites wav files left by a previous
	# run instead of stopping to ask for confirmation
	for name in long_interview_1 manzai1 manzai2 manzai3; do
	  ffmpeg -y -i "${AUDIO_DIR}/${name}.mp3" -ar 16000 -ac 1 -c:a pcm_s16le "${AUDIO_DIR}/${name}.wav"
	done

	# cache the model so the download is not counted in the first measurement
	python -c 'import sys; from faster_whisper import WhisperModel; model = WhisperModel(sys.argv[1])' "${MODEL_ID}"

	# Transcribe one wav file and print its segments as a Python list of
	# "[start -> end] text" strings.
	# Arguments: $1 - path to the wav file
	# Note: each call starts a new python process and constructs the model again,
	# so that construction time is part of every measurement below.
	transcribe_file() {
	  python -c 'import sys; from faster_whisper import WhisperModel; model = WhisperModel(sys.argv[1]); print(["[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text) for segment in model.transcribe(sys.argv[2], language="ja", chunk_length=15, condition_on_previous_text=False)[0]])' "${MODEL_ID}" "$1"
	}

	# SECONDS is maintained by bash: after assigning 0 it counts whole seconds
	# elapsed since the assignment.
	SECONDS=0
	transcribe_file "${AUDIO_DIR}/long_interview_1.wav"
	TIME_INTERVIEW=$SECONDS

	SECONDS=0
	transcribe_file "${AUDIO_DIR}/manzai1.wav"
	TIME_MANZAI1=$SECONDS

	SECONDS=0
	transcribe_file "${AUDIO_DIR}/manzai2.wav"
	TIME_MANZAI2=$SECONDS

	SECONDS=0
	transcribe_file "${AUDIO_DIR}/manzai3.wav"
	TIME_MANZAI3=$SECONDS

	# report the measurements; printf reuses the format for each name/value pair
	printf '%s\t%ss\n' \
	  long_interview_1 "${TIME_INTERVIEW}" \
	  manzai1 "${TIME_MANZAI1}" \
	  manzai2 "${TIME_MANZAI2}" \
	  manzai3 "${TIME_MANZAI3}"