|
import os |
|
|
|
from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio |
|
from faster_whisper.tokenizer import Tokenizer |
|
from faster_whisper.transcribe import get_suppressed_tokens |
|
|
|
|
|
def test_supported_languages(): |
|
model = WhisperModel("tiny.en") |
|
assert model.supported_languages == ["en"] |
|
|
|
|
|
def test_transcribe(jfk_path): |
|
model = WhisperModel("tiny") |
|
segments, info = model.transcribe(jfk_path, word_timestamps=True) |
|
assert info.all_language_probs is not None |
|
|
|
assert info.language == "en" |
|
assert info.language_probability > 0.9 |
|
assert info.duration == 11 |
|
|
|
|
|
|
|
top_lang, top_lang_score = info.all_language_probs[0] |
|
assert info.language == top_lang |
|
assert abs(info.language_probability - top_lang_score) < 1e-16 |
|
|
|
segments = list(segments) |
|
|
|
assert len(segments) == 1 |
|
|
|
segment = segments[0] |
|
|
|
assert segment.text == ( |
|
" And so my fellow Americans ask not what your country can do for you, " |
|
"ask what you can do for your country." |
|
) |
|
|
|
assert segment.text == "".join(word.word for word in segment.words) |
|
assert segment.start == segment.words[0].start |
|
assert segment.end == segment.words[-1].end |
|
batched_model = BatchedInferencePipeline(model=model, use_vad_model=False) |
|
result, info = batched_model.transcribe(jfk_path, word_timestamps=True) |
|
assert info.language == "en" |
|
assert info.language_probability > 0.7 |
|
segments = [] |
|
for segment in result: |
|
segments.append( |
|
{"start": segment.start, "end": segment.end, "text": segment.text} |
|
) |
|
|
|
assert len(segments) == 1 |
|
assert segment.text == ( |
|
" And so my fellow Americans ask not what your country can do for you, " |
|
"ask what you can do for your country." |
|
) |
|
|
|
|
|
def test_batched_transcribe(physcisworks_path): |
|
model = WhisperModel("tiny") |
|
batched_model = BatchedInferencePipeline(model=model) |
|
result, info = batched_model.transcribe(physcisworks_path, batch_size=16) |
|
assert info.language == "en" |
|
assert info.language_probability > 0.7 |
|
segments = [] |
|
for segment in result: |
|
segments.append( |
|
{"start": segment.start, "end": segment.end, "text": segment.text} |
|
) |
|
|
|
assert len(segments) == 8 |
|
|
|
result, info = batched_model.transcribe( |
|
physcisworks_path, |
|
batch_size=16, |
|
without_timestamps=False, |
|
word_timestamps=True, |
|
) |
|
segments = [] |
|
for segment in result: |
|
assert segment.words is not None |
|
segments.append( |
|
{"start": segment.start, "end": segment.end, "text": segment.text} |
|
) |
|
assert len(segments) > 8 |
|
|
|
|
|
def test_prefix_with_timestamps(jfk_path): |
|
model = WhisperModel("tiny") |
|
segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans") |
|
segments = list(segments) |
|
|
|
assert len(segments) == 1 |
|
|
|
segment = segments[0] |
|
|
|
assert segment.text == ( |
|
" And so my fellow Americans ask not what your country can do for you, " |
|
"ask what you can do for your country." |
|
) |
|
|
|
assert segment.start == 0 |
|
assert 10 < segment.end < 11 |
|
|
|
|
|
def test_vad(jfk_path): |
|
model = WhisperModel("tiny") |
|
segments, info = model.transcribe( |
|
jfk_path, |
|
vad_filter=True, |
|
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200), |
|
) |
|
segments = list(segments) |
|
|
|
assert len(segments) == 1 |
|
segment = segments[0] |
|
|
|
assert segment.text == ( |
|
" And so my fellow Americans ask not what your country can do for you, " |
|
"ask what you can do for your country." |
|
) |
|
|
|
assert 0 < segment.start < 1 |
|
assert 10 < segment.end < 11 |
|
|
|
assert info.vad_options.min_silence_duration_ms == 500 |
|
assert info.vad_options.speech_pad_ms == 200 |
|
|
|
|
|
def test_stereo_diarization(data_dir): |
|
model = WhisperModel("tiny") |
|
|
|
audio_path = os.path.join(data_dir, "stereo_diarization.wav") |
|
left, right = decode_audio(audio_path, split_stereo=True) |
|
|
|
segments, _ = model.transcribe(left) |
|
transcription = "".join(segment.text for segment in segments).strip() |
|
assert transcription == ( |
|
"He began a confused complaint against the wizard, " |
|
"who had vanished behind the curtain on the left." |
|
) |
|
|
|
segments, _ = model.transcribe(right) |
|
transcription = "".join(segment.text for segment in segments).strip() |
|
assert transcription == "The horizon seems extremely distant." |
|
|
|
|
|
def test_multisegment_lang_id(physcisworks_path): |
|
model = WhisperModel("tiny") |
|
language_info = model.detect_language_multi_segment(physcisworks_path) |
|
assert language_info["language_code"] == "en" |
|
assert language_info["language_confidence"] > 0.8 |
|
|
|
|
|
def test_suppressed_tokens_minus_1(): |
|
model = WhisperModel("tiny.en") |
|
|
|
tokenizer = Tokenizer(model.hf_tokenizer, False) |
|
tokens = get_suppressed_tokens(tokenizer, [-1]) |
|
assert tokens == ( |
|
1, |
|
2, |
|
7, |
|
8, |
|
9, |
|
10, |
|
14, |
|
25, |
|
26, |
|
27, |
|
28, |
|
29, |
|
31, |
|
58, |
|
59, |
|
60, |
|
61, |
|
62, |
|
63, |
|
90, |
|
91, |
|
92, |
|
93, |
|
357, |
|
366, |
|
438, |
|
532, |
|
685, |
|
705, |
|
796, |
|
930, |
|
1058, |
|
1220, |
|
1267, |
|
1279, |
|
1303, |
|
1343, |
|
1377, |
|
1391, |
|
1635, |
|
1782, |
|
1875, |
|
2162, |
|
2361, |
|
2488, |
|
3467, |
|
4008, |
|
4211, |
|
4600, |
|
4808, |
|
5299, |
|
5855, |
|
6329, |
|
7203, |
|
9609, |
|
9959, |
|
10563, |
|
10786, |
|
11420, |
|
11709, |
|
11907, |
|
13163, |
|
13697, |
|
13700, |
|
14808, |
|
15306, |
|
16410, |
|
16791, |
|
17992, |
|
19203, |
|
19510, |
|
20724, |
|
22305, |
|
22935, |
|
27007, |
|
30109, |
|
30420, |
|
33409, |
|
34949, |
|
40283, |
|
40493, |
|
40549, |
|
47282, |
|
49146, |
|
50257, |
|
50357, |
|
50358, |
|
50359, |
|
50360, |
|
) |
|
|
|
|
|
def test_suppressed_tokens_minus_value(): |
|
model = WhisperModel("tiny.en") |
|
|
|
tokenizer = Tokenizer(model.hf_tokenizer, False) |
|
tokens = get_suppressed_tokens(tokenizer, [13]) |
|
assert tokens == (13, 50257, 50357, 50358, 50359, 50360) |
|
|