import torch
from transformers import pipeline

# Characters that mark the end of a sentence.
EOS_TOKENS = [".", "!", "?"]


def transcribe_audio(audio_fpath, max_snt_len=100):
    # Whisper ASR pipeline in fp16 on the first GPU.
    pipe = pipeline("automatic-speech-recognition",
                    "openai/whisper-small",
                    torch_dtype=torch.float16,
                    device="cuda:0")
    # Enable the BetterTransformer fast path (requires the optimum package).
    pipe.model = pipe.model.to_bettertransformer()
    outputs = pipe(audio_fpath,
                   chunk_length_s=30,
                   batch_size=8,
                   return_timestamps=True)

    # Merge the timestamped chunks into sentence-level segments.
    sentences = []
    snt_start = None
    snt = ""
    for segment in outputs["chunks"]:
        text = segment["text"].strip()
        if not text:
            # Skip empty chunks; indexing their last character would crash.
            continue
        snt += f'{segment["text"]} '
        start_time, end_time = segment["timestamp"]
        # Whisper occasionally returns an open-ended final timestamp.
        if end_time is None:
            end_time = start_time + 0.1
        # Compare against None explicitly: a start time of 0.0 is falsy,
        # so `if not snt_start` would wrongly reset the sentence start.
        if snt_start is None:
            snt_start = start_time
        # Close the sentence on end-of-sentence punctuation or once it
        # grows past max_snt_len characters.
        if text[-1] in EOS_TOKENS or len(snt) > max_snt_len:
            sentences.append(
                {"text": snt.strip(), "start": snt_start, "end": end_time}
            )
            snt_start = None
            snt = ""
    # Flush any trailing text that never hit an end-of-sentence token.
    if snt:
        sentences.append(
            {"text": snt.strip(), "start": snt_start, "end": end_time}
        )

    # Render one "start end text" line per sentence.
    timestamped_text = ""
    for sentence in sentences:
        timestamped_text += (
            f'{sentence["start"]} {sentence["end"]} {sentence["text"]}\n'
        )
    return timestamped_text
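

# A minimal usage sketch, not part of the original file. It assumes
# "sample.wav" is a local audio file readable by ffmpeg and that a CUDA
# GPU is available for the fp16 pipeline above.
if __name__ == "__main__":
    print(transcribe_audio("sample.wav"))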