import torch
from transformers import pipeline

# Sentence-ending punctuation used to split the transcript.
EOS_TOKENS = [".", "!", "?"]


def transcribe_audio(audio_fpath, max_snt_len=100):
    # Whisper ASR pipeline in fp16 on the first GPU.
    pipe = pipeline(
        "automatic-speech-recognition",
        "openai/whisper-small",
        torch_dtype=torch.float16,
        device="cuda:0",
    )
    # BetterTransformer fast attention (requires the optimum package).
    pipe.model = pipe.model.to_bettertransformer()

    # Transcribe in 30-second chunks and keep per-chunk timestamps.
    outputs = pipe(
        audio_fpath, chunk_length_s=30, batch_size=8, return_timestamps=True
    )

    # Merge the timestamped chunks into sentence-level segments.
    sentences = []
    snt_start = None
    snt = ""
    for segment in outputs["chunks"]:
        snt += f'{segment["text"]} '
        start_time, end_time = segment["timestamp"]
        # The final chunk can come back with an open-ended timestamp;
        # give it a small nominal duration.
        if end_time is None:
            end_time = start_time + 0.1
        if snt_start is None:  # "is None" so a 0.0 start time is not discarded
            snt_start = start_time
        # Close the sentence on end-of-sentence punctuation or when it grows too long.
        words = segment["text"].strip().split()
        ends_sentence = bool(words) and words[-1][-1] in EOS_TOKENS
        if ends_sentence or len(snt) > max_snt_len:
            sentences.append(
                {"text": snt.strip(), "start": snt_start, "end": end_time}
            )
            snt_start = None
            snt = ""

    # Flush any trailing text that never hit an end-of-sentence token.
    if len(snt) > 0:
        sentences.append(
            {"text": snt.strip(), "start": snt_start, "end": end_time}
        )

    # One line per sentence: "<start> <end> <text>".
    timestamped_text = ""
    for sentence in sentences:
        timestamped_text += (
            f'{sentence["start"]} {sentence["end"]} {sentence["text"]}\n'
        )
    return timestamped_text
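

# Example usage (a minimal sketch): "interview.wav" is a hypothetical local
# audio file; running this assumes a CUDA GPU plus the transformers, optimum,
# and ffmpeg dependencies the pipeline relies on.
if __name__ == "__main__":
    transcript = transcribe_audio("interview.wav", max_snt_len=120)
    print(transcript)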