Spaces:
Sleeping
Sleeping
File size: 1,489 Bytes
82b41e0 e3d3533 82b41e0 e3d3533 88fe0f5 e3d3533 82b41e0 db1b155 e3d3533 82b41e0 e3d3533 82b41e0 e3d3533 82b41e0 e3d3533 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import torch
from transformers import pipeline
# Characters that close a sentence: transcribe_audio ends the current sentence
# when a transcribed chunk's text finishes with one of these.
EOS_TOKENS = [".", "!", "?"]
def _group_chunks_into_sentences(chunks, max_snt_len):
    """Merge consecutive ASR chunks into sentence-like records.

    Args:
        chunks: Iterable of dicts, each with a ``"text"`` string and a
            ``"timestamp"`` pair ``(start, end)``; ``end`` may be ``None``.
        max_snt_len: Length threshold (in characters, counting one separator
            per chunk) above which the current sentence is closed even
            without end-of-sentence punctuation.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts in input order.
    """
    eos_suffixes = tuple(EOS_TOKENS)
    sentences = []
    parts = []
    snt_len = 0
    snt_start = None
    end_time = None

    for segment in chunks:
        text = segment["text"]
        stripped = text.strip()
        if not stripped:
            # A blank chunk carries no words; skipping it also avoids
            # indexing into an empty string when looking for punctuation.
            continue

        start_time, end_time = segment["timestamp"]
        if end_time is None:
            # No end timestamp was provided for this chunk: give it a
            # nominal 0.1 s duration so the record stays usable.
            end_time = start_time + 0.1

        # Compare against None explicitly: a start of 0.0 is a valid
        # timestamp and must not be treated as "not set yet".
        if snt_start is None:
            snt_start = start_time

        parts.append(text)
        snt_len += len(text) + 1  # +1 for the separating space

        if stripped.endswith(eos_suffixes) or snt_len > max_snt_len:
            sentences.append(
                {
                    "text": " ".join(parts).strip(),
                    "start": snt_start,
                    "end": end_time,
                }
            )
            parts = []
            snt_len = 0
            snt_start = None

    # Flush whatever is left when the input does not end on punctuation.
    if parts:
        sentences.append(
            {
                "text": " ".join(parts).strip(),
                "start": snt_start,
                "end": end_time,
            }
        )
    return sentences


def transcribe_audio(audio_fpath, max_snt_len: int = 100) -> str:
    """Transcribe an audio file and return timestamped sentences as text.

    Builds an ``automatic-speech-recognition`` pipeline for
    ``openai/whisper-small`` in float16 on ``cuda:0`` (so a CUDA device is
    required), runs it over the audio in 30-second chunks with timestamps,
    and groups the returned chunks into sentences.

    Args:
        audio_fpath: Audio input handed straight to the pipeline
            (presumably a file path -- confirm against callers).
        max_snt_len: Character threshold above which a sentence is closed
            even if no end-of-sentence punctuation was seen.

    Returns:
        One line per sentence, formatted ``"<start> <end> <text>\n"``.
        An empty string if the pipeline returned no usable chunks.
    """
    # NOTE(review): the pipeline is rebuilt on every call; consider caching
    # it at the call site if this function is invoked repeatedly.
    pipe = pipeline(
        "automatic-speech-recognition",
        "openai/whisper-small",
        torch_dtype=torch.float16,
        device="cuda:0",
    )
    pipe.model = pipe.model.to_bettertransformer()

    outputs = pipe(
        audio_fpath,
        chunk_length_s=30,
        batch_size=8,
        return_timestamps=True,
    )

    sentences = _group_chunks_into_sentences(outputs["chunks"], max_snt_len)
    return "".join(
        f'{sentence["start"]} {sentence["end"]} {sentence["text"]}\n'
        for sentence in sentences
    )
|