# youtube_summarizer / audio_to_text.py
# NOTE(review): the following lines are Hugging Face Space page residue
# (author avatar caption, commit message, commit hash), not Python source:
#   ammansik's picture / "fix last bugs" / db1b155
import torch
from transformers import pipeline
# Characters treated as sentence-final punctuation when grouping ASR chunks
# into sentences (see transcribe_audio below).
EOS_TOKENS = [".", "!", "?"]
def transcribe_audio(audio_fpath, max_snt_len=100):
    """Transcribe an audio file with Whisper and return timestamped sentences.

    Parameters
    ----------
    audio_fpath : str
        Path to the audio file to transcribe.
    max_snt_len : int, optional
        Maximum accumulated character count before a sentence is force-split
        even without sentence-final punctuation (default 100).

    Returns
    -------
    str
        One line per sentence, formatted as "<start> <end> <text>\n".

    NOTE(review): requires a CUDA device ("cuda:0") — confirm this is the
    intended deployment target.
    """
    pipe = pipeline(
        "automatic-speech-recognition",
        "openai/whisper-small",
        torch_dtype=torch.float16,
        device="cuda:0",
    )
    # Enable the BetterTransformer fast path for inference.
    pipe.model = pipe.model.to_bettertransformer()
    outputs = pipe(
        audio_fpath,
        chunk_length_s=30,
        batch_size=8,
        return_timestamps=True,
    )
    sentences = _segment_sentences(outputs["chunks"], max_snt_len)
    # Build the output in one pass instead of quadratic += concatenation.
    return "".join(
        f'{s["start"]} {s["end"]} {s["text"]}\n' for s in sentences
    )


def _segment_sentences(chunks, max_snt_len):
    """Group timestamped ASR chunks into sentence dicts.

    Each chunk is expected to be a dict with "text" and "timestamp"
    (a (start, end) pair) keys, as produced by the transformers ASR
    pipeline with return_timestamps=True. Returns a list of dicts with
    "text", "start" and "end" keys. A sentence ends when a chunk's last
    character is sentence-final punctuation (EOS_TOKENS) or the
    accumulated text exceeds max_snt_len characters.
    """
    sentences = []
    snt_start = None
    snt = ""
    end_time = None  # stays None only if there are no chunks at all
    for segment in chunks:
        snt += f'{segment["text"]} '
        start_time, end_time = segment["timestamp"]
        if end_time is None:
            # Whisper sometimes omits the final end timestamp; fabricate one.
            end_time = start_time + 0.1
        # Bug fix: compare against None explicitly — a 0.0 start timestamp is
        # falsy, so the old truthiness test kept overwriting the sentence
        # start until a non-zero timestamp appeared.
        if snt_start is None:
            snt_start = start_time
        stripped = segment["text"].strip()
        # Bug fix: guard against empty/whitespace-only chunks; the old
        # split()[-1][-1] raised IndexError on them.
        ends_sentence = bool(stripped) and stripped[-1] in EOS_TOKENS
        if ends_sentence or len(snt) > max_snt_len:
            sentences.append(
                {"text": snt.strip(), "start": snt_start, "end": end_time}
            )
            snt_start = None
            snt = ""
    # Flush trailing text that never hit sentence-final punctuation.
    # (Dead resets of snt/snt_start after this point were removed.)
    if snt.strip():
        sentences.append(
            {"text": snt.strip(), "start": snt_start, "end": end_time}
        )
    return sentences