import copy
import os
import time
import gradio as gr
import numpy as np
import torch
import torchaudio
from loguru import logger
from transformers import pipeline

transcriber = None


def load_stt():
    """Load the Whisper speech-to-text pipeline, using the GPU when available."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    transcriber = pipeline(
        "automatic-speech-recognition", model="openai/whisper-base.en", device=device
    )
    return transcriber


def save_audio_as_wav(data, sample_rate, file_path):
    """Write the recorded samples to `file_path` as a single-channel 16-bit PCM WAV file."""
    # make a (1, num_samples) tensor from the numpy array
    data = torch.tensor(data).reshape(1, -1)
    torchaudio.save(
        file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding="PCM_S"
    )


def transcribe_audio(audio):
    """Transcribe a Gradio audio tuple (sample_rate, data) with the Whisper pipeline."""
    global transcriber
    if transcriber is None:
        transcriber = load_stt()
    sample_rate, data = audio
    try:
        # Whisper expects float32 audio normalized to [-1, 1]
        data = data.astype(np.float32)
        data /= np.max(np.abs(data))
        text = transcriber({"sampling_rate": sample_rate, "raw": data})["text"]
        gr.Info(f"Transcribed text is: {text}\nProcessing the input...")
    except Exception as e:
        logger.error(f"Error: {e}")
        raise Exception("Error transcribing audio.") from e
    return text


def save_and_transcribe_audio(audio, save=True):
    """Optionally save the recording to disk, then return its transcription."""
    sample_rate, data = audio
    # add a timestamp to the file name so recordings are not overwritten
    filename = f"recordings/audio{time.time()}.wav"
    if save:
        # make sure the output directory exists before writing the file
        os.makedirs("recordings", exist_ok=True)
        save_audio_as_wav(data, sample_rate, filename)
    return transcribe_audio(audio)
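

# A minimal usage sketch (an assumption, not part of the original file): wiring
# save_and_transcribe_audio into a simple Gradio interface. The Audio component
# arguments follow current Gradio conventions and may need adjustment for your
# installed version.
if __name__ == "__main__":
    demo = gr.Interface(
        fn=save_and_transcribe_audio,
        inputs=gr.Audio(sources=["microphone"]),
        outputs="text",
    )
    demo.launch()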