from transformers import pipeline
import soundfile as sf
import numpy as np
import librosa
import gradio as gr
import torch
import tempfile
# Speech-to-text: distilled Whisper, English-only
asr = pipeline("automatic-speech-recognition", model="distil-whisper/distil-small.en")
# Text translation: NLLB-200, used here for English -> Spanish
tr = pipeline("translation", model="facebook/nllb-200-distilled-600M", torch_dtype=torch.bfloat16)
# Text-to-speech: MMS Spanish voice
narrator = pipeline("text-to-speech", model="facebook/mms-tts-spa")
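# Quick sanity check of the translation stage (hypothetical snippet, not part
# of the app; assumes the pipelines above loaded successfully):
#   sample = tr("Hello, how are you?", src_lang="eng_Latn", tgt_lang="spa_Latn")
#   print(sample[0]["translation_text"])  # a Spanish rendering of the input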
demo = gr.Blocks()
def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return None  # the gr.Audio output expects a filepath (or None), not ""
    # Read the recorded/uploaded .wav file
    audio, sampling_rate = sf.read(filepath)
    # Collapse stereo (left/right channels) to mono; librosa.to_mono expects
    # channels on the first axis, hence the transpose
    audio_transposed = np.transpose(audio)
    audio_mono = librosa.to_mono(audio_transposed)
    # Resample to the 16 kHz rate the Whisper model expects
    audio_16KHz = librosa.resample(audio_mono,
                                   orig_sr=sampling_rate,
                                   target_sr=16000)
    # Transcribe in 30-second chunks so long recordings fit the model's context
    output = asr(
        audio_16KHz,
        max_new_tokens=256,
        chunk_length_s=30,
        batch_size=12,
    )
    # Translate English -> Spanish (NLLB uses FLORES-200 language codes)
    text_translated = tr(output["text"],
                         src_lang="eng_Latn",
                         tgt_lang="spa_Latn")
    completed_translation = text_translated[0]['translation_text']
    # Synthesize Spanish speech; the TTS pipeline returns a dict with
    # 'audio' (shape (1, num_samples)) and 'sampling_rate'
    narrated_text = narrator(completed_translation)
    # Save the narrated audio to a temporary file that Gradio can serve
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        sf.write(tmpfile.name, narrated_text['audio'][0], narrated_text['sampling_rate'])
    return tmpfile.name
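# Example of calling the function directly (hypothetical local test; assumes a
# recording named sample.wav exists next to this script):
#   out_path = transcribe_long_form("sample.wav")
#   print(out_path)  # path to a temporary WAV holding the Spanish narration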
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto")

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto",
)
# Two tabs sharing the same processing function: live microphone input
# and uploaded audio files
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

demo.launch()