from transformers import pipeline
import soundfile as sf
import numpy as np
import librosa
import gradio as gr
import torch
import tempfile

asr = pipeline("automatic-speech-recognition", model="distil-whisper/distil-small.en")  # speech-to-text model

tr = pipeline("translation", model="facebook/nllb-200-distilled-600M", torch_dtype=torch.bfloat16)  # text-translation model (bfloat16 halves memory use)

narrator = pipeline("text-to-speech", model="facebook/mms-tts-spa")  # Spanish text-to-speech
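
# Optional (an assumption, not part of the original script): `pipeline()` also
# takes a `device` argument, so the three models could be pinned to a GPU when
# one is available instead of the default CPU, e.g.:
#   device = 0 if torch.cuda.is_available() else -1
#   asr = pipeline("automatic-speech-recognition",
#                  model="distil-whisper/distil-small.en", device=device)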

demo = gr.Blocks()

def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return None
    audio, sampling_rate = sf.read(filepath)  # read the recorded or uploaded audio file

    # Collapse the signal to one dimension: stereo audio has two channels
    # (left and right), but the ASR model expects a single mono channel.
    audio_transposed = np.transpose(audio)  # (samples, channels) -> (channels, samples) for librosa
    audio_mono = librosa.to_mono(audio_transposed)

    # Resample to 16 kHz, the rate the ASR model was trained on
    audio_16KHz = librosa.resample(audio_mono,
                                   orig_sr=sampling_rate,
                                   target_sr=16000)
    output = asr(
        audio_16KHz,
        max_new_tokens=256,
        chunk_length_s=30,  # split long recordings into 30-second chunks
        batch_size=12,      # transcribe several chunks in parallel
    )

    text_translated = tr(output["text"],
                         src_lang="eng_Latn",
                         tgt_lang="spa_Latn")

    completed_translation = text_translated[0]['translation_text']
    narrated_text = narrator(completed_translation)

    # Save the narrated audio to a temporary .wav file so Gradio can serve it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        sf.write(tmpfile.name, narrated_text['audio'][0], narrated_text['sampling_rate'])  # audio comes back as a (1, n_samples) array
        return tmpfile.name
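
# Quick local sanity check (assumes a `sample.wav` recording exists next to
# this script; the filename is illustrative only):
#   print(transcribe_long_form("sample.wav"))  # prints the path of the Spanish .wav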

mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto",
)

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto",
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )
demo.launch()
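
# Note: `demo.launch()` serves the app on localhost only; Gradio's standard
# `share=True` flag (demo.launch(share=True)) would also expose a temporary
# public link.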