from transformers import pipeline
import soundfile as sf
import numpy as np
import librosa
import gradio as gr
import torch
import tempfile

asr = pipeline("automatic-speech-recognition", model="distil-whisper/distil-small.en")  # speech-to-text model

tr = pipeline("translation", model="facebook/nllb-200-distilled-600M", torch_dtype=torch.bfloat16)  # text-translation model (bfloat16 halves memory use)

narrator = pipeline("text-to-speech", model="facebook/mms-tts-spa")  # Spanish text-to-speech
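
# Optional (an assumption, not part of the original script): `pipeline()` also
# takes a `device` argument, so the three models could be pinned to a GPU when
# one is available instead of the default CPU, e.g.:
#   device = 0 if torch.cuda.is_available() else -1
#   asr = pipeline("automatic-speech-recognition",
#                  model="distil-whisper/distil-small.en", device=device)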

demo = gr.Blocks()

def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return None
    audio, sampling_rate = sf.read(filepath)  # read the recorded or uploaded audio file

    # Collapse the signal to one dimension: stereo audio has two channels
    # (left and right), but the ASR model expects a single mono channel.
    audio_transposed = np.transpose(audio)  # (samples, channels) -> (channels, samples) for librosa
    audio_mono = librosa.to_mono(audio_transposed)

    # Resample to 16 kHz, the rate the ASR model was trained on
    audio_16KHz = librosa.resample(audio_mono,
                                   orig_sr=sampling_rate,
                                   target_sr=16000)
    output = asr(
        audio_16KHz,
        max_new_tokens=256,
        chunk_length_s=30,  # split long recordings into 30-second chunks
        batch_size=12,      # transcribe several chunks in parallel
    )

    text_translated = tr(output["text"],
                         src_lang="eng_Latn",
                         tgt_lang="spa_Latn")

    completed_translation = text_translated[0]['translation_text']
    narrated_text = narrator(completed_translation)

    # Save the narrated audio to a temporary .wav file so Gradio can serve it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        sf.write(tmpfile.name, narrated_text['audio'][0], narrated_text['sampling_rate'])  # audio comes back as a (1, n_samples) array
        return tmpfile.name
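
# Quick local sanity check (assumes a `sample.wav` recording exists next to
# this script; the filename is illustrative only):
#   print(transcribe_long_form("sample.wav"))  # prints the path of the Spanish .wav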

mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto",
)

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto",
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )
demo.launch()
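
# Note: `demo.launch()` serves the app on localhost only; Gradio's standard
# `share=True` flag (demo.launch(share=True)) would also expose a temporary
# public link.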