import os
os.system("pip install numpy==1.23.0") #NumPy 1.24 or less needed by Numba. Use 1.23, librosa still uses np.complex which was dropped in NumPy 1.24
os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
os.system("pip install torch accelerate torchaudio datasets librosa easymms")

import gradio as gr
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
from datasets import load_dataset, Audio, Dataset
import torch
import librosa #For converting audio sample rate to 16k
from easymms.models.tts import TTSModel #For TTS inference using EasyMMS

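#facebook/mms-1b-all is Meta's MMS multilingual Wav2Vec2-CTC checkpoint; it covers 1,000+ languages
#via per-language adapters, so set_target_lang() plus load_adapter() below switch it to Kadazandusun (dtp)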
model_id = "facebook/mms-1b-all"

#Set target language to dtp (Kadazandusun)
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
processor.tokenizer.set_target_lang("dtp") #Change dtp to tih for Timugon Murut or iba for Iban 
model.load_adapter("dtp")

asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Unused alternative to run() below; the pipeline returns a dict with the transcription under the "text" key

def preprocess(input): #Loads the recording from its file path, resamples it to 16 kHz, and returns the audio as a numpy ndarray
    speech, sample_rate = librosa.load(input)
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    return speech

def transcribe(input): #Gradio UI wrapper function: preprocess the recording, then run inference on it
    audioarray = preprocess(input) #Call preprocessor function
    out = run(audioarray)
    return out
    #Alternative using the Hugging Face pipeline instead of calling the model directly:
    #transcription = asr_pipeline(audioarray)
    #return transcription["text"]

def run(input):
    inputs = processor(input, sampling_rate=16_000, return_tensors="pt")#.to("cuda")
    with torch.no_grad():
        outputs = model(**inputs).logits
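    #Greedy CTC decoding: pick the highest-scoring token at each frame; processor.decode() collapses repeated tokens and blanks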
    ids = torch.argmax(outputs, dim=-1)[0]
    transcription = processor.decode(ids)
    return transcription


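#Gradio UI: one tab for speech-to-text ("Rolou kumaa ginarit") and one for text-to-speech ("Ginarit kumaa rolou")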
with gr.Blocks(theme = gr.themes.Soft()) as demo:
    gr.HTML(
        """
<h1 align="center">Ponutun Tuturan om Pomorolou Sinuat Boros Dusun</h1>
<h5 align="center">  Poomitanan kopogunaan do somit tutun tuturan om pomorolou sinuat (speech recognition and text-to-speech models)
  pinoluda' di Woyotanud Tuturan Gumukabang Tagayo di Meta (Meta Massive Multilingual Speech Project)</h5>
<h6 align = "center">Guguno (app) diti winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS</h6>
<style>
    .container {
      display: grid;
      grid-template-columns:20% 5% 20%;
      align-items: center;
    }
</style>
<h6 align = "center">
<div class = "container">
    <div class = "image"> <a href='https://github.com/andergisomon/dtp-nlp-demo'><img src='https://img.shields.io/badge/Github-Code-success'></a> </div>
    <div class = "image"></div>
    <div class = "image"> <a href='https://huggingface.co/spaces/anderbogia/dtp-asr-demo-v2/'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> </div>
</div></h6>
        """)

    tts = TTSModel('dtp')
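    #easymms is a wrapper around Meta's MMS text-to-speech models; TTSModel('dtp') selects the Kadazandusun voice (the checkpoint is expected to be downloaded on first use)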

    def fn2(input):
        res = tts.synthesize(input)
        flip_tuple = (res[1], res[0]) #easymms synthesize() returns (data, sample_rate), where data is a numpy array and sample_rate is an int,
                                      #but the Gradio Audio component expects (sample_rate, data), so the elements are flipped here
        return flip_tuple

    with gr.Row():
      with gr.Column(scale = 1):
          gr.HTML("""<h1 align="center"><img src="https://andergisomon.github.io/dtp-nlp-demo/huminodun_dall_e.png", alt="Video-LLaMA" border="0" style="margin: 0 auto; height: 200px;" /></a></h1>""")

          gr.Markdown("""
          **Huminodun, nulai di somit pongulai kikito DALL-E**

          *Huminodun, generated by the image generation model DALL-E*
          """)
      with gr.Column(scale = 4):
          with gr.Tab("Rolou kumaa ginarit"):
              #input = gr.components.Textbox(placeholder = "Potutakai suat nu hiti | Type something here")
              input = gr.components.Audio(source = "microphone", type = "filepath", label = "Gakamai rolou nu") #filepath so that preprocess() can load the recording with librosa
              output = gr.components.Textbox(label = "Dalinsuat")
              button1 = gr.Button("Dalinsuato' | Transcribe")
              button1.click(transcribe, inputs = input, outputs = output) #transcribe() resamples the recording to 16 kHz before calling run()

          with gr.Tab("Ginarit kumaa rolou"):
              input = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
              button2 = gr.Button("Poulayo'")
              output_speech = gr.components.Audio(label = "Rolou pinoulai")
              button2.click(fn2, inputs = input, outputs = output_speech)

demo.launch(debug = True)