Spaces:
Running
Running
Commit
·
9cff099
1
Parent(s):
0cea3a7
Update app.py
Browse files
app.py
CHANGED
@@ -1,18 +1,6 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
"""ASR MMS gradio space demo.ipynb
|
3 |
-
|
4 |
-
Automatically generated by Colaboratory.
|
5 |
-
|
6 |
-
Original file is located at
|
7 |
-
https://colab.research.google.com/drive/1TJE7dxiuXeb0nGmkc0AgFLFOnQx35ZXo
|
8 |
-
"""
|
9 |
-
|
10 |
-
#!pip install transformers
|
11 |
-
#!pip install gradio
|
12 |
import os
|
13 |
os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
|
14 |
-
os.system("pip install torch accelerate torchaudio datasets")
|
15 |
-
os.system("pip install librosa")
|
16 |
|
17 |
#NumPy 1.24 or less needed by Numba
|
18 |
os.system("pip install numpy==1.24.0")
|
@@ -21,6 +9,8 @@ import gradio as gr
|
|
21 |
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
|
22 |
from datasets import load_dataset, Audio, Dataset
|
23 |
import torch
|
|
|
|
|
24 |
|
25 |
model_id = "facebook/mms-1b-all"
|
26 |
|
@@ -32,8 +22,6 @@ model.load_adapter("dtp")
|
|
32 |
|
33 |
asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
|
34 |
|
35 |
-
import librosa #For converting audio sample rate to 16k
|
36 |
-
|
37 |
def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
|
38 |
speech, sample_rate = librosa.load(input)
|
39 |
speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
|
@@ -56,25 +44,58 @@ def run(input):
|
|
56 |
transcription = processor.decode(ids)
|
57 |
return transcription
|
58 |
|
59 |
-
with gr.Blocks(theme = gr.themes.Soft()) as app:
|
60 |
-
gr.Markdown(
|
61 |
-
"""
|
62 |
-
# Ponutun tuturan Boros Kadazandusun | Kadazandusun speech recognition
|
63 |
-
### Winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS | Built by Ander © 2023 at Universiti Teknologi PETRONAS.
|
64 |
-
|
65 |
-
**Somit tutun tuturan** do boros Kadazandusun ii ginuno nopo nga mantad totoodo *Massive Multilingual Speech* di Meta.
|
66 |
-
|
67 |
-
Kadazandusun **automatic speech recognition model** used is from Meta's Massive Multilingual Speech project.
|
68 |
-
""")
|
69 |
-
fn = transcribe
|
70 |
-
audiofile = gr.Audio(source = "microphone", type = "filepath", label = "Dusunai oku | Say something to me in Kadazandusun")
|
71 |
-
transcription_show = gr.components.Textbox(label = "Dalinsuat | Transcription")
|
72 |
-
|
73 |
-
allow_flagging = "never"
|
74 |
-
|
75 |
-
button1 = gr.Button("Dalinsuato' | Transcribe")
|
76 |
-
button1.click(fn, inputs = audiofile, outputs = transcription_show)
|
77 |
-
|
78 |
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
|
3 |
+
os.system("pip install torch accelerate torchaudio datasets librosa easymms")
|
|
|
4 |
|
5 |
#NumPy 1.24 or less needed by Numba
|
6 |
os.system("pip install numpy==1.24.0")
|
|
|
9 |
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
|
10 |
from datasets import load_dataset, Audio, Dataset
|
11 |
import torch
|
12 |
+
import librosa #For converting audio sample rate to 16k
|
13 |
+
from easymms.models.tts import TTSModel #For TTS inference using EasyMMS
|
14 |
|
15 |
model_id = "facebook/mms-1b-all"
|
16 |
|
|
|
22 |
|
23 |
asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
|
24 |
|
|
|
|
|
25 |
def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
|
26 |
speech, sample_rate = librosa.load(input)
|
27 |
speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
|
|
|
44 |
transcription = processor.decode(ids)
|
45 |
return transcription
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
+
with gr.Blocks(theme = gr.themes.Soft()) as demo:
|
49 |
+
gr.HTML(
|
50 |
+
"""
|
51 |
+
<h1 align="center">Ponutun Tuturan om Pomorolou Sinuat Boros Dusun</h1>
|
52 |
+
<h5 align="center"> Poomitanan kopogunaan do somit tutun tuturan om pomorolou sinuat (speech recognition and text-to-speech models)
|
53 |
+
pinoluda' di Woyotanud Tuturan Gumukabang Tagayo di Meta (Meta Massive Multilingual Speech Project)</h5>
|
54 |
+
<h6 align = "center">Guguno (app) diti winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS</h6>
|
55 |
+
<style>
|
56 |
+
.container {
|
57 |
+
display: grid;
|
58 |
+
grid-template-columns:20% 5% 20%;
|
59 |
+
align-items: center;
|
60 |
+
}
|
61 |
+
</style>
|
62 |
+
<h6 align = "center">
|
63 |
+
<div class = "container">
|
64 |
+
<div class = "image"> <a href='https://github.com/andergisomon/dtp-nlp-demo'><img src='https://img.shields.io/badge/Github-Code-success'></a> </div>
|
65 |
+
<div class = "image"></div>
|
66 |
+
<div class = "image"> <a href='https://huggingface.co/spaces/anderbogia/dtp-asr-demo-v2/'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> </div>
|
67 |
+
</div></h6>
|
68 |
+
""")
|
69 |
+
|
70 |
+
tts = TTSModel('dtp')
|
71 |
+
|
72 |
+
def fn2(input):
|
73 |
+
res = tts.synthesize(input)
|
74 |
+
flip_tuple = (res[1], res[0]) #EasyMMS synthesize() returns Tuple(data, sample_rate) where data is a numpy.array and sample_rate is int,
|
75 |
+
#but Gradio Audio() expects the same tuple but with the elements flipped
|
76 |
+
return flip_tuple
|
77 |
+
|
78 |
+
with gr.Row():
|
79 |
+
with gr.Column(scale = 1):
|
80 |
+
gr.HTML("""<h1 align="center"><img src="https://andergisomon.github.io/dtp-nlp-demo/huminodun_dall_e.png", alt="Video-LLaMA" border="0" style="margin: 0 auto; height: 200px;" /></a></h1>""")
|
81 |
+
|
82 |
+
gr.Markdown("""
|
83 |
+
**Huminodun, nulai di somit pongulai kikito DALL-E**
|
84 |
+
|
85 |
+
*Huminodun, generated by the image generation model DALL-E*
|
86 |
+
""")
|
87 |
+
with gr.Column(scale = 4):
|
88 |
+
with gr.Tab("Rolou kumaa ginarit"):
|
89 |
+
#input = gr.components.Textbox(placeholder = "Potutakai suat nu hiti | Type something here")
|
90 |
+
input = gr.components.Audio(source = "microphone", label = "Gakamai rolou nu")
|
91 |
+
output = gr.components.Textbox(label = "Dalinsuat")
|
92 |
+
button1 = gr.Button("Dalinsuato' | Transcribe")
|
93 |
+
button1.click(run, inputs = input, outputs = output)
|
94 |
+
|
95 |
+
with gr.Tab("Ginarit kumaa rolou"):
|
96 |
+
input = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
|
97 |
+
button2 = gr.Button("Poulayo'")
|
98 |
+
output_speech = gr.components.Audio(label = "Rolou pinoulai")
|
99 |
+
button2.click(fn2, inputs = input, outputs = output_speech)
|
100 |
+
|
101 |
+
demo.launch(debug = True)
|