anderbogia committed on
Commit
9cff099
·
1 Parent(s): 0cea3a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -36
app.py CHANGED
@@ -1,18 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
- """ASR MMS gradio space demo.ipynb
3
-
4
- Automatically generated by Colaboratory.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1TJE7dxiuXeb0nGmkc0AgFLFOnQx35ZXo
8
- """
9
-
10
- #!pip install transformers
11
- #!pip install gradio
12
  import os
13
  os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
14
- os.system("pip install torch accelerate torchaudio datasets")
15
- os.system("pip install librosa")
16
 
17
  #NumPy 1.24 or less needed by Numba
18
  os.system("pip install numpy==1.24.0")
@@ -21,6 +9,8 @@ import gradio as gr
21
  from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
22
  from datasets import load_dataset, Audio, Dataset
23
  import torch
 
 
24
 
25
  model_id = "facebook/mms-1b-all"
26
 
@@ -32,8 +22,6 @@ model.load_adapter("dtp")
32
 
33
  asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
34
 
35
- import librosa #For converting audio sample rate to 16k
36
-
37
  def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
38
  speech, sample_rate = librosa.load(input)
39
  speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
@@ -56,25 +44,58 @@ def run(input):
56
  transcription = processor.decode(ids)
57
  return transcription
58
 
59
- with gr.Blocks(theme = gr.themes.Soft()) as app:
60
- gr.Markdown(
61
- """
62
- # Ponutun tuturan Boros Kadazandusun | Kadazandusun speech recognition
63
- ### Winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS | Built by Ander © 2023 at Universiti Teknologi PETRONAS.
64
-
65
- **Somit tutun tuturan** do boros Kadazandusun ii ginuno nopo nga mantad totoodo *Massive Multilingual Speech* di Meta.
66
-
67
- Kadazandusun **automatic speech recognition model** used is from Meta's Massive Multilingual Speech project.
68
- """)
69
- fn = transcribe
70
- audiofile = gr.Audio(source = "microphone", type = "filepath", label = "Dusunai oku | Say something to me in Kadazandusun")
71
- transcription_show = gr.components.Textbox(label = "Dalinsuat | Transcription")
72
-
73
- allow_flagging = "never"
74
-
75
- button1 = gr.Button("Dalinsuato' | Transcribe")
76
- button1.click(fn, inputs = audiofile, outputs = transcription_show)
77
-
78
 
79
- if __name__ == "__main__":
80
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
3
+ os.system("pip install torch accelerate torchaudio datasets librosa easymms")
 
4
 
5
  #NumPy 1.24 or less needed by Numba
6
  os.system("pip install numpy==1.24.0")
 
9
  from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
10
  from datasets import load_dataset, Audio, Dataset
11
  import torch
12
+ import librosa #For converting audio sample rate to 16k
13
+ from easymms.models.tts import TTSModel #For TTS inference using EasyMMS
14
 
15
  model_id = "facebook/mms-1b-all"
16
 
 
22
 
23
  asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
24
 
 
 
25
  def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
26
  speech, sample_rate = librosa.load(input)
27
  speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
 
44
  transcription = processor.decode(ids)
45
  return transcription
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ with gr.Blocks(theme = gr.themes.Soft()) as demo:
49
+ gr.HTML(
50
+ """
51
+ <h1 align="center">Ponutun Tuturan om Pomorolou Sinuat Boros Dusun</h1>
52
+ <h5 align="center"> Poomitanan kopogunaan do somit tutun tuturan om pomorolou sinuat (speech recognition and text-to-speech models)
53
+ pinoluda' di Woyotanud Tuturan Gumukabang Tagayo di Meta (Meta Massive Multilingual Speech Project)</h5>
54
+ <h6 align = "center">Guguno (app) diti winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS</h6>
55
+ <style>
56
+ .container {
57
+ display: grid;
58
+ grid-template-columns:20% 5% 20%;
59
+ align-items: center;
60
+ }
61
+ </style>
62
+ <h6 align = "center">
63
+ <div class = "container">
64
+ <div class = "image"> <a href='https://github.com/andergisomon/dtp-nlp-demo'><img src='https://img.shields.io/badge/Github-Code-success'></a> </div>
65
+ <div class = "image"></div>
66
+ <div class = "image"> <a href='https://huggingface.co/spaces/anderbogia/dtp-asr-demo-v2/'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> </div>
67
+ </div></h6>
68
+ """)
69
+
70
+ tts = TTSModel('dtp')
71
+
72
+ def fn2(input):
73
+ res = tts.synthesize(input)
74
+ flip_tuple = (res[1], res[0]) #EasyMMS synthesize() returns Tuple(data, sample_rate) where data is a numpy.array and sample_rate is int,
75
+ #but Gradio Audio() expects the same tuple but with the elements flipped
76
+ return flip_tuple
77
+
78
+ with gr.Row():
79
+ with gr.Column(scale = 1):
80
+ gr.HTML("""<h1 align="center"><img src="https://andergisomon.github.io/dtp-nlp-demo/huminodun_dall_e.png", alt="Video-LLaMA" border="0" style="margin: 0 auto; height: 200px;" /></a></h1>""")
81
+
82
+ gr.Markdown("""
83
+ **Huminodun, nulai di somit pongulai kikito DALL-E**
84
+
85
+ *Huminodun, generated by the image generation model DALL-E*
86
+ """)
87
+ with gr.Column(scale = 4):
88
+ with gr.Tab("Rolou kumaa ginarit"):
89
+ #input = gr.components.Textbox(placeholder = "Potutakai suat nu hiti | Type something here")
90
+ input = gr.components.Audio(source = "microphone", label = "Gakamai rolou nu")
91
+ output = gr.components.Textbox(label = "Dalinsuat")
92
+ button1 = gr.Button("Dalinsuato' | Transcribe")
93
+ button1.click(run, inputs = input, outputs = output)
94
+
95
+ with gr.Tab("Ginarit kumaa rolou"):
96
+ input = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
97
+ button2 = gr.Button("Poulayo'")
98
+ output_speech = gr.components.Audio(label = "Rolou pinoulai")
99
+ button2.click(fn2, inputs = input, outputs = output_speech)
100
+
101
+ demo.launch(debug = True)