remove mic
app.py
CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-
+
 from transformers import Wav2Vec2FeatureExtractor
 from transformers import AutoModel
 import torch
@@ -18,7 +18,6 @@ import importlib
 modeling_MERT = importlib.import_module("MERT-v1-95M.modeling_MERT")
 
 from Prediction_Head.MTGGenre_head import MLPProberBase
-# input cr: https://huggingface.co/spaces/thealphhamerc/audio-to-text/blob/main/app.py
 
 
 logger = logging.getLogger("MERT-v1-95M-app")
@@ -34,13 +33,9 @@ inputs = [
     gr.components.Audio(type="filepath", label="Add music audio file"),
     gr.inputs.Audio(source="microphone", type="filepath"),
 ]
-live_inputs = [
-    gr.Audio(source="microphone",streaming=True, type="filepath"),
-]
 
 title = "One Model for All Music Understanding Tasks"
 description = "An example of using the [MERT-v1-95M](https://huggingface.co/m-a-p/MERT-v1-95M) model as backbone to conduct multiple music understanding tasks with the universal representation. \n Due the hardware limitation of the machine hosting this demo (2 CPU and 16GB RAM) only the first 4 seconds of audio are used!"
-# article = "The tasks include EMO, GS, MTGInstrument, MTGGenre, MTGTop50, MTGMood, NSynthI, NSynthP, VocalSetS, VocalSetT. \n\n More models can be referred at the [map organization page](https://huggingface.co/m-a-p)."
 with open('./README.md', 'r') as f:
     # skip the header
     header_count = 0
@@ -52,11 +47,6 @@ with open('./README.md', 'r') as f:
     # read the rest conent
     article = f.read()
 
-audio_examples = [
-    # ["input/example-1.wav"],
-    # ["input/example-2.wav"],
-]
-
 df_init = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3', 'Top 4', 'Top 5'])
 transcription_df = gr.DataFrame(value=df_init, label="Output Dataframe", row_count=(
     0, "dynamic"), max_rows=30, wrap=True, overflow_row_behaviour='paginate')
@@ -128,7 +118,7 @@ for task in TASKS:
     model.to(device)
 
 
-def model_infernce(inputs):
+def model_inference(inputs):
     waveform, sample_rate = torchaudio.load(inputs)
 
     resample_rate = processor.sampling_rate
@@ -194,50 +184,20 @@ def model_infernce(inputs):
 def convert_audio(inputs, microphone):
     if (microphone is not None):
         inputs = microphone
-    df = model_infernce(inputs)
+    df = model_inference(inputs)
     return df
 
-def live_convert_audio(microphone):
-    if (microphone is not None):
-        inputs = microphone
-    df = model_infernce(inputs)
-    return df
-
-audio_chunked = gr.Interface(
-    fn=convert_audio,
-    inputs=inputs,
-    outputs=outputs,
-    allow_flagging="never",
-    title=title,
-    description=description,
-    article=article,
-    examples=audio_examples,
-)
-
-live_audio_chunked = gr.Interface(
-    fn=live_convert_audio,
-    inputs=live_inputs,
-    outputs=outputs_live,
-    allow_flagging="never",
-    title=title,
-    description=description,
-    article=article,
-    # examples=audio_examples,
-    live=True,
-)
-
-
 demo = gr.Blocks()
 with demo:
-    gr.
-
-
-
-
-
-
-
-        ]
+    gr.Interface(
+        fn=convert_audio,
+        inputs=inputs,
+        outputs=outputs,
+        allow_flagging="never",
+        title=title,
+        description=description,
+        article=article,
     )
+
 # demo.queue(concurrency_count=1, max_size=5)
-demo.launch(
+demo.launch()
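
For reference, after this commit the Space's UI reduces to a single gr.Interface rendered inside a gr.Blocks container, with the live/streaming microphone tab removed and the model_infernce typo fixed to model_inference. Below is a minimal runnable sketch of that wiring, assuming the Gradio 3.x API this file targets; the stub convert_audio stands in for the real function, which runs MERT feature extraction and the per-task probe heads.

import gradio as gr
import pandas as pd

# Stub standing in for the Space's model_inference(): the real function loads
# the audio, resamples it to the MERT processor's rate, and fills one row of
# top-5 predictions per task.
def convert_audio(audio_path, microphone=None):
    if microphone is not None:
        audio_path = microphone
    return pd.DataFrame(
        [["MTGGenre", "-", "-", "-", "-", "-"]],
        columns=["Task", "Top 1", "Top 2", "Top 3", "Top 4", "Top 5"],
    )

demo = gr.Blocks()
with demo:
    # A single Interface now handles both file upload and microphone input;
    # the separate live/streaming interface no longer exists.
    gr.Interface(
        fn=convert_audio,
        inputs=[
            gr.Audio(type="filepath", label="Add music audio file"),
            gr.Audio(source="microphone", type="filepath"),
        ],
        outputs=gr.DataFrame(label="Output Dataframe"),
        allow_flagging="never",
    )

if __name__ == "__main__":
    demo.launch()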