Update app.py
app.py
CHANGED
@@ -15,17 +15,26 @@ from collections import Counter
 from scipy.stats import kurtosis
 from huggingface_hub import InferenceClient
 import os
+import time
 
+'''
+Predictor
+'''
+
+# Get the token used to download the model:
 access_token_mod_1 = os.getenv('HF_Access_Personal')
 
-#
+# Load the processor and the model:
 processor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
 model = AutoModelForAudioClassification.from_pretrained("Robertomarting/tmp_trainer",token=access_token_mod_1)
 
+# Function that discards audio segments that are mostly white noise:
 def is_white_noise(audio, threshold=0.75):
     kurt = kurtosis(audio)
     return np.abs(kurt) < 0.1 and np.mean(np.abs(audio)) < threshold
 
+# Audio-processing function: splits the audio into 1-second fragments, trims it, converts it to mono
+# if it is stereo, resamples it to the sampling rate the model accepts, etc.
 def process_audio(audio_tuple, target_sr=16000, target_duration=1.0):
     data = []
     target_length = int(target_sr * target_duration)
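A note on the heuristic above: `scipy.stats.kurtosis` returns Fisher (excess) kurtosis, which is close to 0 for Gaussian noise, while a pure tone sits near -1.5, so the check only discards flat, noise-like segments. A standalone sanity check on synthetic signals (illustrative only, not part of this change):

```python
import numpy as np
from scipy.stats import kurtosis

rng = np.random.default_rng(0)
noise = rng.normal(0, 0.1, 16000)                                  # excess kurtosis ~ 0
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)    # excess kurtosis ~ -1.5

for name, signal in [("noise", noise), ("tone", tone)]:
    kurt = kurtosis(signal)
    flagged = np.abs(kurt) < 0.1 and np.mean(np.abs(signal)) < 0.75
    print(name, round(float(kurt), 3), flagged)   # noise -> True, tone -> False
```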
@@ -57,6 +66,7 @@ def process_audio(audio_tuple, target_sr=16000, target_duration=1.0):
 
     return data
 
+# Apply the model's feature extractor:
 def preprocess_audio(audio_segments):
     inputs = processor(
         audio_segments,
@@ -64,32 +74,26 @@ def preprocess_audio(audio_segments):
         sampling_rate=processor.sampling_rate,
         max_length=int(processor.sampling_rate * 1),
         truncation=True,
-        return_tensors="pt"
+        return_tensors="pt"
     )
     return inputs
 
+# Run the prediction for each audio clip:
 def predict_audio(audio):
-    # Process the audio and get the numpy lists
-    audio_segments = process_audio(audio)
 
-
+    audio_segments = process_audio(audio)
+
     inputs = preprocess_audio(audio_segments)
 
-    # Run the predictions
     with torch.no_grad():
         outputs = model(**inputs)
 
-    # Get the logits of the predictions
     logits = outputs.logits
-
-    # Convert the logits to probabilities
     probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
     predicted_classes = probabilities.argmax(axis=1)
 
-    # Get the most common label
     most_common_predicted_label = Counter(predicted_classes).most_common(1)[0][0]
-
-    # Map the numeric labels to text labels
+
     replace_dict = {0: 'Hambre', 1: 'Problemas para respirar', 2: 'Dolor', 3: 'Cansancio/Incomodidad'}
     most_common_predicted_label = replace_dict[most_common_predicted_label]
 
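The tail of `predict_audio` is a majority vote: each 1-second segment gets an argmax class, `Counter` picks the most frequent one, and `replace_dict` maps it to a text label. With hypothetical per-segment predictions:

```python
from collections import Counter

predicted_classes = [0, 2, 0, 0, 3]   # hypothetical argmax class per 1-second segment
winner = Counter(predicted_classes).most_common(1)[0][0]   # -> 0 (three votes)
replace_dict = {0: 'Hambre', 1: 'Problemas para respirar', 2: 'Dolor', 3: 'Cansancio/Incomodidad'}
print(replace_dict[winner])   # 'Hambre'
```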
@@ -98,10 +102,93 @@ def predict_audio(audio):
 def clear_audio_input(audio):
     return ""
 
+'''
+Monitor
+'''
+
+# Get the feature extractor:
+FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
+# And our model:
+model_monitor = HubertForSequenceClassification.from_pretrained("A-POR-LOS-8000/distilhubert-finetuned-cry-detector")
+
+# Compute the decibel level of the audio that reaches Gradio:
+def compute_db(audio_data):
+    rms = np.sqrt(np.mean(np.square(audio_data)))
+    db = 20 * np.log10(rms + 1e-6)
+    return db
+
+# Feature-extraction function for the monitor:
+def preprocess_audio_monitor(audio_segments):
+    inputs = FEATURE_EXTRACTOR(
+        audio_segments,
+        padding=True,
+        sampling_rate=16000,
+        max_length=int(16000*1),
+        return_tensors="pt"
+    )
+    return inputs
+
+# Streaming prediction function:
+def predict_audio_stream(audio_data, sample_rate):
+
+    audio_segments = process_audio(audio_data)
+    inputs = preprocess_audio_monitor(audio_segments)
+
+    with torch.no_grad():
+        outputs = model_monitor(**inputs)
+
+    logits = outputs.logits
+    probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
+    crying_probabilities = probabilities[:, 1]
+    avg_crying_probability = crying_probabilities.mean()
+
+    if avg_crying_probability < 0.15:
+        return "Está llorando", avg_crying_probability
+    else:
+        return "No está llorando", avg_crying_probability
+
+# Function that runs the prediction
+def continuous_prediction_with_status(audio, sample_rate=16000, duration=3):
+    audio_segments = []
+    start_time = time.time()
+
+    max_samples = sample_rate * duration
+    audio_data = audio[:max_samples]
+
+    result = predict_audio_stream(audio_data, sample_rate)
+
+    return result
+
+def capture_and_predict(audio, sample_rate=16000, duration=5):
+    max_samples = sample_rate * duration
+    audio_data = audio[:max_samples]
+
+    result, probabilidad = predict_audio_stream(audio_data, sample_rate)
+    return f"Predicción: {result}, Probabilidad: {probabilidad:.2f}", probabilidad
+
+# Function that tells the user whether the threshold has been crossed:
+def update_status_to_predicting(audio, visual_threshold):
+    sample_rate, audio_data = audio
+    audio_data = np.array(audio_data, dtype=np.float32)
+
+    db_level = compute_db(audio_data)
+
+    if db_level < visual_threshold:
+        return f"Esperando... Decibelios: {db_level}"
+    else:
+        return f"Prediciendo... Decibelios: {db_level}"
+
+'''
+Assistant
+'''
+
+# Fetch the token:
 access_token = os.getenv('HF_ACCESS_TOKEN')
 
+# Create the client:
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=access_token)
 
+# Define a response function:
 def respond(
     message,
     history: list[tuple[str, str]],
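`compute_db` is a plain RMS-to-decibel conversion, `20 * log10(rms + 1e-6)`, where the `1e-6` floor avoids `log10(0)` on silence. The 0-200 range of the slider added below is consistent with Gradio's microphone stream, which typically delivers raw int16-scale samples rather than a normalized signal; a worked example (values are illustrative):

```python
import numpy as np

def compute_db(audio_data):
    rms = np.sqrt(np.mean(np.square(audio_data)))
    return 20 * np.log10(rms + 1e-6)

int16_scale = np.array([1000.0, -1000.0, 1000.0, -1000.0])   # raw microphone-scale samples
print(compute_db(int16_scale))      # 20*log10(1000) = 60.0 -> above the default threshold of 20

normalized = int16_scale / 32768.0  # the same chunk scaled to [-1, 1]
print(compute_db(normalized))       # ~ -30.3 -> would never cross the slider range
```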
@@ -133,6 +220,11 @@ def respond(
         response += token
         yield response
 
+'''
+Interface
+'''
+
+# Create a theme with custom parameters:
 my_theme = gr.themes.Soft(
     primary_hue="emerald",
     secondary_hue="green",
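The `response += token; yield response` tail above is the usual `InferenceClient` streaming pattern: each chunk carries a token delta, and the growing string is re-yielded so the UI re-renders the message as it arrives. A minimal sketch under that assumption (`stream_reply` is an illustrative name, not part of the diff):

```python
from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token="hf_...")  # placeholder token

def stream_reply(messages, max_tokens=512, temperature=0.7, top_p=0.95):
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content or ""
        response += token
        yield response   # yield the accumulated text, not just the new token
```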
@@ -150,54 +242,19 @@ my_theme = gr.themes.Soft(
     shadow_spread='*button_shadow_active'
 )
 
-#
+# Function that shows the Predictor page
 def mostrar_pagina_1():
     return gr.update(visible=False), gr.update(visible=True)
 
-#
+# Function that shows the Monitor page
 def mostrar_pagina_2():
     return gr.update(visible=False), gr.update(visible=True)
 
-#
+# Function that returns to the home screen
 def redirigir_a_pantalla_inicial():
     return gr.update(visible=True), gr.update(visible=False)
 
-
-
-#processor = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
-
-#monitor_model = HubertForSequenceClassification.from_pretrained("A-POR-LOS-8000/distilhubert-finetuned-cry-detector",use_auth_token=access_token_mod_1)
-
-pipeline_monitor = pipeline(model="Robertomarting/tmp_trainer",token=access_token_mod_1,feature_extractor=processor)
-
-def predict_monitor(stream, new_chunk):
-    sr, y = new_chunk
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    return stream, pipeline_monitor(stream)
-
-my_theme = gr.themes.Soft(
-    primary_hue="emerald",
-    secondary_hue="green",
-    neutral_hue="slate",
-    text_size="sm",
-    spacing_size="sm",
-    font=[gr.themes.GoogleFont('Nunito'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
-    font_mono=[gr.themes.GoogleFont('Nunito'), 'ui-monospace', 'Consolas', 'monospace'],
-).set(
-    body_background_fill='*neutral_50',
-    body_text_color='*neutral_600',
-    body_text_size='*text_sm',
-    embed_radius='*radius_md',
-    shadow_drop='*shadow_spread',
-    shadow_spread='*button_shadow_active'
-)
-
+# Build the Gradio app:
 with gr.Blocks(theme = my_theme) as demo:
 
     with gr.Column() as pantalla_inicial:
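The three navigation helpers above all follow the same pattern: return one `gr.update` per output container, toggling the `visible` flags so exactly one "page" is shown at a time. A self-contained sketch of the mechanism:

```python
import gradio as gr

def show_page():
    # Hide the home column, show the target page.
    return gr.update(visible=False), gr.update(visible=True)

with gr.Blocks() as demo:
    with gr.Column() as home:
        open_btn = gr.Button("Open page")
    with gr.Column(visible=False) as page:
        gr.Markdown("Page content")
    open_btn.click(show_page, inputs=None, outputs=[home, page])

demo.launch()
```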
@@ -261,7 +318,7 @@ with gr.Blocks(theme = my_theme) as demo:
 
         with gr.Column():
             gr.Markdown("<h2>Assistant</h2>")
-            system_message = "
+            system_message = "Eres un chatbot especializado en el cuidado y la salud de los bebés. Estás dispuesto a ayudar amablemente a cualquier padre que tenga dudas o preocupaciones sobre su hijo o hija."
             max_tokens = 512
             temperature = 0.7
             top_p = 0.95
@@ -282,23 +339,30 @@
     boton_volver_inicio_1.click(redirigir_a_pantalla_inicial, inputs=None, outputs=[pantalla_inicial, pagina_1])
 
     with gr.Column(visible=False) as pagina_2:
+
         gr.Markdown("<h2>Monitor</h2>")
-        gr.Markdown("
+        gr.Markdown("<h4 style='text-align: center; font-size: 1.5em'>Detección en tiempo real del llanto del bebé</h4>")
 
-
-        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Habla cerca del micrófono")
-
-        # Text output where the prediction is shown
-        output_text = gr.Textbox(label="Resultado de la predicción")
-
-        # Link the streaming prediction to the audio
-        audio_input.stream(fn=lambda audio: predict_monitor(audio, audio_classifier),
-                           inputs=audio_input,
-                           outputs=output_text)
+        audio_stream = gr.Audio(sources=["microphone"], streaming=True)
 
-
-        boton_volver_inicio_2.click(redirigir_a_pantalla_inicial, inputs=None, outputs=[pantalla_inicial, pagina_2])
+        threshold_db = gr.Slider(minimum=0, maximum=200, step=1, value=20, label="Umbral de dB para activar la predicción")
 
+        status_label = gr.Textbox(value="Esperando...", label="Estado")
+        prediction_label = gr.Textbox(label="Predicción")
+
+        audio_stream.stream(
+            fn=update_status_to_predicting,
+            inputs=[audio_stream, threshold_db],
+            outputs=status_label
+        )
+
+        # Capture the audio and run the prediction if the threshold is exceeded
+        audio_stream.stream(
+            fn=capture_and_predict,
+            inputs=audio_stream,
+            outputs=prediction_label
+        )
+
     boton_pagina_1.click(mostrar_pagina_1, inputs=None, outputs=[pantalla_inicial, pagina_1])
     boton_pagina_2.click(mostrar_pagina_2, inputs=None, outputs=[pantalla_inicial, pagina_2])
 
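One thing worth noting about the new wiring: a streaming `gr.Audio` passes each chunk to its callbacks as a `(sample_rate, numpy_array)` tuple, which is exactly what `update_status_to_predicting` unpacks. `capture_and_predict`, however, slices `audio` directly, so `audio[:max_samples]` operates on the tuple rather than the samples. A hedged sketch of unpacking the chunk first (reusing `predict_audio_stream` from this diff):

```python
import numpy as np

def capture_and_predict(audio, duration=5):
    sample_rate, audio_data = audio                    # streaming chunk: (sr, np.ndarray)
    audio_data = audio_data.astype(np.float32)
    audio_data = audio_data[:sample_rate * duration]   # slice samples, not the tuple
    result, prob = predict_audio_stream(audio_data, sample_rate)
    return f"Predicción: {result}, Probabilidad: {prob:.2f}"
```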