Robertomarting committed
Commit 86a85b1 (verified)
1 Parent(s): 642e7e6

Update app.py

Files changed (1)
app.py +129 -65
app.py CHANGED
@@ -15,17 +15,26 @@ from collections import Counter
from scipy.stats import kurtosis
from huggingface_hub import InferenceClient
import os
+ import time

+ '''
+ Predictor
+ '''
+
+ #Get the token used to fetch the model:
access_token_mod_1 = os.getenv('HF_Access_Personal')

- # Load the processor and model
+ #Load processor and model:
processor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = AutoModelForAudioClassification.from_pretrained("Robertomarting/tmp_trainer",token=access_token_mod_1)

+ #Define a function to discard audio segments with a given share of white noise:
def is_white_noise(audio, threshold=0.75):
    kurt = kurtosis(audio)
    return np.abs(kurt) < 0.1 and np.mean(np.abs(audio)) < threshold

+ #Audio processing function: splits into 1-second fragments, trims, converts stereo to mono,
+ #and resamples to the sampling rate the model accepts, etc.
def process_audio(audio_tuple, target_sr=16000, target_duration=1.0):
    data = []
    target_length = int(target_sr * target_duration)
@@ -57,6 +66,7 @@ def process_audio(audio_tuple, target_sr=16000, target_duration=1.0):

    return data

+ #Apply the model's feature extractor:
def preprocess_audio(audio_segments):
    inputs = processor(
        audio_segments,
@@ -64,32 +74,26 @@ def preprocess_audio(audio_segments):
        sampling_rate=processor.sampling_rate,
        max_length=int(processor.sampling_rate * 1),
        truncation=True,
-         return_tensors="pt" # Directly returns PyTorch tensors
+         return_tensors="pt"
    )
    return inputs

+ #Run the prediction for each audio clip:
def predict_audio(audio):
-     # Process the audio and get the numpy lists
-     audio_segments = process_audio(audio)

-     # Preprocess the audio (applies directly to the numpy array)
+     audio_segments = process_audio(audio)
+
    inputs = preprocess_audio(audio_segments)

-     # Run the predictions
    with torch.no_grad():
        outputs = model(**inputs)

-     # Get the prediction logits
    logits = outputs.logits
-
-     # Convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
    predicted_classes = probabilities.argmax(axis=1)

-     # Get the most common label
    most_common_predicted_label = Counter(predicted_classes).most_common(1)[0][0]
-
-     # Map numeric labels to text labels
+
    replace_dict = {0: 'Hambre', 1: 'Problemas para respirar', 2: 'Dolor', 3: 'Cansancio/Incomodidad'}
    most_common_predicted_label = replace_dict[most_common_predicted_label]

@@ -98,10 +102,93 @@ def predict_audio(audio):
def clear_audio_input(audio):
    return ""

+ '''
+ Monitor
+ '''
+
+ #Get the feature extractor:
+ FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
+ #And our model:
+ model_monitor = HubertForSequenceClassification.from_pretrained("A-POR-LOS-8000/distilhubert-finetuned-cry-detector")
+
+ #Compute the decibel level of the audio that reaches Gradio:
+ def compute_db(audio_data):
+     rms = np.sqrt(np.mean(np.square(audio_data)))
+     db = 20 * np.log10(rms + 1e-6)
+     return db
+
+ #Feature extraction function for the monitor:
+ def preprocess_audio_monitor(audio_segments):
+     inputs = FEATURE_EXTRACTOR(
+         audio_segments,
+         padding=True,
+         sampling_rate=16000,
+         max_length=int(16000*1),
+         return_tensors="pt"
+     )
+     return inputs
+
+ #Streaming prediction function:
+ def predict_audio_stream(audio_data, sample_rate):
+
+     audio_segments = process_audio(audio_data)
+     inputs = preprocess_audio_monitor(audio_segments)
+
+     with torch.no_grad():
+         outputs = model_monitor(**inputs)
+
+     logits = outputs.logits
+     probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
+     crying_probabilities = probabilities[:, 1]
+     avg_crying_probability = crying_probabilities.mean()
+
+     if avg_crying_probability < 0.15:
+         return "Está llorando", avg_crying_probability
+     else:
+         return "No está llorando", avg_crying_probability
+
+ #Function that runs the prediction
+ def continuous_prediction_with_status(audio, sample_rate=16000, duration=3):
+     audio_segments = []
+     start_time = time.time()
+
+     max_samples = sample_rate * duration
+     audio_data = audio[:max_samples]
+
+     result = predict_audio_stream(audio_data, sample_rate)
+
+     return result
+
+ def capture_and_predict(audio, sample_rate=16000, duration=5):
+     max_samples = sample_rate * duration
+     audio_data = audio[:max_samples]
+
+     result, probabilidad = predict_audio_stream(audio_data, sample_rate)
+     return f"Predicción: {result}, Probabilidad: {probabilidad:.2f}", probabilidad
+
+ #Function that tells the user whether the threshold has been exceeded:
+ def update_status_to_predicting(audio, visual_threshold):
+     sample_rate, audio_data = audio
+     audio_data = np.array(audio_data, dtype=np.float32)
+
+     db_level = compute_db(audio_data)
+
+     if db_level < visual_threshold:
+         return f"Esperando... Decibelios: {db_level}"
+     else:
+         return f"Prediciendo... Decibelios: {db_level}"
+
+ '''
+ Asistente
+ '''
+
+ #Get the token:
access_token = os.getenv('HF_ACCESS_TOKEN')

+ #Create the client:
client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=access_token)

+ #Define a response function:
def respond(
    message,
    history: list[tuple[str, str]],
@@ -133,6 +220,11 @@ def respond(
        response += token
        yield response

+ '''
+ Interfaz
+ '''
+
+ #Create a theme with custom parameters:
my_theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="green",
@@ -150,54 +242,19 @@ my_theme = gr.themes.Soft(
    shadow_spread='*button_shadow_active'
)

- # Function to show page 1
+ #Function to show the Predictor page
def mostrar_pagina_1():
    return gr.update(visible=False), gr.update(visible=True)

- # Function to show page 2
+ #Function to show the Monitor page
def mostrar_pagina_2():
    return gr.update(visible=False), gr.update(visible=True)

- # Function to return to the initial screen
+ #Function to return to the initial screen
def redirigir_a_pantalla_inicial():
    return gr.update(visible=True), gr.update(visible=False)

- ### Monitor
-
- #processor = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
-
- #monitor_model = HubertForSequenceClassification.from_pretrained("A-POR-LOS-8000/distilhubert-finetuned-cry-detector",use_auth_token=access_token_mod_1)
-
- pipeline_monitor = pipeline(model="Robertomarting/tmp_trainer",token=access_token_mod_1,feature_extractor=processor)
-
- def predict_monitor(stream, new_chunk):
-     sr, y = new_chunk
-     y = y.astype(np.float32)
-     y /= np.max(np.abs(y))
-
-     if stream is not None:
-         stream = np.concatenate([stream, y])
-     else:
-         stream = y
-     return stream, pipeline_monitor(stream)
-
- my_theme = gr.themes.Soft(
-     primary_hue="emerald",
-     secondary_hue="green",
-     neutral_hue="slate",
-     text_size="sm",
-     spacing_size="sm",
-     font=[gr.themes.GoogleFont('Nunito'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
-     font_mono=[gr.themes.GoogleFont('Nunito'), 'ui-monospace', 'Consolas', 'monospace'],
- ).set(
-     body_background_fill='*neutral_50',
-     body_text_color='*neutral_600',
-     body_text_size='*text_sm',
-     embed_radius='*radius_md',
-     shadow_drop='*shadow_spread',
-     shadow_spread='*button_shadow_active'
- )
-
+ #Build the Gradio app:
with gr.Blocks(theme = my_theme) as demo:

    with gr.Column() as pantalla_inicial:
@@ -261,7 +318,7 @@ with gr.Blocks(theme = my_theme) as demo:

        with gr.Column():
            gr.Markdown("<h2>Assistant</h2>")
-             system_message = "You are a Chatbot specialized in baby health and care."
+             system_message = "Eres un chatbot especializado en el cuidado y la salud de los bebés. Estás dispuesto a ayudar amablemente a cualquier padre que tenga dudas o preocupaciones sobre su hijo o hija."
            max_tokens = 512
            temperature = 0.7
            top_p = 0.95
@@ -282,23 +339,30 @@
        boton_volver_inicio_1.click(redirigir_a_pantalla_inicial, inputs=None, outputs=[pantalla_inicial, pagina_1])

    with gr.Column(visible=False) as pagina_2:
+
        gr.Markdown("<h2>Monitor</h2>")
-         gr.Markdown("# Detección en tiempo real del llanto del bebé con Pipeline")
+         gr.Markdown("<h4 style='text-align: center; font-size: 1.5em'>Detección en tiempo real del llanto del bebé</h4>")

-         # Streaming audio component
-         audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Habla cerca del micrófono")
-
-         # Text output where the prediction is shown
-         output_text = gr.Textbox(label="Resultado de la predicción")
-
-         # Link the streaming prediction to the audio
-         audio_input.stream(fn=lambda audio: predict_monitor(audio, audio_classifier),
-                            inputs=audio_input,
-                            outputs=output_text)
+         audio_stream = gr.Audio(sources=["microphone"], streaming=True)

-         boton_volver_inicio_2 = gr.Button("Volver a la pantalla inicial")
-         boton_volver_inicio_2.click(redirigir_a_pantalla_inicial, inputs=None, outputs=[pantalla_inicial, pagina_2])
+         threshold_db = gr.Slider(minimum=0, maximum=200, step=1, value=20, label="Umbral de dB para activar la predicción")

+         status_label = gr.Textbox(value="Esperando...", label="Estado")
+         prediction_label = gr.Textbox(label="Predicción")
+
+         audio_stream.stream(
+             fn=update_status_to_predicting,
+             inputs=[audio_stream, threshold_db],
+             outputs=status_label
+         )
+
+         # Capture the audio and run the prediction if the threshold is exceeded
+         audio_stream.stream(
+             fn=capture_and_predict,
+             inputs=audio_stream,
+             outputs=prediction_label
+         )
+
    boton_pagina_1.click(mostrar_pagina_1, inputs=None, outputs=[pantalla_inicial, pagina_1])
    boton_pagina_2.click(mostrar_pagina_2, inputs=None, outputs=[pantalla_inicial, pagina_2])