Robertomarting committed
Commit 30bf287 (verified)
Parent: 86a85b1

Update app.py

Files changed (1): app.py (+72, -28)
app.py CHANGED
@@ -106,6 +106,36 @@ def clear_audio_input(audio):
 Monitor
 '''
 
+def process_audio_monitor(audio_tuple, target_sr=16000, target_duration=1.0):
+    data = []
+    target_length = int(target_sr * target_duration)
+
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, audio_tuple[1], audio_tuple[0], format='wav')
+
+    wav_buffer.seek(0)
+    audio_data, sample_rate = sf.read(wav_buffer)
+
+    audio_data = audio_data.astype(np.float32)
+
+    if len(audio_data.shape) > 1:
+        audio_data = np.mean(audio_data, axis=1)
+
+    if sample_rate != target_sr:
+        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=target_sr)
+
+    audio_data, _ = librosa.effects.trim(audio_data)
+
+    if len(audio_data) > target_length:
+        for i in range(0, len(audio_data), target_length):
+            segment = audio_data[i:i + target_length]
+            if len(segment) == target_length:
+                data.append(segment)
+    else:
+        data.append(audio_data)
+
+    return data
+
 # Get the feature extractor:
 FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")
 # And our model:
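A quick sanity check of the new `process_audio_monitor`, using a synthetic Gradio-style `(sample_rate, samples)` tuple (a minimal sketch; it relies on the `io`, `soundfile as sf`, `librosa`, and `numpy as np` imports that app.py already has):

```python
import numpy as np

sr = 44100
samples = (0.1 * np.random.randn(sr * 3)).astype(np.float32)  # ~3 s of noise

# Round-trips through a WAV buffer, resamples to 16 kHz, trims silence,
# and splits the result into 1-second segments:
segments = process_audio_monitor((sr, samples))
print(len(segments), segments[0].shape)  # e.g. "3 (16000,)"
```

Note that when the clip is longer than `target_length`, any short remainder segment is dropped; a clip of one second or less is returned whole as a single (possibly shorter) segment.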
@@ -130,41 +160,33 @@ def preprocess_audio_monitor(audio_segments):
 
 # Streaming prediction function:
 def predict_audio_stream(audio_data, sample_rate):
-
-    audio_segments = process_audio(audio_data)
-    inputs = preprocess_audio_monitor(audio_segments)
+    audio_segments = process_audio_monitor(audio_data)
 
+    inputs = preprocess_audio_monitor(audio_segments)
+
     with torch.no_grad():
         outputs = model_monitor(**inputs)
-
+
     logits = outputs.logits
     probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
     crying_probabilities = probabilities[:, 1]
     avg_crying_probability = crying_probabilities.mean()
-
-    if avg_crying_probability < 0.15:
-        return "Está llorando", avg_crying_probability
-    else:
-        return "No está llorando", avg_crying_probability
-
-# Function that runs the prediction
-def continuous_prediction_with_status(audio, sample_rate=16000, duration=3):
-    audio_segments = []
-    start_time = time.time()
-
-    max_samples = sample_rate * duration
-    audio_data = audio[:max_samples]
 
-    result = predict_audio_stream(audio_data, sample_rate)
-
-    return result
-
-def capture_and_predict(audio, sample_rate=16000, duration=5):
-    max_samples = sample_rate * duration
-    audio_data = audio[:max_samples]
+    if avg_crying_probability < 0.25:
+
+        inputs = preprocess_audio(audio_segments)
+        with torch.no_grad():
+            outputs = model(**inputs)
+        logits = outputs.logits
+        probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
+        predicted_classes = probabilities.argmax(axis=1)
+        most_common_predicted_label = Counter(predicted_classes).most_common(1)[0][0]
+        replace_dict = {0: 'Hambre', 1: 'Problemas para respirar', 2: 'Dolor', 3: 'Cansancio/Incomodidad'}
+        most_common_predicted_label = replace_dict[most_common_predicted_label]
 
-    result, probabilidad = predict_audio_stream(audio_data, sample_rate)
-    return f"Predicción: {result}, Probabilidad: {probabilidad:.2f}", probabilidad
+        return "Está llorando", 1 - avg_crying_probability, most_common_predicted_label
+    else:
+        return "No está llorando", 1 - avg_crying_probability, ""
 
 # Function that tells the user whether the threshold has been crossed:
 def update_status_to_predicting(audio, visual_threshold):
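Note the inverted-looking gate: a *low* `avg_crying_probability` (softmax index 1, averaged over segments) triggers the "Está llorando" branch, and `1 - avg_crying_probability` is reported as the confidence, which suggests index 1 of the binary model is effectively the "no cry" class despite the `crying_probabilities` name. The second stage then reuses the same softmax pipeline with the cause classifier (`model` and `preprocess_audio`, defined elsewhere in app.py) and majority-votes the per-segment argmax. The vote in isolation, with dummy logits standing in for the model output:

```python
import torch
from collections import Counter

# Dummy logits: 3 one-second segments x 4 cry-cause classes.
logits = torch.tensor([[2.0, 0.1, 0.3, 0.2],
                       [1.5, 0.2, 0.1, 0.9],
                       [0.1, 0.2, 2.2, 0.4]])

probabilities = torch.nn.functional.softmax(logits, dim=-1).numpy()
predicted_classes = probabilities.argmax(axis=1)            # one class id per segment
majority = Counter(predicted_classes).most_common(1)[0][0]  # most frequent id wins

replace_dict = {0: 'Hambre', 1: 'Problemas para respirar', 2: 'Dolor', 3: 'Cansancio/Incomodidad'}
print(replace_dict[majority])  # 'Hambre' (segments vote 0, 0, 2)
```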
@@ -177,7 +199,29 @@ def update_status_to_predicting(audio, visual_threshold):
         return f"Esperando... Decibelios: {db_level}"
     else:
         return f"Prediciendo... Decibelios: {db_level}"
+        time.sleep(5)
 
+# Function that runs the prediction
+def capture_and_predict(audio, visual_threshold, sample_rate=16000, duration=5):
+
+    sample_rate, audio_data = audio
+    audio_data = np.array(audio_data, dtype=np.float32)
+    db_level = compute_db(audio_data)
+
+    if db_level > visual_threshold:
+        max_samples = sample_rate * duration
+        audio_data = audio[:max_samples]
+        if len(audio_data) != 0:
+            result, probabilidad, result_2 = predict_audio_stream(audio_data, sample_rate)
+            if result == "Está llorando":
+                return f"{result}, por {result_2}"
+                time.sleep(10)
+            else:
+                return "No está llorando"
+                time.sleep(5)
+    else:
+        time.sleep(1)
+
 '''
 Assistant
 '''
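`compute_db` is defined earlier in app.py (it is not part of this diff). Two quirks in the added `capture_and_predict` are worth flagging: the `time.sleep` calls placed after each `return` never execute, and `audio[:max_samples]` slices the `(sample_rate, data)` tuple rather than the samples (a no-op for any `max_samples >= 2`, so the whole tuple is forwarded, which is what `process_audio_monitor` expects anyway, but the `duration` cap is never actually applied). A minimal reworked sketch, with a hypothetical RMS-based stand-in for `compute_db`:

```python
import numpy as np

def compute_db(audio_data: np.ndarray) -> float:
    """Hypothetical RMS level in dB; app.py defines its own compute_db."""
    rms = np.sqrt(np.mean(np.square(audio_data, dtype=np.float64)))
    return float(20 * np.log10(rms + 1e-10))  # epsilon avoids log10(0) on silence

def capture_and_predict(audio, visual_threshold, sample_rate=16000, duration=5):
    sample_rate, audio_data = audio
    audio_data = np.array(audio_data, dtype=np.float32)

    if compute_db(audio_data) <= visual_threshold:
        return "Esperando..."  # below the threshold: nothing to classify

    audio_data = audio_data[:sample_rate * duration]  # cap the samples, not the tuple
    # As in the committed code, predict_audio_stream's first argument is the
    # tuple that process_audio_monitor consumes:
    result, probabilidad, result_2 = predict_audio_stream((sample_rate, audio_data), sample_rate)
    return f"{result}, por {result_2}" if result == "Está llorando" else result
```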
@@ -345,7 +389,7 @@ with gr.Blocks(theme = my_theme) as demo:
 
     audio_stream = gr.Audio(sources=["microphone"], streaming=True)
 
-    threshold_db = gr.Slider(minimum=0, maximum=200, step=1, value=20, label="Umbral de dB para activar la predicción")
+    threshold_db = gr.Slider(minimum=0, maximum=200, step=1, value=50, label="Umbral de dB para activar la predicción")
 
     status_label = gr.Textbox(value="Esperando...", label="Estado")
     prediction_label = gr.Textbox(label="Predicción")
@@ -359,7 +403,7 @@
     # Capture the audio and run the prediction if the threshold is exceeded
     audio_stream.stream(
         fn=capture_and_predict,
-        inputs=audio_stream,
+        inputs=[audio_stream, threshold_db],
         outputs=prediction_label
     )
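For reference, the resulting wiring in isolation (a sketch assuming Gradio 4.x; with `streaming=True`, Gradio invokes the handler repeatedly, passing each chunk as a `(sample_rate, np.ndarray)` tuple plus the slider's current value):

```python
import gradio as gr

def echo_chunk(audio, threshold):
    sr, chunk = audio  # each streamed chunk arrives as (sample_rate, samples)
    return f"{len(chunk)} samples @ {sr} Hz, umbral = {threshold} dB"

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], streaming=True)
    thr = gr.Slider(minimum=0, maximum=200, step=1, value=50, label="Umbral de dB")
    out = gr.Textbox(label="Estado")
    mic.stream(fn=echo_chunk, inputs=[mic, thr], outputs=out)

demo.launch()
```

Passing `[audio_stream, threshold_db]` as `inputs` is what delivers the slider value as `capture_and_predict`'s second positional argument; the earlier `inputs=audio_stream` form could never supply `visual_threshold`.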