raphaelbiojout committed
Commit a7da197 · 1 Parent(s): 459b0c9
Files changed (1):
  1. handler.py +30 -20
handler.py CHANGED

@@ -192,9 +192,6 @@ class EndpointHandler():
         Return:
             A :obj:`dict`: base64 encoded image
         """
-        for x in data.keys():
-            logger.info(f"key: {x}, value: {data[x]} ")
-            print(f"key: {x}, value: {data[x]} ")

         logger.info("--------------- CUDA ------------------------")
         logger.info(display_gpu_infos())
@@ -202,6 +199,16 @@ class EndpointHandler():
         # 1. process input
         inputs_encoded = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
+        options = data.pop("options", None)
+
+        # OPTIONS
+        info = False
+        if options and "info" in options.keys():
+            info = True
+
+        alignment = False
+        if options and "alignment" in options.keys():
+            alignment = True

         language = "fr"
         if parameters and "language" in parameters.keys():
@@ -220,35 +227,38 @@ class EndpointHandler():
         # audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
         # audio_tensor= torch.from_numpy(audio_nparray)

-        results = []
-
         # 2. transcribe
         device, batch_size, compute_type, whisper_model = whisper_config()
         logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
         transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
-        results.append({"transcription": transcription["segments"]})
-
+        if info:
+            print(transcription["segments"])  # before alignment
         logger.info(transcription["segments"])

         # 3. align
-        logger.info("--------------- STARTING ALIGNMENT ------------------------")
-        # model_a, metadata = whisperx.load_align_model(
-        #     language_code=result["language"], device=device)
-        # transcription = whisperx.align(
-        #     result["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False)
-        # results.append({"aligned_transcription": transcription["segments"]})
-        # print(transcription["segments"])
+        if alignment:
+            logger.info("--------------- STARTING ALIGNMENT ------------------------")
+            model_a, metadata = whisperx.load_align_model(
+                language_code=transcription["language"], device=device)
+            transcription = whisperx.align(
+                transcription["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False)
+            if info:
+                print(transcription["segments"])
+            logger.info(transcription["segments"])

         # 4. Assign speaker labels
         logger.info("--------------- STARTING DIARIZATION ------------------------")
         # add min/max number of speakers if known
-        #diarize_segments = self.diarize_model(audio_nparray)
-        #logger.info(diarize_segments)
+        diarize_segments = self.diarize_model(audio_nparray)
+        if info:
+            print(diarize_segments)
+        logger.info(diarize_segments)
         # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

-        #diarized_transcription = whisperx.assign_word_speakers(diarize_segments, transcription)
-        #logger.info(diarized_transcription["segments"]) # segments are now assigned speaker IDs
-        #results.append({"diarized_transcription": diarized_transcription["segments"]})
+        transcription = whisperx.assign_word_speakers(diarize_segments, transcription)
+        if info:
+            print(transcription["segments"])
+        logger.info(transcription["segments"])  # segments are now assigned speaker IDs

         if torch.cuda.is_available():
             logger.info("--------------- GPU ------------------------")
@@ -259,7 +269,7 @@ class EndpointHandler():

         # results_json = json.dumps(results)
         # return {"results": results_json}
-        return results
+        return {"transcription": transcription["segments"]}
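The commit adds an `options` field next to `inputs` and `parameters`. Note that the handler only checks key presence (`"info" in options.keys()`), so any value turns a flag on. A minimal sketch of a request body exercising the new fields; the field names follow the diff, the base64 audio string is a placeholder:

# Hypothetical request body for the updated handler (a sketch, not code from the repo).
payload = {
    "inputs": "<base64-encoded audio>",            # decoded to a numpy array by the handler
    "parameters": {"language": "fr"},              # handler falls back to "fr" when absent
    "options": {"info": True, "alignment": True},  # presence alone enables each flag
}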
 
 
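The alignment block enabled by this commit is the standard whisperx transcribe-then-align flow. A self-contained sketch under stated assumptions: the model name, audio path, batch size, and device are illustrative, the handler takes them from whisper_config() instead.

import whisperx

device = "cuda"
# Assumed model and compute type, for illustration only.
model = whisperx.load_model("large-v2", device, compute_type="float16")
audio = whisperx.load_audio("sample.wav")  # float32 numpy array at 16 kHz
result = model.transcribe(audio, batch_size=16, language="fr")

# Word-level alignment, mirroring the block the commit uncomments.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device,
                        return_char_alignments=False)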
 
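Diarization and speaker assignment also go live in this commit. The diff only shows the call sites (self.diarize_model(...), whisperx.assign_word_speakers(...)); how the pipeline is built is an assumption here. Continuing the names from the sketch above:

# Pipeline construction is an assumption; the handler creates diarize_model elsewhere.
# Speaker bounds are optional, as the commented diarize_model(...) line in the diff notes.
diarize_model = whisperx.DiarizationPipeline(use_auth_token="<HF_TOKEN>", device=device)
diarize_segments = diarize_model(audio)  # or: diarize_model(audio, min_speakers=2, max_speakers=4)
result = whisperx.assign_word_speakers(diarize_segments, result)
for seg in result["segments"]:
    print(seg.get("speaker", "?"), seg["text"])  # segments now carry speaker IDs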
 
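Since the handler now returns {"transcription": transcription["segments"]} instead of the old results list, clients read the segments from a single key. A hedged client-side sketch; the endpoint URL and token are placeholders:

import requests

resp = requests.post("https://<endpoint-url>",
                     json=payload,  # the request body sketched above
                     headers={"Authorization": "Bearer <HF_TOKEN>"})
segments = resp.json()["transcription"]  # matches the new return shape
for seg in segments:
    print(f'[{seg["start"]:.2f}-{seg["end"]:.2f}] {seg.get("speaker", "?")}: {seg["text"]}')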