raphaelbiojout committed · a7da197 · Parent(s): 459b0c9

update

handler.py CHANGED (+30 -20)
@@ -192,9 +192,6 @@ class EndpointHandler():
         Return:
             A :obj:`dict` with the transcription segments
         """
-        for x in data.keys():
-            logger.info(f"key: {x}, value: {data[x]} ")
-            print(f"key: {x}, value: {data[x]} ")
 
         logger.info("--------------- CUDA ------------------------")
         logger.info(display_gpu_infos())
@@ -202,6 +199,16 @@ class EndpointHandler():
         # 1. process input
         inputs_encoded = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
+        options = data.pop("options", None)
+
+        # OPTIONS
+        info = False
+        if options and "info" in options.keys():
+            info = True
+
+        alignment = False
+        if options and "alignment" in options.keys():
+            alignment = True
 
         language = "fr"
         if parameters and "language" in parameters.keys():
@@ -220,35 +227,38 @@ class EndpointHandler():
         # audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
         # audio_tensor= torch.from_numpy(audio_nparray)
 
-        results = []
-
         # 2. transcribe
         device, batch_size, compute_type, whisper_model = whisper_config()
         logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
         transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
-
-
+        if info:
+            print(transcription["segments"])  # before alignment
         logger.info(transcription["segments"])
 
         # 3. align
-
-
-
-
-
-
-
+        if alignment:
+            logger.info("--------------- STARTING ALIGNMENT ------------------------")
+            model_a, metadata = whisperx.load_align_model(
+                language_code=transcription["language"], device=device)
+            transcription = whisperx.align(
+                transcription["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False)
+            if info:
+                print(transcription["segments"])
+                logger.info(transcription["segments"])
 
         # 4. Assign speaker labels
         logger.info("--------------- STARTING DIARIZATION ------------------------")
         # add min/max number of speakers if known
-
-
+        diarize_segments = self.diarize_model(audio_nparray)
+        if info:
+            print(diarize_segments)
+            logger.info(diarize_segments)
         # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
 
-
-
-
+        transcription = whisperx.assign_word_speakers(diarize_segments, transcription)
+        if info:
+            print(transcription["segments"])
+            logger.info(transcription["segments"])  # segments are now assigned speaker IDs
 
         if torch.cuda.is_available():
             logger.info("--------------- GPU ------------------------")
@@ -259,7 +269,7 @@ class EndpointHandler():
 
         # results_json = json.dumps(results)
         # return {"results": results_json}
-        return
+        return {"transcription": transcription["segments"]}
 
 
 
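For context, a minimal sketch of how a client might call the updated endpoint. The URL and file name are placeholders, and the assumption that "inputs" carries base64-encoded audio is inferred from the inputs_encoded variable, not stated in the commit:

# Hypothetical client-side request exercising the new "options" field.
# Endpoint URL, audio file, and base64 encoding of the payload are assumptions.
import base64
import requests

with open("audio.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "inputs": audio_b64,
    "parameters": {"language": "fr"},              # transcription language
    "options": {"info": True, "alignment": True},  # flags added by this commit
}

resp = requests.post("https://<your-endpoint>", json=payload)
for seg in resp.json()["transcription"]:
    # after whisperx.assign_word_speakers each segment carries a speaker label
    print(seg.get("speaker"), seg["start"], seg["end"], seg["text"])

Note that the handler only checks for key presence in options, so sending {"info": False} would still enable the flag; omit a key entirely to leave its feature off.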