rbiojout
/

whisperX-endpoint

Model card Files Files and versions

xet

Community

raphaelbiojout committed on Nov 20, 2023

Commit

a0580e3

1 Parent(s): 25edf8d

update handler

Browse files

Files changed (1) hide show

handler.py +57 -1

handler.py CHANGED Viewed

@@ -8,6 +8,8 @@ import base64
 import subprocess
 import numpy as np
 # from transformers.pipelines.audio_utils import ffmpeg_read
 from typing import Dict, List, Any
@@ -26,6 +28,52 @@ def whisper_config():
     compute_type = "float16" if device == "cuda" else "int8"
     return device, batch_size, compute_type, whisper_model
 def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
     """
     Helper function to read an audio file through ffmpeg.
@@ -186,8 +234,16 @@ class EndpointHandler():
                 language = parameters["language"]
             inputs = base64.b64decode(inputs_encoded)
-            audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
             # audio_tensor= torch.from_numpy(audio_nparray)
             results = []

 import subprocess
 import numpy as np
+DEVNULL = open(os.devnull, 'w')
 # from transformers.pipelines.audio_utils import ffmpeg_read
 from typing import Dict, List, Any
     compute_type = "float16" if device == "cuda" else "int8"
     return device, batch_size, compute_type, whisper_model
+# load_audio can not detect the input type
def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np.int16, out_type=np.float32):
    """Decode an audio file into a NumPy array by piping it through ffmpeg.

    ffmpeg is asked to emit raw little-endian PCM on stdout, resampled to
    ``sr`` Hz and mixed to one or two channels; the bytes are then
    reinterpreted as ``in_type`` samples and cast to ``out_type``.

    Args:
        filename: Path (or any input ffmpeg accepts after ``-i``) to decode.
        sr: Target sample rate in Hz, passed to ffmpeg as ``-ar``.
        mono: If true request a single channel, otherwise two channels.
        normalize: For floating-point ``out_type`` only, divide by the peak
            absolute sample so the loudest sample has magnitude 1.0.
        in_type: NumPy scalar type of the PCM stream requested from ffmpeg.
            Must be one of float64, float32, int16, int32 or uint32.
        out_type: NumPy dtype of the returned array.

    Returns:
        A 1-D array when ``mono`` is true, otherwise an array of shape
        ``(2, n_frames)``. The array is empty when ffmpeg produced no audio
        (for example because it could not decode the input); ffmpeg's exit
        status and stderr are deliberately ignored, as before.

    Raises:
        KeyError: If ``in_type`` is not one of the supported sample types.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    channels = 1 if mono else 2
    format_strings = {
        np.float64: 'f64le',
        np.float32: 'f32le',
        np.int16: 's16le',
        np.int32: 's32le',
        np.uint32: 'u32le',
    }
    # Looked up before spawning anything so an unsupported type fails fast.
    format_string = format_strings[in_type]
    command = [
        'ffmpeg',
        '-i', filename,
        '-f', format_string,
        '-acodec', 'pcm_' + format_string,
        '-ar', str(sr),
        '-ac', str(channels),
        '-']

    # communicate() drains stdout in one pass (no quadratic `raw += chunk`)
    # and waits for the child, so no zombie process or open pipe is left
    # behind. subprocess.DEVNULL avoids relying on a module-level file handle.
    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as proc:
        raw, _ = proc.communicate()

    # Drop a trailing partial frame (possible if ffmpeg died mid-write) so
    # the buffer length always matches the dtype and the channel reshape.
    frame_size = np.dtype(in_type).itemsize * channels
    usable = len(raw) - len(raw) % frame_size

    # np.fromstring in binary mode is deprecated; frombuffer is the supported
    # replacement. astype() copies, so the result is writable for the
    # in-place scaling below.
    audio = np.frombuffer(raw[:usable], dtype=in_type).astype(out_type)
    if channels > 1:
        # Interleaved samples -> (channels, n_frames).
        audio = audio.reshape((-1, channels)).transpose()
    if audio.size == 0:
        # Same return shape as the non-empty path: just the array.
        return audio
    # issubdtype also accepts dtype instances and strings for out_type.
    if np.issubdtype(out_type, np.floating):
        if normalize:
            peak = np.abs(audio).max()
            if peak > 0:
                audio /= peak
        elif issubclass(in_type, np.integer):
            # Scale integer PCM by the maximum positive value of its type.
            audio /= np.iinfo(in_type).max
    return audio
 def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
     """
     Helper function to read an audio file through ffmpeg.
                 language = parameters["language"]
             inputs = base64.b64decode(inputs_encoded)
+            # make a tmp file
+            with open('/tmp/myfile.tmp', 'wb') as w:
+                w.write(inputs)
+            # audio_nparray = ffmpeg_load_audio('/tmp/myfile.tmp', sr=SAMPLE_RATE, mono=True, out_type=np.float32)
+            audio_nparray = load_audio('/tmp/myfile.tmp', sr=SAMPLE_RATE)
+            # clean up
+            os.remove('/tmp/myfile.tmp')
+            # audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
             # audio_tensor= torch.from_numpy(audio_nparray)
             results = []