sanchit-gandhi committed on
Commit
c220da3
·
1 Parent(s): 24fab16

try un-batched

Browse files
Files changed (1) hide show
  1. app.py +8 -22
app.py CHANGED
@@ -24,6 +24,7 @@ language_names = sorted(TO_LANGUAGE_CODE.keys())
24
  CHUNK_LENGTH_S = 30
25
  BATCH_SIZE = 16
26
  NUM_PROC = 8
 
27
  FILE_LIMIT_MB = 1000
28
 
29
 
@@ -70,10 +71,7 @@ def forward(batch, task=None, return_timestamps=False):
70
 
71
 
72
  if __name__ == "__main__":
73
- processor = WhisperPrePostProcessor.from_pretrained("openai/whisper-large-v2")
74
- pool = Pool(NUM_PROC)
75
-
76
- def transcribe_chunked_audio(microphone, file_upload, task, return_timestamps):
77
  warn_output = ""
78
  if (microphone is not None) and (file_upload is not None):
79
  warn_output = (
@@ -82,31 +80,19 @@ if __name__ == "__main__":
82
  )
83
 
84
  elif (microphone is None) and (file_upload is None):
85
- return "ERROR: You have to either use the microphone or upload an audio file", None
86
 
87
  inputs = microphone if microphone is not None else file_upload
88
 
89
- file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
90
- if file_size_mb > FILE_LIMIT_MB:
91
- return f"ERROR: File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB.", None
92
-
93
  with open(inputs, "rb") as f:
94
  inputs = f.read()
95
 
96
- inputs = ffmpeg_read(inputs, processor.feature_extractor.sampling_rate)
97
- inputs = {"array": inputs, "sampling_rate": processor.feature_extractor.sampling_rate}
98
-
99
- dataloader = processor.preprocess_batch(inputs, chunk_length_s=CHUNK_LENGTH_S, batch_size=BATCH_SIZE)
100
 
101
- try:
102
- model_outputs = pool.map(partial(forward, task=task, return_timestamps=return_timestamps), dataloader)
103
- except ValueError as err:
104
- # pre-processor does all the necessary compatibility checks for our audio inputs
105
- return err, None
106
 
107
- post_processed = processor.postprocess(model_outputs, return_timestamps=return_timestamps)
108
- timestamps = post_processed.get("chunks")
109
- return warn_output + post_processed["text"], timestamps
110
 
111
  def _return_yt_html_embed(yt_url):
112
  video_id = yt_url.split("?v=")[-1]
@@ -124,7 +110,7 @@ if __name__ == "__main__":
124
  return html_embed_str, text, timestamps
125
 
126
  audio_chunked = gr.Interface(
127
- fn=transcribe_chunked_audio,
128
  inputs=[
129
  gr.inputs.Audio(source="microphone", optional=True, type="filepath"),
130
  gr.inputs.Audio(source="upload", optional=True, type="filepath"),
 
24
  CHUNK_LENGTH_S = 30
25
  BATCH_SIZE = 16
26
  NUM_PROC = 8
27
+ SAMPLING_RATE = 16000
28
  FILE_LIMIT_MB = 1000
29
 
30
 
 
71
 
72
 
73
  if __name__ == "__main__":
74
+ def transcribe_audio(microphone, file_upload, task, return_timestamps):
 
 
 
75
  warn_output = ""
76
  if (microphone is not None) and (file_upload is not None):
77
  warn_output = (
 
80
  )
81
 
82
  elif (microphone is None) and (file_upload is None):
83
+ return "ERROR: You have to either use the microphone or upload an audio file"
84
 
85
  inputs = microphone if microphone is not None else file_upload
86
 
 
 
 
 
87
  with open(inputs, "rb") as f:
88
  inputs = f.read()
89
 
90
+ inputs = ffmpeg_read(inputs, SAMPLING_RATE)
91
+ inputs = {"array": base64.b64encode(inputs.tobytes()).decode(), "sampling_rate": SAMPLING_RATE}
 
 
92
 
93
+ text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
 
 
 
 
94
 
95
+ return warn_output + text, timestamps
 
 
96
 
97
  def _return_yt_html_embed(yt_url):
98
  video_id = yt_url.split("?v=")[-1]
 
110
  return html_embed_str, text, timestamps
111
 
112
  audio_chunked = gr.Interface(
113
+ fn=transcribe_audio,
114
  inputs=[
115
  gr.inputs.Audio(source="microphone", optional=True, type="filepath"),
116
  gr.inputs.Audio(source="upload", optional=True, type="filepath"),