sanchit-gandhi (HF staff) committed
Commit af74e64
1 Parent(s): a1c0e65

Update app.py

Files changed (1): app.py +10 -19
app.py CHANGED
@@ -1,15 +1,12 @@
 import gradio as gr
 import requests
-import pytube
 from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
-from transformers.pipelines.audio_utils import ffmpeg_read
 
-title = "Whisper JAX: The Fastest Whisper API Available ⚡️"
 
-description = """Whisper JAX is an optimised implementation of the [Whisper model](https://huggingface.co/openai/whisper-large-v2) by OpenAI. It runs on JAX with a TPU v4-8 in the backend. Compared to PyTorch on an A100 GPU, it is over **12x** faster, making it the fastest Whisper API available.
+title = "Whisper JAX: The Fastest Whisper API Available ⚡️"
 
-You can submit requests to Whisper JAX through this Gradio Demo, or directly through API calls (see below). This notebook demonstrates how you can run the Whisper JAX model yourself on a TPU v2-8 in a Google Colab: TODO.
-"""
+description = "Whisper JAX is an optimised implementation of the [Whisper model](https://huggingface.co/openai/whisper-large-v2) by OpenAI. It runs on JAX with a TPU v4-8 in the backend. Compared to PyTorch on an A100 GPU, it is over **12x** faster, making it the fastest Whisper API available."
+#description += "\nYou can submit requests to Whisper JAX through this Gradio Demo, or directly through API calls (see below). This notebook demonstrates how you can run the Whisper JAX model yourself on a TPU v2-8 in a Google Colab: TODO."
 
 API_URL = "https://whisper-jax.ngrok.io/generate/"
 
@@ -24,9 +21,13 @@ def query(payload):
     return response.json(), response.status_code
 
 
-def inference(inputs, task, return_timestamps):
+def inference(inputs, language=None, task=None, return_timestamps=False):
     payload = {"inputs": inputs, "task": task, "return_timestamps": return_timestamps}
 
+    # language can come as an empty string from the Gradio `None` default, so we handle it separately
+    if language:
+        payload["language"] = language
+
     data, status_code = query(payload)
 
     if status_code == 200:
@@ -72,22 +73,12 @@ def _return_yt_html_embed(yt_url):
 
 
 def transcribe_youtube(yt_url, task, return_timestamps):
-    yt = pytube.YouTube(yt_url)
     html_embed_str = _return_yt_html_embed(yt_url)
-    stream = yt.streams.filter(only_audio=True)[0]
-    stream.download(filename="audio.mp3")
 
-    with open("audio.mp3", "rb") as f:
-        inputs = f.read()
+    text, timestamps = inference(inputs=yt_url, task=task, return_timestamps=return_timestamps)
 
-    inputs = ffmpeg_read(inputs, SAMPLING_RATE)
-    inputs = {"array": inputs.tolist(), "sampling_rate": SAMPLING_RATE}
-
-    yield html_embed_str, "Video loaded, transcribing audio...", None
-
-    text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
+    return html_embed_str, text, timestamps
 
-    yield html_embed_str, text, timestamps
 
 audio = gr.Interface(
     fn=transcribe_audio,
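The commented-out description still advertises direct API calls, but the diff never shows how `query` sends the request. Below is a minimal sketch of what a direct call might look like, assuming the endpoint accepts a JSON POST with the same payload that `inference` builds; the `requests.post` transport and the example YouTube URL are assumptions, not taken from this commit.

import requests

API_URL = "https://whisper-jax.ngrok.io/generate/"

# Same payload shape that `inference` builds in app.py.
payload = {
    "inputs": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # hypothetical example video
    "task": "transcribe",  # or "translate"
    "return_timestamps": True,
}

# Mirror the guard added in this commit: Gradio can pass "" instead of None,
# so only attach `language` when it is a non-empty string.
language = "english"
if language:
    payload["language"] = language

# Assumed transport: the diff only shows that `query` returns
# `response.json(), response.status_code`.
response = requests.post(API_URL, json=payload)
data, status_code = response.json(), response.status_code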
 
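The `TO_LANGUAGE_CODE` import is kept by this commit, although the lines that use it fall outside the hunks shown. For reference, it maps human-readable Whisper language names to the short codes the model expects, which pairs naturally with the new `language` parameter on `inference`:

from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE

# Map a dropdown-style language name to the code Whisper expects.
print(TO_LANGUAGE_CODE["english"])  # en
print(TO_LANGUAGE_CODE["french"])   # fr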