Spaces:

kanyekuthi
/

kanyekuthi-AfriSpeech-whisper-tiny

Runtime error

App Files Files Community

kanyekuthi committed on Nov 7, 2023

Commit

20960bc

1 Parent(s): bb3d9e1

Update app.py

Browse files

Files changed (1) hide show

app.py +336 -27

app.py CHANGED Viewed

@@ -1,37 +1,346 @@
-import os
-# os.system("pip install git+https://github.com/openai/whisper.git")
 # import gradio as gr
-# import whisper
-# from huggingface_hub import from_pretrained_keras
-# from transformers import AutoTokenizer, AutoModelForSequenceClassification
-# from transformers import pipeline
-# from sklearn.preprocessing import StandardScaler
-# import logging
-# import librosa
-# import numpy as np
-# import pickle
 import gradio as gr
-# gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny").launch()
-# gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
-# gr.launch()
-distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
-transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)
-demo = gr.Interface(
-    # main_note,
-    gr.Audio(sources=["microphone"]),
-    # gr.Label(num_top_classes=4),
-    # examples=[
-    #     [os.path.join(os.path.dirname(__file__),"audio/recording1.wav")],
-    #     [os.path.join(os.path.dirname(__file__),"audio/cantina.wav")],
-    # ],
-    outputs=[distil_transcription, transcription]
 )
-demo.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
 if __name__ == "__main__":
-    demo.launch()

+# import os
+# # os.system("pip install git+https://github.com/openai/whisper.git")
+# # import gradio as gr
+# # import whisper
+# # from huggingface_hub import from_pretrained_keras
+# # from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# # from transformers import pipeline
+# # from sklearn.preprocessing import StandardScaler
+# # import logging
+# # import librosa
+# # import numpy as np
+# # import pickle
 # import gradio as gr
+# # gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny").launch()
+# # gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
+# # gr.launch()
+# distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
+# transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)
+# demo = gr.Interface(
+#     # main_note,
+#     gr.Audio(sources=["microphone"]),
+#     # gr.Label(num_top_classes=4),
+#     # examples=[
+#     #     [os.path.join(os.path.dirname(__file__),"audio/recording1.wav")],
+#     #     [os.path.join(os.path.dirname(__file__),"audio/cantina.wav")],
+#     # ],
+#     outputs=[distil_transcription, transcription]
+# )
+# demo.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
+# if __name__ == "__main__":
+#     demo.launch()
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers.utils import is_flash_attn_2_available
+from transformers.pipelines.audio_utils import ffmpeg_read
+import torch
+import gradio as gr
+import time
+BATCH_SIZE = 16
+MAX_AUDIO_MINS = 30  # maximum audio input in minutes
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+use_flash_attention_2 = is_flash_attn_2_available()
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    "openai/whisper-small", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
+)
+distilled_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    "distil-whisper/distil-small", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
+)
+if not use_flash_attention_2:
+    # use flash attention from pytorch sdpa
+    model = model.to_bettertransformer()
+    distilled_model = distilled_model.to_bettertransformer()
+processor = AutoProcessor.from_pretrained("openai/whisper-small")
+model.to(device)
+distilled_model.to(device)
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=30,
+    torch_dtype=torch_dtype,
+    device=device,
+    generate_kwargs={"language": "en", "task": "transcribe"},
+    return_timestamps=True
+)
+pipe_forward = pipe._forward
+distil_pipe = pipeline(
+    "automatic-speech-recognition",
+    model=distilled_model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=15,
+    torch_dtype=torch_dtype,
+    device=device,
+    generate_kwargs={"language": "en", "task": "transcribe"},
+)
+distil_pipe_forward = distil_pipe._forward
+def transcribe(inputs):
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
+    with open(inputs, "rb") as f:
+        inputs = f.read()
+    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+    audio_length_mins = len(inputs) / pipe.feature_extractor.sampling_rate / 60
+    if audio_length_mins > MAX_AUDIO_MINS:
+        raise gr.Error(
+            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes."
+            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
+        )
+    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    def _forward_distil_time(*args, **kwargs):
+        global distil_runtime
+        start_time = time.time()
+        result = distil_pipe_forward(*args, **kwargs)
+        distil_runtime = time.time() - start_time
+        distil_runtime = round(distil_runtime, 2)
+        return result
+    distil_pipe._forward = _forward_distil_time
+    distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
+    yield distil_text, distil_runtime, None, None, None
+    def _forward_time(*args, **kwargs):
+        global runtime
+        start_time = time.time()
+        result = pipe_forward(*args, **kwargs)
+        runtime = time.time() - start_time
+        runtime = round(runtime, 2)
+        return result
+    pipe._forward = _forward_time
+    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+    yield distil_text, distil_runtime, text, runtime
+if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        gr.HTML(
+            """
+                <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+                  <div
+                    style="
+                      display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
+                    "
+                  >
+                    <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+                      Whisper vs Distil-Whisper: Speed Comparison
+                    </h1>
+                  </div>
+                </div>
+            """
+        )
+        gr.HTML(
+            f"""
+            <p><a href="https://huggingface.co/distil-whisper/distil-small"> Distil-Whisper</a> is a distilled variant
+            of the <a href="https://huggingface.co/openai/whisper-small"> Whisper</a> model by OpenAI. Compared to Whisper,
+            Distil-Whisper runs 6x faster with 50% fewer parameters, while performing to within 1% word error rate (WER) on
+            out-of-distribution evaluation data.</p>
+            <p>In this demo, we perform a speed comparison between Whisper and Distil-Whisper in order to test this claim.
+            Both models use the <a href="https://huggingface.co/distil-whisper/distil-small#long-form-transcription"> chunked long-form transcription algorithm</a>
+            in 🤗 Transformers, as well as Flash Attention. To use Distil-Whisper yourself, check the code examples on the
+            <a href="https://github.com/huggingface/distil-whisper#1-usage"> Distil-Whisper repository</a>. To ensure fair
+            usage of the Space, we ask that audio file inputs are kept to < 30 mins.</p>
+            """
+        )
+        audio = gr.components.Audio(type="filepath", label="Audio input")
+        button = gr.Button("Transcribe")
+        with gr.Row():
+            distil_runtime = gr.components.Textbox(label="Distil-Whisper Transcription Time (s)")
+            runtime = gr.components.Textbox(label="Whisper Transcription Time (s)")
+        with gr.Row():
+            distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
+            transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)
+        button.click(
+            fn=transcribe,
+            inputs=audio,
+            outputs=[distil_transcription, distil_runtime, transcription, runtime],
+        )
+        gr.Markdown("## Examples")
+        gr.Examples(
+            [["./assets/example_1.wav"], ["./assets/example_2.wav"]],
+            audio,
+            outputs=[distil_transcription, distil_runtime, transcription, runtime],
+            fn=transcribe,
+            cache_examples=False,
+        )
+    demo.queue(max_size=10).launch()
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers.utils import is_flash_attn_2_available
+from transformers.pipelines.audio_utils import ffmpeg_read
+import torch
 import gradio as gr
+import time
+BATCH_SIZE = 16
+MAX_AUDIO_MINS = 30  # maximum audio input in minutes
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+use_flash_attention_2 = is_flash_attn_2_available()
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    "openai/whisper-small", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
+)
+distilled_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    "distil-whisper/distil-small", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
+)
+if not use_flash_attention_2:
+    # use flash attention from pytorch sdpa
+    model = model.to_bettertransformer()
+    distilled_model = distilled_model.to_bettertransformer()
+processor = AutoProcessor.from_pretrained("openai/whisper-small")
+model.to(device)
+distilled_model.to(device)
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=30,
+    torch_dtype=torch_dtype,
+    device=device,
+    generate_kwargs={"language": "en", "task": "transcribe"},
+    return_timestamps=True
+)
+pipe_forward = pipe._forward
+distil_pipe = pipeline(
+    "automatic-speech-recognition",
+    model=distilled_model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=15,
+    torch_dtype=torch_dtype,
+    device=device,
+    generate_kwargs={"language": "en", "task": "transcribe"},
 )
+distil_pipe_forward = distil_pipe._forward
+def transcribe(inputs):
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
+    with open(inputs, "rb") as f:
+        inputs = f.read()
+    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+    audio_length_mins = len(inputs) / pipe.feature_extractor.sampling_rate / 60
+    if audio_length_mins > MAX_AUDIO_MINS:
+        raise gr.Error(
+            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes."
+            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
+        )
+    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    def _forward_distil_time(*args, **kwargs):
+        global distil_runtime
+        start_time = time.time()
+        result = distil_pipe_forward(*args, **kwargs)
+        distil_runtime = time.time() - start_time
+        distil_runtime = round(distil_runtime, 2)
+        return result
+    distil_pipe._forward = _forward_distil_time
+    distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
+    yield distil_text, distil_runtime, None, None, None
+    def _forward_time(*args, **kwargs):
+        global runtime
+        start_time = time.time()
+        result = pipe_forward(*args, **kwargs)
+        runtime = time.time() - start_time
+        runtime = round(runtime, 2)
+        return result
+    pipe._forward = _forward_time
+    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+    yield distil_text, distil_runtime, text, runtime
 if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        gr.HTML(
+            """
+                <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+                  <div
+                    style="
+                      display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
+                    "
+                  >
+                    <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+                      Whisper vs Distil-Whisper: Speed Comparison
+                    </h1>
+                  </div>
+                </div>
+            """
+        )
+        gr.HTML(
+            f"""
+            <p><a href="https://huggingface.co/distil-whisper/distil-small"> Distil-Whisper</a> is a distilled variant
+            of the <a href="https://huggingface.co/openai/whisper-small"> Whisper</a> model by OpenAI. Compared to Whisper,
+            Distil-Whisper runs 6x faster with 50% fewer parameters, while performing to within 1% word error rate (WER) on
+            out-of-distribution evaluation data.</p>
+            <p>In this demo, we perform a speed comparison between Whisper and Distil-Whisper in order to test this claim.
+            Both models use the <a href="https://huggingface.co/distil-whisper/distil-small#long-form-transcription"> chunked long-form transcription algorithm</a>
+            in 🤗 Transformers, as well as Flash Attention. To use Distil-Whisper yourself, check the code examples on the
+            <a href="https://github.com/huggingface/distil-whisper#1-usage"> Distil-Whisper repository</a>. To ensure fair
+            usage of the Space, we ask that audio file inputs are kept to < 30 mins.</p>
+            """
+        )
+        audio = gr.components.Audio(type="filepath", label="Audio input")
+        button = gr.Button("Transcribe")
+        with gr.Row():
+            distil_runtime = gr.components.Textbox(label="Distil-Whisper Transcription Time (s)")
+            runtime = gr.components.Textbox(label="Whisper Transcription Time (s)")
+        with gr.Row():
+            distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
+            transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)
+        button.click(
+            fn=transcribe,
+            inputs=audio,
+            outputs=[distil_transcription, distil_runtime, transcription, runtime],
+        )
+        gr.Markdown("## Examples")
+        gr.Examples(
+            [["./assets/example_1.wav"], ["./assets/example_2.wav"]],
+            audio,
+            outputs=[distil_transcription, distil_runtime, transcription, runtime],
+            fn=transcribe,
+            cache_examples=False,
+        )
+    demo.queue(max_size=10).launch()