nb-whisper-demo

Running on T4

App Files Files Community

pere committed on Oct 8, 2024

Commit

e1a5899

1 Parent(s): f4d4476

update test

Browse files

Files changed (1) hide show

app.py +24 -20

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import time
 import os
 import torch
-import yt_dlp
 import gradio as gr
 import pytube as pt
 import spaces
@@ -25,9 +25,16 @@ print(f"Using device: {device}")
 @spaces.GPU(duration=60 * 2)
 def pipe(file, return_timestamps=False):
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
         chunk_length_s=30,
         device=device,
         token=auth_token,
@@ -39,6 +46,7 @@ def pipe(file, return_timestamps=False):
         task="transcribe",
         no_timestamps=not return_timestamps,
     )
     return asr(file, return_timestamps=return_timestamps, batch_size=24)
 def transcribe(file, return_timestamps=False):
@@ -65,18 +73,14 @@ def _return_yt_html_embed(yt_url):
     return HTML_str
-@spaces.GPU
-def yt_transcribe(yt_url, task):
     html_embed_str = _return_yt_html_embed(yt_url)
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        filepath = os.path.join(tmpdirname, "audio.mp3")
-        download_yt_audio(yt_url, filepath)
-        inputs = ffmpeg_read(filepath, pipe.feature_extractor.sampling_rate)
-        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
     return html_embed_str, text
@@ -85,11 +89,11 @@ demo = gr.Blocks()
 mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.Audio(sources="microphone", type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs="text",
-    title="NB-Whisper Demo",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
@@ -101,27 +105,27 @@ mf_transcribe = gr.Interface(
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[
-        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=["html", "text"],
     title="Whisper Demo: Transcribe YouTube",
     description=(
         "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
         " arbitrary length."
     ),
     allow_flagging="never",
 )
 with demo:
     gr.TabbedInterface([
         mf_transcribe,
-        yt_transcribe
     ], [
-        "Transkriber Lyd",
-        "Transkriber YouTube"
     ])
 demo.launch(share=share).queue()

 import os
 import torch
 import gradio as gr
 import pytube as pt
 import spaces
 @spaces.GPU(duration=60 * 2)
 def pipe(file, return_timestamps=False):
+    # model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+    # model.to(device)
+    # processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+    # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+    # model.generation_config.cache_implementation = "static"
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
+        # tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
+        # feature_extractor=AutoFeatureExtractor.from_pretrained(MODEL_NAME),
         chunk_length_s=30,
         device=device,
         token=auth_token,
         task="transcribe",
         no_timestamps=not return_timestamps,
     )
+    # asr.model.config.no_timestamps_token_id = asr.tokenizer.encode("<|notimestamps|>", add_special_tokens=False)[0]
     return asr(file, return_timestamps=return_timestamps, batch_size=24)
 def transcribe(file, return_timestamps=False):
     return HTML_str
+def yt_transcribe(yt_url, return_timestamps=False):
+    yt = pt.YouTube(yt_url)
     html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename="audio.mp3")
+    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
     return html_embed_str, text
 mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
+        gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
+        gr.components.Checkbox(label="Return timestamps"),
     ],
     outputs="text",
+    title="NB-Whisper",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[
+        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
+        gr.components.Checkbox(label="Return timestamps"),
     ],
+    examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
     outputs=["html", "text"],
     title="Whisper Demo: Transcribe YouTube",
     description=(
         "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
+        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
         " arbitrary length."
     ),
     allow_flagging="never",
 )
 with demo:
     gr.TabbedInterface([
         mf_transcribe,
+        # yt_transcribe
     ], [
+        "Transcribe Audio",
+        # "Transcribe YouTube"
     ])
 demo.launch(share=share).queue()