Spaces:

keelezibel
/

WhisperTT

Runtime error

App Files Files Community

LucFast committed on Jan 13, 2023

Commit

3d11acf

1 Parent(s): 3d38885

update with transcription

Browse files

Files changed (2) hide show

app.py +134 -4
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,7 +1,137 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+import os
+import whisper
+from pytube import YouTube
+from yt_dlp import YoutubeDL
+class GradioInference:
+    """Callable Gradio handler: download a video's audio track and transcribe it with Whisper.
+
+    An instance is used directly as the click callback (see ``__call__``) and
+    also serves the thumbnail/title preview through ``populate_metadata``.
+    """
+    # Single shared scratch file written by yt-dlp and read by Whisper.
+    # NOTE(review): concurrent requests overwrite each other's audio here.
+    AUDIO_PATH = "tmp.wav"
+    def __init__(self):
+        """Collect the selectable model sizes/languages and load the default model."""
+        self.sizes = list(whisper._MODELS.keys())
+        self.langs = ["none"] + sorted(whisper.tokenizer.LANGUAGES.values())
+        self.current_size = "base"
+        self.loaded_model = whisper.load_model(self.current_size)
+        # Set by populate_metadata(); initialised so reads never raise AttributeError.
+        self.yt = None
+    @staticmethod
+    def download_videos(link):
+        """Download the audio of ``link`` with yt-dlp and convert it to WAV.
+
+        Declared static so that both ``self.download_videos(link)`` and
+        ``GradioInference.download_videos(link)`` work; the original lacked
+        ``self`` and failed when invoked on an instance.
+
+        Args:
+            link (str): URL of the video to retrieve.
+
+        Returns:
+            str: path of the extracted WAV file.
+        """
+        ydl_opts = {
+            "format": "m4a/bestaudio/best",
+            "postprocessors": [
+                {  # Extract audio using ffmpeg
+                    "key": "FFmpegExtractAudio",
+                    "preferredcodec": "wav",
+                }
+            ],
+            "outtmpl": GradioInference.AUDIO_PATH,
+        }
+        with YoutubeDL(ydl_opts) as ydl:
+            # download() takes a list of URLs; pass one explicitly.
+            ydl.download([link])
+        return GradioInference.AUDIO_PATH
+    def detect_lang(self, audio_path=AUDIO_PATH):
+        """Return the most probable language code for the start of the audio.
+
+        Args:
+            audio_path (str): audio file to inspect; defaults to the shared
+                scratch file written by ``download_videos``.
+        """
+        # load audio and pad/trim it to fit 30 seconds
+        audio = whisper.load_audio(audio_path)
+        audio_segment = whisper.pad_or_trim(audio)
+        # make log-Mel spectrogram and move to the same device as the model
+        mel = whisper.log_mel_spectrogram(audio_segment).to(self.loaded_model.device)
+        # detect the spoken language
+        _, probs = self.loaded_model.detect_language(mel)
+        return max(probs, key=probs.get)
+    def __call__(self, link, lang, size, subs):
+        """Transcribe the video at ``link`` and return the text in the requested format.
+
+        Args:
+            link (str): video URL.
+            lang (str): language to decode in, or ``"none"`` to auto-detect.
+            size (str): Whisper model size; the model is reloaded only on change.
+            subs (str | None): ``".srt"`` or ``".csv"`` for timestamped output;
+                anything else (including ``"None"``/``None``) yields plain text.
+
+        Returns:
+            str: the transcription.
+
+        Raises:
+            ValueError: if ``link`` is empty.
+        """
+        if not link or not link.strip():
+            raise ValueError("A YouTube link is required")
+        # Always fetch the audio for the requested link. The original only
+        # downloaded while self.yt was None, so once metadata had been
+        # populated it transcribed whatever stale tmp.wav was on disk.
+        audio_path = self.download_videos(link)
+        if size != self.current_size:
+            self.loaded_model = whisper.load_model(size)
+            self.current_size = size
+        if lang == "none":
+            lang = self.detect_lang(audio_path)
+        results = self.loaded_model.transcribe(audio_path, language=lang)
+        if subs == ".srt":
+            return self.srt(results["segments"])
+        if subs == ".csv":
+            return self.csv(results["segments"])
+        return results["text"]
+    def srt(self, segments):
+        """Render ``segments`` (dicts with ``start``/``end``/``text``) as SubRip text."""
+        return "".join(
+            f"{index}\n"
+            f"{self.format_time(segment['start'])} --> {self.format_time(segment['end'])}\n"
+            f"{segment['text']}\n\n"
+            for index, segment in enumerate(segments, start=1)
+        )
+    def csv(self, segments):
+        """Render ``segments`` as ``start,end,text`` rows, one per line.
+
+        Text containing commas, quotes or newlines is quoted so each row
+        still parses as exactly three columns.
+        """
+        # Local imports under aliases: this method's own name is ``csv``.
+        import csv as csv_module
+        import io
+        buffer = io.StringIO()
+        writer = csv_module.writer(buffer, lineterminator="\n")
+        for segment in segments:
+            writer.writerow([segment["start"], segment["end"], segment["text"]])
+        return buffer.getvalue()
+    def format_time(self, time):
+        """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
+        # Round once to whole milliseconds; truncating the float remainder
+        # turned values such as 1.001 into ",000".
+        total_ms = int(round(time * 1000))
+        hours, remainder = divmod(total_ms, 3_600_000)
+        minutes, remainder = divmod(remainder, 60_000)
+        seconds, milliseconds = divmod(remainder, 1000)
+        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+    def populate_metadata(self, link):
+        """Return ``(thumbnail_url, title)`` for ``link`` and remember the video.
+
+        Runs on every edit of the link box, so an empty or not-yet-valid URL
+        clears the preview instead of raising.
+        """
+        from pytube.exceptions import PytubeError
+        if not link or not link.strip():
+            self.yt = None
+            return None, ""
+        try:
+            self.yt = YouTube(link)
+        except PytubeError:
+            self.yt = None
+            return None, ""
+        return self.yt.thumbnail_url, self.yt.title
+# Shared inference handler; constructing it loads the default Whisper model.
+gio = GradioInference()
+title = "Youtube Whisperer"
+description = "Speech to text transcription of Youtube videos using OpenAI's Whisper"
+block = gr.Blocks()
+with block:
+    # Page header, built from the strings above so the text lives in one place.
+    gr.HTML(
+        f"""
+            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
+              <div>
+                <h1>{title}</h1>
+              </div>
+              <p style="margin-bottom: 10px; font-size: 94%">
+                {description}
+              </p>
+            </div>
+        """
+    )
+    with gr.Group():
+        with gr.Box():
+            with gr.Row().style(equal_height=True):
+                sz = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
+                lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
+            with gr.Row().style(equal_height=True):
+                # Default to plain text so the callback never receives an unset choice.
+                wt = gr.Radio(["None", ".srt", ".csv"], label="With Timestamps?", value="None")
+            link = gr.Textbox(label="YouTube Link")
+            # Named video_title so it does not rebind the page-title string above.
+            video_title = gr.Label(label="Video Title")
+            with gr.Row().style(equal_height=True):
+                img = gr.Image(label="Thumbnail")
+                text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
+            with gr.Row().style(equal_height=True):
+                btn = gr.Button("Transcribe")
+            # Transcribe on click; refresh thumbnail and title whenever the link changes.
+            btn.click(gio, inputs=[link, lang, sz, wt], outputs=[text])
+            link.change(gio.populate_metadata, inputs=[link], outputs=[img, video_title])
+block.launch()

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 git+https://github.com/openai/whisper.git
-yt-dlp

 git+https://github.com/openai/whisper.git
+yt-dlp
+pytube