LucFast committed
Commit a2d7dcb · 1 Parent(s): c357dd1

update audio path

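In short: download_videos now returns the path of the extracted audio file, and detect_lang and transcribe consume that path instead of each re-building f"{os.path.curdir}/tmp.wav" on their own. As a rough standalone sketch of the detection step (assuming the usual openai-whisper API; the detect_language call and the final max() are assumptions, since that part of detect_lang is outside the diff):

import whisper

model = whisper.load_model("base")

def detect_lang(path):
    # load audio and pad/trim it to fit 30 seconds (as in the diff)
    audio = whisper.load_audio(path)
    audio_segment = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio_segment).to(model.device)

    # assumption: pick the most probable language from Whisper's detector
    _, probs = model.detect_language(mel)
    return max(probs, key=probs.get)

# hypothetical path; in the app it is whatever download_videos() returns
print(detect_lang("./tmp.wav"))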
Files changed (1):
  app.py  +8 -5
app.py CHANGED
@@ -10,6 +10,7 @@ class GradioInference():
         self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.current_size = "base"
         self.loaded_model = whisper.load_model(self.current_size)
+        self.yt = None
 
     def download_videos(link):
         """Specify the yt-dlp parameters
@@ -32,10 +33,11 @@ class GradioInference():
         with YoutubeDL(ydl_opts) as ydl:
             ydl.download(link)
 
+        return f"{os.path.curdir}/tmp.wav"
 
-    def detect_lang(self):
+    def detect_lang(self, path):
         # load audio and pad/trim it to fit 30 seconds
-        audio = whisper.load_audio(f"{os.path.curdir}/tmp.wav")
+        audio = whisper.load_audio(path)
         audio_segment = whisper.pad_or_trim(audio)
 
         # make log-Mel spectrogram and move to the same device as the model
@@ -49,14 +51,15 @@ class GradioInference():
 
     def __call__(self, link, lang, size, subs):
         if self.yt is None:
-            self.download_videos(link)
+            self.yt = YouTube(link)
+            path = self.download_videos(link)
 
         if size != self.current_size:
             self.loaded_model = whisper.load_model(size)
             self.current_size = size
 
         if lang == "none":
-            lang = self.detect_lang()
+            lang = self.detect_lang(path)
 
         options = whisper.DecodingOptions().__dict__.copy()
         options["language"] = lang
@@ -65,7 +68,7 @@ class GradioInference():
         del options["task"]
         transcribe_options = dict(task="transcribe", **options)
         translate_options = dict(task="translate", **options)
-        results = self.loaded_model.transcribe(f"{os.path.curdir}/tmp.wav", **transcribe_options)
+        results = self.loaded_model.transcribe(path, **transcribe_options)
 
         if subs == "None":
             return results["text"]
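Taken together, the updated __call__ flow is: download the audio once, keep the returned path, and pass that same path to both language detection and transcription. A minimal end-to-end sketch under those assumptions (the ydl_opts below are illustrative, not the app's actual options, ffmpeg is assumed to be installed for the wav extraction, and the URL is a placeholder):

import os
import whisper
from yt_dlp import YoutubeDL

def download_audio(link):
    # illustrative yt-dlp options; the app's real ydl_opts live in download_videos()
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": "tmp",
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([link])
    # the diff returns the extracted file relative to the current directory
    return f"{os.path.curdir}/tmp.wav"

model = whisper.load_model("base")
path = download_audio("https://www.youtube.com/watch?v=...")  # placeholder URL
results = model.transcribe(path, task="transcribe")
print(results["text"])

The point of the commit is simply that the tmp.wav location is computed in one place and passed around as a value, so changing the output template later only touches download_videos.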