LucFast committed
Commit a2d7dcb · 1 Parent(s): c357dd1

update audio path

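In short: download_videos now returns the path of the extracted audio file, and detect_lang and transcribe consume that path instead of each re-building f"{os.path.curdir}/tmp.wav" on their own. As a rough standalone sketch of the detection step (assuming the usual openai-whisper API; the detect_language call and the final max() are assumptions, since that part of detect_lang is outside the diff):

import whisper

model = whisper.load_model("base")

def detect_lang(path):
    # load audio and pad/trim it to fit 30 seconds (as in the diff)
    audio = whisper.load_audio(path)
    audio_segment = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio_segment).to(model.device)

    # assumption: pick the most probable language from Whisper's detector
    _, probs = model.detect_language(mel)
    return max(probs, key=probs.get)

# hypothetical path; in the app it is whatever download_videos() returns
print(detect_lang("./tmp.wav"))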
Files changed (1):
  app.py  +8 -5
app.py CHANGED
@@ -10,6 +10,7 @@ class GradioInference():
         self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.current_size = "base"
         self.loaded_model = whisper.load_model(self.current_size)
+        self.yt = None
 
     def download_videos(link):
         """Specify the yt-dlp parameters
@@ -32,10 +33,11 @@ class GradioInference():
         with YoutubeDL(ydl_opts) as ydl:
             ydl.download(link)
 
+        return f"{os.path.curdir}/tmp.wav"
 
-    def detect_lang(self):
+    def detect_lang(self, path):
         # load audio and pad/trim it to fit 30 seconds
-        audio = whisper.load_audio(f"{os.path.curdir}/tmp.wav")
+        audio = whisper.load_audio(path)
         audio_segment = whisper.pad_or_trim(audio)
 
         # make log-Mel spectrogram and move to the same device as the model
@@ -49,14 +51,15 @@ class GradioInference():
 
     def __call__(self, link, lang, size, subs):
         if self.yt is None:
-            self.download_videos(link)
+            self.yt = YouTube(link)
+            path = self.download_videos(link)
 
         if size != self.current_size:
             self.loaded_model = whisper.load_model(size)
             self.current_size = size
 
         if lang == "none":
-            lang = self.detect_lang()
+            lang = self.detect_lang(path)
 
         options = whisper.DecodingOptions().__dict__.copy()
         options["language"] = lang
@@ -65,7 +68,7 @@ class GradioInference():
         del options["task"]
         transcribe_options = dict(task="transcribe", **options)
         translate_options = dict(task="translate", **options)
-        results = self.loaded_model.transcribe(f"{os.path.curdir}/tmp.wav", **transcribe_options)
+        results = self.loaded_model.transcribe(path, **transcribe_options)
 
         if subs == "None":
             return results["text"]
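Taken together, the updated __call__ flow is: download the audio once, keep the returned path, and pass that same path to both language detection and transcription. A minimal end-to-end sketch under those assumptions (the ydl_opts below are illustrative, not the app's actual options, ffmpeg is assumed to be installed for the wav extraction, and the URL is a placeholder):

import os
import whisper
from yt_dlp import YoutubeDL

def download_audio(link):
    # illustrative yt-dlp options; the app's real ydl_opts live in download_videos()
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": "tmp",
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([link])
    # the diff returns the extracted file relative to the current directory
    return f"{os.path.curdir}/tmp.wav"

model = whisper.load_model("base")
path = download_audio("https://www.youtube.com/watch?v=...")  # placeholder URL
results = model.transcribe(path, task="transcribe")
print(results["text"])

The point of the commit is simply that the tmp.wav location is computed in one place and passed around as a value, so changing the output template later only touches download_videos.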