AI-DHD committed
Commit fd495b9 · 1 parent: c45ec11

Update app.py

Files changed (1):
  1. app.py +64 -38
app.py CHANGED
@@ -3,32 +3,63 @@ import whisper
 from pytube import YouTube
 import os
 
-loaded_model = whisper.load_model("base")
-current_size = 'base'
+class GradioInference():
+    def __init__(self):
+        self.sizes = list(whisper._MODELS.keys())
+        self.file = "Upload audio/video"
+        self.current_size = "base"
+        self.loaded_model = whisper.load_model(self.current_size)
+        self.yt = None
+
+    def __call__(self, link, file, size, subs):
+        if self.yt is None:
+            self.yt = YouTube(link)
+        path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
 
-def inference(link):
-    if link.startswith('https://'):
-        yt = YouTube(link)
-        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
-    else:
-        path = link
-    options = whisper.DecodingOptions(without_timestamps=True)
-    results = loaded_model.transcribe(path)
-    return results['text']
+        if self.file is None:
+            self.file = path
 
-def change_model(size):
-    if size == current_size:
-        return
-    loaded_model = whisper.load_model(size)
-    current_size = size
+        if size != self.current_size:
+            self.loaded_model = whisper.load_model(size)
+            self.current_size = size
+        results = self.loaded_model.transcribe(path)
 
-def populate_metadata(link):
-    if link.startswith('https://'):
-        yt = YouTube(link)
-        return yt.thumbnail_url, yt.title
+        if subs == "None":
+            return results["text"]
+        elif subs == ".srt":
+            return self.srt(results["segments"])
+        elif subs == ".csv":
+            return self.csv(results["segments"])
+
+    def srt(self, segments):
+        output = ""
+        for i, segment in enumerate(segments):
+            output += f"{i+1}\n"
+            output += f"{self.format_time(segment['start'])} --> {self.format_time(segment['end'])}\n"
+            output += f"{segment['text']}\n\n"
+        return output
+
+    def csv(self, segments):
+        output = ""
+        for segment in segments:
+            output += f"{segment['start']},{segment['end']},{segment['text']}\n"
+        return output
 
+    def format_time(self, time):
+        hours = time//3600
+        minutes = (time - hours*3600)//60
+        seconds = time - hours*3600 - minutes*60
+        milliseconds = (time - int(time))*1000
+        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
+
+    def populate_metadata(self, link):
+        self.yt = YouTube(link)
+        return self.yt.thumbnail_url, self.yt.title
+
+gio = GradioInference()
 title="Youtube Whisperer"
 description="Speech to text transcription of Youtube videos using OpenAI's Whisper"
+
 block = gr.Blocks()
 
 with block:
@@ -46,23 +77,18 @@ with block:
     )
    with gr.Group():
        with gr.Box():
-            sz = gr.Dropdown(label="Model Size", choices=['base','small', 'medium', 'large'], value='base')
-
-            link = gr.File(label="YouTube Link or Upload a Video")
-
-            with gr.Row().style(mobile_collapse=False, equal_height=True):
-                title = gr.Label(label="Video Title", placeholder="Title")
+            with gr.Row().style(equal_height=True):
+                sz = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
+                file = gr.File(label="Upload Video/Audio")
+            with gr.Row().style(equal_height=True):
+                wt = gr.Radio(["None", ".srt", ".csv"], label="With Timestamps?")
+                link = gr.Textbox(label="YouTube Link")
+                title = gr.Label(label="Video Title")
+            with gr.Row().style(equal_height=True):
                 img = gr.Image(label="Thumbnail")
-            text = gr.Textbox(
-                label="Transcription",
-                placeholder="Transcription Output",
-                lines=5)
-            with gr.Row().style(mobile_collapse=False, equal_height=True):
+                text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
+            with gr.Row().style(equal_height=True):
                 btn = gr.Button("Transcribe")
-
-            # Events
-            btn.click(inference, inputs=[link], outputs=[text])
-            link.change(populate_metadata, inputs=[link], outputs=[img, title])
-            sz.change(change_model, inputs=[sz], outputs=[])
-
-block.launch(debug=True)
+            btn.click(gio, inputs=[link, file, sz, wt], outputs=[text])
+            link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
+block.launch()
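The new srt() and format_time() methods turn Whisper's per-segment start/end times (floating-point seconds) into numbered SubRip cues with HH:MM:SS,mmm timestamps. Below is a minimal standalone sketch of that conversion, assuming module-level helpers and invented sample segments standing in for loaded_model.transcribe(path)["segments"]; it is an illustration, not the Space's own code.

```python
# Illustration only: the sample segment dicts are made up; real ones come
# from whisper's model.transcribe(...)["segments"].

def format_time(t: float) -> str:
    """Render seconds as an SRT timestamp, e.g. 3661.5 -> '01:01:01,500'."""
    hours = int(t // 3600)
    minutes = int((t % 3600) // 60)
    seconds = int(t % 60)
    milliseconds = int((t - int(t)) * 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

def to_srt(segments) -> str:
    """Emit one numbered SubRip cue per segment, separated by blank lines."""
    blocks = []
    for i, seg in enumerate(segments, start=1):
        blocks.append(
            f"{i}\n{format_time(seg['start'])} --> {format_time(seg['end'])}\n{seg['text'].strip()}"
        )
    return "\n\n".join(blocks) + "\n"

sample = [
    {"start": 0.0, "end": 2.5, "text": " Hello and welcome."},
    {"start": 2.5, "end": 5.75, "text": " Today we transcribe a video."},
]
print(to_srt(sample))
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello and welcome.
#
# 2
# 00:00:02,500 --> 00:00:05,750
# Today we transcribe a video.
```

The commit also swaps the module-level functions for a callable class, so the loaded Whisper model persists between clicks and btn.click can register the instance itself as the handler. A stripped-down sketch of that pattern, using a toy Echo class rather than the Space's GradioInference:

```python
import gradio as gr

class Echo:
    """Toy callable handler: instance state survives across events."""
    def __init__(self):
        self.calls = 0

    def __call__(self, text):
        self.calls += 1
        return f"call #{self.calls}: {text}"

echo = Echo()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    # The instance is passed directly as the event handler; Gradio invokes __call__.
    gr.Button("Run").click(echo, inputs=[inp], outputs=[out])

# demo.launch()  # uncomment to serve the demo locally
```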