LucFast committed
Commit 3d11acf · 1 Parent(s): 3d38885

update with transcription
Files changed (2):
  1. app.py +134 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,137 @@
 import gradio as gr
 
-def greet(name):
-    return "Hello " + name + "!!"
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
 import gradio as gr
+import os
+import whisper
+from pytube import YouTube
+from yt_dlp import YoutubeDL
 
+class GradioInference():
+    def __init__(self):
+        self.sizes = list(whisper._MODELS.keys())
+        self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
+        self.current_size = "base"
+        self.loaded_model = whisper.load_model(self.current_size)
+        self.yt = None  # set by populate_metadata() once a link is entered
+
+    def download_videos(self, link):
+        """Download a video's audio track to tmp.wav via yt-dlp.
 
+        Args:
+            link (str): URL of the video to retrieve
+        """
+        ydl_opts = {
+            "format": "m4a/bestaudio/best",
+            "postprocessors": [
+                {  # Extract audio using ffmpeg
+                    "key": "FFmpegExtractAudio",
+                    "preferredcodec": "wav",
+                }
+            ],
+            # let yt-dlp name the download; the postprocessor then emits tmp.wav
+            "outtmpl": "tmp.%(ext)s",
+        }
+
+        with YoutubeDL(ydl_opts) as ydl:
+            ydl.download([link])  # yt-dlp expects a list of URLs
+        return "tmp.wav"
+
+    def detect_lang(self):
+        # load audio and pad/trim it to fit 30 seconds
+        audio = whisper.load_audio("tmp.wav")
+        audio_segment = whisper.pad_or_trim(audio)
+
+        # make log-Mel spectrogram and move to the same device as the model
+        mel = whisper.log_mel_spectrogram(audio_segment).to(self.loaded_model.device)
+
+        # detect the spoken language
+        _, probs = self.loaded_model.detect_language(mel)
+        language = max(probs, key=probs.get)
+
+        return language
+
+    def __call__(self, link, lang, size, subs):
+        # fetch the audio for the current link before transcribing
+        self.download_videos(link)
+
+        if size != self.current_size:
+            self.loaded_model = whisper.load_model(size)
+            self.current_size = size
+
+        if lang == "none":
+            lang = self.detect_lang()
+
+        # decoding options are assembled here, though the transcribe call
+        # below currently only passes the language through
+        options = whisper.DecodingOptions().__dict__.copy()
+        options["language"] = lang
+        options["beam_size"] = 5
+        options["best_of"] = 5
+        del options["task"]
+        transcribe_options = dict(task="transcribe", **options)
+        translate_options = dict(task="translate", **options)
+        results = self.loaded_model.transcribe("tmp.wav", language=lang)
+
+        if subs == "None":
+            return results["text"]
+        elif subs == ".srt":
+            return self.srt(results["segments"])
+        elif subs == ".csv":
+            return self.csv(results["segments"])
+
+    def srt(self, segments):
+        # SubRip format: index, "start --> end" time range, text, blank line
+        output = ""
+        for i, segment in enumerate(segments):
+            output += f"{i+1}\n"
+            output += f"{self.format_time(segment['start'])} --> {self.format_time(segment['end'])}\n"
+            output += f"{segment['text']}\n\n"
+        return output
+
+    def csv(self, segments):
+        output = ""
+        for segment in segments:
+            output += f"{segment['start']},{segment['end']},{segment['text']}\n"
+        return output
+
+    def format_time(self, time):
+        hours = time // 3600
+        minutes = (time - hours * 3600) // 60
+        seconds = time - hours * 3600 - minutes * 60
+        milliseconds = (time - int(time)) * 1000
+        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
+
+    def populate_metadata(self, link):
+        self.yt = YouTube(link)
+        return self.yt.thumbnail_url, self.yt.title
+
+gio = GradioInference()
+title = "Youtube Whisperer"
+description = "Speech to text transcription of Youtube videos using OpenAI's Whisper"
+
+block = gr.Blocks()
+with block:
+    gr.HTML(
+        """
+        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
+          <div>
+            <h1>Youtube Whisperer</h1>
+          </div>
+          <p style="margin-bottom: 10px; font-size: 94%">
+            Speech to text transcription of Youtube videos using OpenAI's Whisper
+          </p>
+        </div>
+        """
+    )
+    with gr.Group():
+        with gr.Box():
+            with gr.Row().style(equal_height=True):
+                sz = gr.Dropdown(label="Model Size", choices=gio.sizes, value="base")
+                lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
+            with gr.Row().style(equal_height=True):
+                wt = gr.Radio(["None", ".srt", ".csv"], label="With Timestamps?")
+            link = gr.Textbox(label="YouTube Link")
+            title = gr.Label(label="Video Title")
+            with gr.Row().style(equal_height=True):
+                img = gr.Image(label="Thumbnail")
+                text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
+            with gr.Row().style(equal_height=True):
+                btn = gr.Button("Transcribe")
+            btn.click(gio, inputs=[link, lang, sz, wt], outputs=[text])
+            link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
+block.launch()
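
(As a sanity check on the format_time arithmetic above: format_time(3661.5) yields "01:01:01,500", i.e. 1 hour, 1 minute, 1.5 seconds, in the HH:MM:SS,mmm form SubRip expects.)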
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 git+https://github.com/openai/whisper.git
-yt-dlp
+yt-dlp
+pytube
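
For a quick local check of the pipeline this commit adds, without the Gradio UI, a minimal sketch (the URL is a placeholder; ffmpeg must be on PATH for the wav postprocessor, and note that importing app.py directly would also start the UI via block.launch()):

# Hypothetical smoke test mirroring the download + transcribe flow above.
# Assumes the packages from requirements.txt are installed.
import whisper
from yt_dlp import YoutubeDL

ydl_opts = {
    "format": "m4a/bestaudio/best",
    "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
    "outtmpl": "tmp.%(ext)s",
}
with YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=EXAMPLE"])  # placeholder URL

model = whisper.load_model("base")
print(model.transcribe("tmp.wav")["text"])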