Spaces:
Runtime error
Runtime error
File size: 3,429 Bytes
69cdaf1 b64720b 69cdaf1 fd495b9 ef99bde fd495b9 ef99bde fd495b9 265c64d 69cdaf1 ef99bde fd495b9 ef99bde b64720b ef99bde 69cdaf1 fd495b9 69cdaf1 fd495b9 69cdaf1 fd495b9 ef99bde fd495b9 69cdaf1 4da37e0 fd495b9 69cdaf1 ef99bde fd495b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import gradio as gr
import whisper
from pytube import YouTube
# TODO: extend this code to accept multiple links for batch transcription and to provide the output as downloadable .txt files.
class GradioInference():
    """Callable backend for the UI: fetch a YouTube video's audio and transcribe it.

    An instance keeps one Whisper model loaded at a time (reloading only when a
    different size is requested) and remembers the most recent ``YouTube``
    handle together with the link it was created from.
    """

    def __init__(self):
        """Collect the UI choices and load the default ("base") Whisper model."""
        # Model sizes and language names are taken directly from the whisper package.
        self.sizes = list(whisper._MODELS.keys())
        self.langs = ["none"] + sorted(whisper.tokenizer.LANGUAGES.values())
        self.current_size = "base"
        self.loaded_model = whisper.load_model(self.current_size)
        self.yt = None
        # Link that ``self.yt`` was built from; lets __call__ detect a stale handle.
        self.yt_link = None

    def __call__(self, link, lang, size, subs):
        """Download the audio behind ``link`` and return its transcription.

        Args:
            link: URL of the YouTube video.
            lang: Language name passed to Whisper; the string "none" means
                "let the model decide" and is converted to ``None``.
            size: Whisper model name; a different value than the currently
                loaded one triggers a model reload.
            subs: Output format. ".srt" and ".csv" return timestamped output;
                anything else (including "None" or an unset radio button)
                returns the plain transcript text.

        Returns:
            The transcription as a single string in the requested format.
        """
        # Rebuild the handle when there is none or when it belongs to another
        # link, so a stale video is never transcribed.
        if self.yt is None or self.yt_link != link:
            self.yt = YouTube(link)
            self.yt_link = link
        path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")

        if lang == "none":
            lang = None

        if size != self.current_size:
            self.loaded_model = whisper.load_model(size)
            self.current_size = size

        results = self.loaded_model.transcribe(path, language=lang)

        if subs == ".srt":
            return self.srt(results["segments"])
        if subs == ".csv":
            return self.csv(results["segments"])
        return results["text"]

    def srt(self, segments) -> str:
        """Render segments as numbered SubRip-style entries.

        Each segment must provide 'start', 'end' and 'text'. Entries are
        numbered from 1 and separated by a blank line.
        """
        entries = [
            f"{index}\n"
            f"{self.format_time(segment['start'])} --> {self.format_time(segment['end'])}\n"
            f"{segment['text']}\n\n"
            for index, segment in enumerate(segments, start=1)
        ]
        return "".join(entries)

    def csv(self, segments) -> str:
        """Render segments as ``start,end,text`` rows, one per line.

        The text column is quoted when it contains commas, quotes or line
        breaks so that every row keeps exactly three columns.
        """
        # Imported locally: this method's own name is ``csv``, and the file's
        # import block is left untouched.
        import csv as csv_module
        import io

        buffer = io.StringIO()
        writer = csv_module.writer(buffer, lineterminator="\n")
        for segment in segments:
            # Format the numbers ourselves so they look exactly as in an f-string.
            writer.writerow(
                [f"{segment['start']}", f"{segment['end']}", segment["text"]]
            )
        return buffer.getvalue()

    def format_time(self, time: float) -> str:
        """Format a duration in seconds as ``HH:MM:SS,mmm``.

        The value is rounded to whole milliseconds first; truncating instead
        would turn e.g. 2.3 s into ``00:00:02,299`` because of float error.
        """
        total_ms = int(round(time * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        seconds, milliseconds = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

    def populate_metadata(self, link):
        """Return ``(thumbnail_url, title)`` for ``link`` and cache its handle.

        An empty or whitespace-only link clears the cached handle and returns
        ``(None, None)`` instead of trying to resolve it.
        """
        if not link or not link.strip():
            self.yt = None
            self.yt_link = None
            return None, None
        self.yt = YouTube(link)
        self.yt_link = link
        return self.yt.thumbnail_url, self.yt.title
# Single shared backend instance; it is both the click handler (via __call__)
# and the source of the dropdown choices.
gio = GradioInference()
title = "Youtube Whisperer"
description = "Speech to text transcription of Youtube videos using OpenAI's Whisper"

# Static page header rendered above the controls.
header_html = """
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
<div>
<h1>Youtube Whisperer</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Speech to text transcription of Youtube videos using OpenAI's Whisper
</p>
</div>
"""

# NOTE(review): `gr.Box` and `.style(...)` appear to be Gradio 3.x APIs that
# were removed in Gradio 4 - confirm the pinned gradio version before upgrading.
block = gr.Blocks()
with block:
    gr.HTML(header_html)
    with gr.Group():
        with gr.Box():
            with gr.Row().style(equal_height=True):
                sz = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
                lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
            with gr.Row().style(equal_height=True):
                # Default to "None" so the handler never receives an unset choice.
                wt = gr.Radio(["None", ".srt", ".csv"], label="With Timestamps?", value="None")
                link = gr.Textbox(label="YouTube Link")
                # Named `video_title` so it does not overwrite the `title` string above.
                video_title = gr.Label(label="Video Title")
            with gr.Row().style(equal_height=True):
                img = gr.Image(label="Thumbnail")
                text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
            with gr.Row().style(equal_height=True):
                btn = gr.Button("Transcribe")
            # Transcribe on click; refresh thumbnail and title whenever the link changes.
            btn.click(gio, inputs=[link, lang, sz, wt], outputs=[text])
            link.change(gio.populate_metadata, inputs=[link], outputs=[img, video_title])
block.launch()