File size: 3,429 Bytes
69cdaf1
 
 
b64720b
69cdaf1
fd495b9
 
 
ef99bde
fd495b9
 
 
 
ef99bde
fd495b9
 
265c64d
69cdaf1
ef99bde
 
 
fd495b9
 
 
ef99bde
 
 
 
 
 
b64720b
ef99bde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69cdaf1
fd495b9
 
 
 
 
 
 
 
 
 
 
 
69cdaf1
 
fd495b9
69cdaf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd495b9
 
ef99bde
 
 
fd495b9
 
 
69cdaf1
4da37e0
fd495b9
69cdaf1
ef99bde
fd495b9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import whisper
from pytube import YouTube
# TODO: Please modify this code to allow multiple links to be uploaded for batch processing, and change the output to downloadable .txt files.

class GradioInference():
    """Callable that transcribes the audio track of a YouTube video with Whisper.

    An instance keeps one loaded Whisper model (reloaded only when a different
    size is requested) and one cached ``YouTube`` handle together with the link
    it was built from, so a new link never reuses a stale video.
    """

    def __init__(self):
        """Load the default ("base") model and collect the UI choices."""
        # Choices offered by the "Model Size" and "Language" dropdowns.
        self.sizes = list(whisper._MODELS.keys())
        self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
        self.current_size = "base"
        self.loaded_model = whisper.load_model(self.current_size)
        # Cached video handle and the link it belongs to.
        self.yt = None
        self._yt_link = None

    def _video_for(self, link):
        """Return a ``YouTube`` handle for *link*, rebuilding it if the link changed."""
        # getattr keeps this safe for instances whose ``yt`` was set externally
        # without the companion ``_yt_link`` attribute.
        if self.yt is None or getattr(self, "_yt_link", None) != link:
            self.yt = YouTube(link)
            self._yt_link = link
        return self.yt

    def __call__(self, link, lang, size, subs):
        """Download the audio of *link* and return its transcription as a string.

        Args:
            link: URL of the YouTube video.
            lang: Language name passed to Whisper; "none" lets Whisper decide.
            size: Whisper model size; the model is reloaded only when it
                differs from the currently loaded one.
            subs: Output format: ".srt" or ".csv" for timestamped output;
                anything else (including "None" or no selection) returns the
                plain transcription text.

        Returns:
            The transcription in the requested format.
        """
        video = self._video_for(link)
        path = video.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")

        if lang == "none":
            lang = None

        if size != self.current_size:
            self.loaded_model = whisper.load_model(size)
            self.current_size = size
        results = self.loaded_model.transcribe(path, language=lang)

        if subs == ".srt":
            return self.srt(results["segments"])
        if subs == ".csv":
            return self.csv(results["segments"])
        # "None" or an unselected radio button: plain text without timestamps.
        return results["text"]

    def srt(self, segments):
        """Render *segments* (dicts with 'start', 'end', 'text') as SRT text."""
        entries = [
            f"{index}\n"
            f"{self.format_time(segment['start'])} --> {self.format_time(segment['end'])}\n"
            f"{segment['text']}\n\n"
            for index, segment in enumerate(segments, start=1)
        ]
        return "".join(entries)

    def csv(self, segments):
        """Render *segments* as ``start,end,text`` CSV rows.

        Text containing commas, quotes or newlines is quoted so every row
        keeps exactly three columns.
        """
        # Local imports: the module name would otherwise read like this method.
        import csv as csv_module
        import io

        buffer = io.StringIO()
        writer = csv_module.writer(buffer, lineterminator="\n")
        for segment in segments:
            writer.writerow([segment["start"], segment["end"], segment["text"]])
        return buffer.getvalue()

    def format_time(self, time):
        """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
        # Work in whole milliseconds; rounding (not truncating) avoids float
        # artefacts such as 1.001 s being rendered with ",000".
        total_ms = int(round(float(time) * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        seconds, milliseconds = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

    def populate_metadata(self, link):
        """Cache the video for *link* and return ``(thumbnail_url, title)``.

        Returns ``(None, None)`` for an empty link (e.g. a cleared textbox)
        instead of raising.
        """
        if not link or not link.strip():
            return None, None
        self.yt = YouTube(link)
        self._yt_link = link
        return self.yt.thumbnail_url, self.yt.title

# Build the inference object once at import time; its __init__ loads the
# default Whisper model.
gio = GradioInference()
# NOTE: `title` is rebound to a gr.Label component further down, and
# `description` is not referenced again in this script; the page header
# below hard-codes the same two strings in HTML.
title="Youtube Whisperer"
description="Speech to text transcription of Youtube videos using OpenAI's Whisper"

# Declarative UI: components are created in layout order inside the nested
# context managers.
block = gr.Blocks()
with block:
    # Static page header (title + description).
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Youtube Whisperer</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                Speech to text transcription of Youtube videos using OpenAI's Whisper
              </p>
            </div>
        """
    )
    with gr.Group():
        with gr.Box():
          # Transcription options: model size and (optional) language.
          with gr.Row().style(equal_height=True):
            sz = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
            lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
          # Output format selector; no default value is given here.
          with gr.Row().style(equal_height=True):
            wt = gr.Radio(["None", ".srt", ".csv"], label="With Timestamps?")
          link = gr.Textbox(label="YouTube Link")
          title = gr.Label(label="Video Title")
          # Video thumbnail next to the transcription output.
          with gr.Row().style(equal_height=True):
            img = gr.Image(label="Thumbnail")
            text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
          with gr.Row().style(equal_height=True): 
              btn = gr.Button("Transcribe")       
          # Clicking the button calls gio(link, lang, size, subs) and shows the result.
          btn.click(gio, inputs=[link, lang, sz, wt], outputs=[text])
          # Any change to the link refreshes the thumbnail and title.
          link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
# Start the Gradio server when the script runs (also executes on import).
block.launch()