Anustup committed on
Commit a274eae · 1 Parent(s): 94905d9
Files changed (8)
  1. README.md +66 -0
  2. app-local.py +3 -0
  3. app-network.py +3 -0
  4. app-shared.py +3 -0
  5. app.py +263 -0
  6. cli.py +109 -0
  7. dockerfile +20 -0
  8. requirements.txt +6 -0
README.md ADDED
@@ -0,0 +1,66 @@
+ ---
+ title: Whisper Webui
+ emoji: ⚡
+ colorFrom: pink
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 3.3.1
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Running Locally
+
+ To run this program locally, first install Python 3.9+ and Git. Then install PyTorch 1.10.1+ and all the other dependencies:
+ ```
+ pip install -r requirements.txt
+ ```
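+
+ If you want to check that PyTorch (and CUDA, if you have a GPU) is set up correctly before starting the app, a quick optional sanity check looks like this:
+ ```
+ import torch
+
+ # Prints the installed PyTorch version and whether a CUDA-capable GPU is visible
+ print(torch.__version__)
+ print(torch.cuda.is_available())
+ ```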
+
+ Finally, run the full version (no audio length restrictions) of the app:
+ ```
+ python app-local.py
+ ```
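+
+ The app-local.py, app-network.py and app-shared.py entry points are thin wrappers around `create_ui` in app.py, where -1 disables the audio length limit. You can do the same from your own script:
+ ```
+ # Start the Gradio WebUI without an audio length limit
+ from app import create_ui
+
+ create_ui(-1)                            # local only
+ # create_ui(-1, server_name="0.0.0.0")   # listen on the network
+ # create_ui(-1, share=True)              # create a public Gradio share link
+ ```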
+
+ You can also run the CLI interface, which is similar to Whisper's own CLI but also supports the following additional arguments:
+ ```
+ python cli.py \
+ [--vad {none,silero-vad,silero-vad-skip-gaps,silero-vad-expand-into-gaps,periodic-vad}] \
+ [--vad_merge_window VAD_MERGE_WINDOW] \
+ [--vad_max_merge_size VAD_MAX_MERGE_SIZE] \
+ [--vad_padding VAD_PADDING] \
+ [--vad_prompt_window VAD_PROMPT_WINDOW]
+ ```
+ You can also use URLs as input, in addition to file paths:
+ ```
+ python cli.py --model large --vad silero-vad --language Japanese "https://www.youtube.com/watch?v=4cICErqqRSM"
+ ```
+
+ # Docker
+
+ To run it in Docker, first install Docker and optionally the NVIDIA Container Toolkit in order to use the GPU. Then
+ check out this repository and build an image:
+ ```
+ sudo docker build -t whisper-webui:1 .
+ ```
+
+ You can then start the WebUI with GPU support like so:
+ ```
+ sudo docker run -d --gpus=all -p 7860:7860 whisper-webui:1
+ ```
+
+ Leave out "--gpus=all" if you don't have access to a GPU with enough memory, and are fine with running it on the CPU only:
+ ```
+ sudo docker run -d -p 7860:7860 whisper-webui:1
+ ```
+
+ ## Caching
+
+ Note that the models themselves are currently not included in the Docker images, and will be downloaded on demand.
+ To avoid this, bind the directory /root/.cache/whisper to a directory on the host (for instance /home/administrator/.cache/whisper), which you can (optionally)
+ prepopulate with the different Whisper models.
+ ```
+ sudo docker run -d --gpus=all -p 7860:7860 --mount type=bind,source=/home/administrator/.cache/whisper,target=/root/.cache/whisper whisper-webui:1
+ ```
app-local.py ADDED
@@ -0,0 +1,3 @@
+ # Run the app with no audio file restrictions
+ from app import create_ui
+ create_ui(-1)
app-network.py ADDED
@@ -0,0 +1,3 @@
+ # Run the app with no audio file restrictions, and make it available on the network
+ from app import create_ui
+ create_ui(-1, server_name="0.0.0.0")
app-shared.py ADDED
@@ -0,0 +1,3 @@
+ # Run the app with no audio file restrictions, and create a public Gradio share link
+ from app import create_ui
+ create_ui(-1, share=True)
app.py ADDED
@@ -0,0 +1,263 @@
+ from typing import Iterator
+
+ from io import StringIO
+ import os
+ import pathlib
+ import tempfile
+
+ # External programs
+ import whisper
+ import ffmpeg
+
+ # UI
+ import gradio as gr
+
+ from src.download import ExceededMaximumDuration, download_url
+ from src.utils import slugify, write_srt, write_vtt
+ from src.vad import NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, \
+     VadSileroTranscription
+
+ # Limitations (set to -1 to disable)
+ DEFAULT_INPUT_AUDIO_MAX_DURATION = 600  # seconds
+
+ # Whether or not to automatically delete all uploaded files, to save disk space
+ DELETE_UPLOADED_FILES = True
+
+ # Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourselves
+ MAX_FILE_PREFIX_LENGTH = 17
+
+ LANGUAGES = [
+     "English", "Hindi", "Tamil", "Urdu",
+     "Malayalam", "Telugu", "Bengali", "Kannada",
+     "Nepali", "Marathi", "Punjabi", "Sindhi",
+     "Gujarati", "Sanskrit", "Assamese"]
+
+
+ class WhisperTranscriber:
+     def __init__(self, inputAudioMaxDuration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION,
+                  deleteUploadedFiles: bool = DELETE_UPLOADED_FILES):
+         self.model_cache = dict()
+
+         self.vad_model = None
+         self.inputAudioMaxDuration = inputAudioMaxDuration
+         self.deleteUploadedFiles = deleteUploadedFiles
+
+     def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow,
+                          vadMaxMergeSize, vadPadding, vadPromptWindow):
+         try:
+             source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
+
+             try:
+                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
+                 selectedModel = modelName if modelName is not None else "base"
+
+                 model = self.model_cache.get(selectedModel, None)
+
+                 if not model:
+                     model = whisper.load_model(selectedModel)
+                     self.model_cache[selectedModel] = model
+
+                 # Execute whisper
+                 result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow,
+                                               vadMaxMergeSize, vadPadding, vadPromptWindow)
+
+                 # Write result
+                 downloadDirectory = tempfile.mkdtemp()
+
+                 filePrefix = slugify(sourceName, allow_unicode=True)
+                 download, text, vtt = self.write_result(result, filePrefix, downloadDirectory)
+
+                 return download, text, vtt
+
+             finally:
+                 # Cleanup source
+                 if self.deleteUploadedFiles:
+                     print("Deleting source file " + source)
+                     os.remove(source)
+
+         except ExceededMaximumDuration as e:
+             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(
+                 e.videoDuration) + "s"), "[ERROR]"
+
+     def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
+                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1,
+                         vadPromptWindow: float = 1, **decodeOptions: dict):
+
+         initial_prompt = decodeOptions.pop('initial_prompt', None)
+
+         if ('task' in decodeOptions):
+             task = decodeOptions.pop('task')
+
+         # Callable for processing an audio file
+         whisperCallable = lambda audio, segment_index, prompt, detected_language: model.transcribe(
+             audio,
+             language=language if language else detected_language,
+             task=task,
+             initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
+             **decodeOptions)
+
+         # The results
+         if (vad == 'silero-vad'):
+             # Silero VAD where non-speech gaps are transcribed
+             process_gaps = self._create_silero_config(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize,
+                                                       vadPadding, vadPromptWindow)
+             result = self.vad_model.transcribe(audio_path, whisperCallable, process_gaps)
+         elif (vad == 'silero-vad-skip-gaps'):
+             # Silero VAD where non-speech gaps are simply ignored
+             skip_gaps = self._create_silero_config(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding,
+                                                    vadPromptWindow)
+             result = self.vad_model.transcribe(audio_path, whisperCallable, skip_gaps)
+         elif (vad == 'silero-vad-expand-into-gaps'):
+             # Use Silero VAD where speech-segments are expanded into non-speech gaps
+             expand_gaps = self._create_silero_config(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize,
+                                                      vadPadding, vadPromptWindow)
+             result = self.vad_model.transcribe(audio_path, whisperCallable, expand_gaps)
+         elif (vad == 'periodic-vad'):
+             # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
+             # it may create a break in the middle of a sentence, causing some artifacts.
+             periodic_vad = VadPeriodicTranscription()
+             result = periodic_vad.transcribe(audio_path, whisperCallable,
+                                              PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize,
+                                                                          max_prompt_window=vadPromptWindow))
+         else:
+             # Default VAD
+             result = whisperCallable(audio_path, 0, None, None)
+
+         return result
+
+     def _concat_prompt(self, prompt1, prompt2):
+         if (prompt1 is None):
+             return prompt2
+         elif (prompt2 is None):
+             return prompt1
+         else:
+             return prompt1 + " " + prompt2
+
+     def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5,
+                               vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
+         # Use Silero VAD
+         if (self.vad_model is None):
+             self.vad_model = VadSileroTranscription()
+
+         config = TranscriptionConfig(non_speech_strategy=non_speech_strategy,
+                                      max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
+                                      segment_padding_left=vadPadding, segment_padding_right=vadPadding,
+                                      max_prompt_window=vadPromptWindow)
+
+         return config
+
+     def write_result(self, result: dict, source_name: str, output_dir: str):
+         if not os.path.exists(output_dir):
+             os.makedirs(output_dir)
+
+         text = result["text"]
+         language = result["language"]
+         languageMaxLineWidth = self.__get_max_line_width(language)
+
+         print("Max line width " + str(languageMaxLineWidth))
+         vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
+         srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)
+
+         output_files = []
+         output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"))
+         output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"))
+         output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"))
+
+         return output_files, text, vtt
+
+     def clear_cache(self):
+         self.model_cache = dict()
+         self.vad_model = None
+
+     def __get_source(self, urlData, uploadFile, microphoneData):
+         if urlData:
+             # Download from YouTube
+             source = download_url(urlData, self.inputAudioMaxDuration)[0]
+         else:
+             # File input
+             source = uploadFile if uploadFile is not None else microphoneData
+
+             if self.inputAudioMaxDuration > 0:
+                 # Calculate audio length
+                 audioDuration = ffmpeg.probe(source)["format"]["duration"]
+
+                 if float(audioDuration) > self.inputAudioMaxDuration:
+                     raise ExceededMaximumDuration(videoDuration=audioDuration, maxDuration=self.inputAudioMaxDuration,
+                                                   message="Video is too long")
+
+         file_path = pathlib.Path(source)
+         sourceName = file_path.stem[:MAX_FILE_PREFIX_LENGTH] + file_path.suffix
+
+         return source, sourceName
+
+     def __get_max_line_width(self, language: str) -> int:
+         if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
+             # Chinese characters and kana are wider, so limit line length to 40 characters
+             return 40
+         else:
+             # TODO: Add more languages
+             # 80 latin characters should fit on a 1080p/720p screen
+             return 80
+
+     def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
+         segmentStream = StringIO()
+
+         if format == 'vtt':
+             write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+         elif format == 'srt':
+             write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+         else:
+             raise Exception("Unknown format " + format)
+
+         segmentStream.seek(0)
+         return segmentStream.read()
+
+     def __create_file(self, text: str, directory: str, fileName: str) -> str:
+         # Write the text to a file
+         with open(os.path.join(directory, fileName), 'w+', encoding="utf-8") as file:
+             file.write(text)
+
+         return file.name
+
+
+ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
+     ui = WhisperTranscriber(inputAudioMaxDuration)
+
+     ui_description = "NS AI LABS customised Whisper with custom ASR layers. "
+     ui_description += "Speak in any Indian language, at any pace and in any condition, and we will give you the best " \
+                       "contextual ASR in English. "
+     ui_description += "Hin-English (mixed Hindi and English) is supported too."
+
+     ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
+
+     if inputAudioMaxDuration > 0:
+         ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
+
+     ui_article = "Choose the small model (we customised this one only)."
+
+     demo = gr.Interface(fn=ui.transcribe_webui, description=ui_description, article=ui_article, inputs=[
+         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
+         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
+         gr.Text(label="URL (CloudFront URL, etc.)"),
+         gr.Audio(source="upload", type="filepath", label="Upload Audio"),
+         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
+         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
+         gr.Dropdown(
+             choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"],
+             label="VAD"),
+         gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
+         gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
+         gr.Number(label="VAD - Padding (s)", precision=None, value=1),
+         gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
+     ], outputs=[
+         gr.File(label="Download"),
+         gr.Text(label="Transcription"),
+         gr.Text(label="Segments")
+     ])
+
+     demo.launch(share=share, server_name=server_name)
+
+
+ if __name__ == '__main__':
+     create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)
cli.py ADDED
@@ -0,0 +1,109 @@
+ import argparse
+ import os
+ from urllib.parse import urlparse
+ import warnings
+ import numpy as np
+
+ import whisper
+
+ import torch
+ from app import LANGUAGES, WhisperTranscriber
+ from src.download import download_url
+
+ from src.utils import optional_float, optional_int, str2bool
+
+
+ def cli():
+     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
+     parser.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large"], help="name of the Whisper model to use")
+     parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
+     parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
+     parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
+     parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
+
+     parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
+     parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
+
+     parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
+     parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
+     parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
+     parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
+     parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
+
+     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
+     parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
+     parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
+     parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
+     parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
+
+     parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
+     parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
+     parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
+     parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
+
+     parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
+     parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
+     parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
+     parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
+
51
+ model_name: str = args.pop("model")
52
+ model_dir: str = args.pop("model_dir")
53
+ output_dir: str = args.pop("output_dir")
54
+ device: str = args.pop("device")
55
+ os.makedirs(output_dir, exist_ok=True)
56
+
57
+ if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
58
+ warnings.warn(f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead.")
59
+ args["language"] = "en"
60
+
61
+ temperature = args.pop("temperature")
62
+ temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
63
+ if temperature_increment_on_fallback is not None:
64
+ temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
65
+ else:
66
+ temperature = [temperature]
67
+
68
+ vad = args.pop("vad")
69
+ vad_merge_window = args.pop("vad_merge_window")
70
+ vad_max_merge_size = args.pop("vad_max_merge_size")
71
+ vad_padding = args.pop("vad_padding")
72
+ vad_prompt_window = args.pop("vad_prompt_window")
73
+
74
+ model = whisper.load_model(model_name, device=device, download_root=model_dir)
75
+ transcriber = WhisperTranscriber(deleteUploadedFiles=False)
76
+
77
+ for audio_path in args.pop("audio"):
78
+ sources = []
79
+
80
+ # Detect URL and download the audio
81
+ if (uri_validator(audio_path)):
82
+ # Download from YouTube/URL directly
83
+ for source_path in download_url(audio_path, maxDuration=-1, destinationDirectory=output_dir, playlistItems=None):
84
+ source_name = os.path.basename(source_path)
85
+ sources.append({ "path": source_path, "name": source_name })
86
+ else:
87
+ sources.append({ "path": audio_path, "name": os.path.basename(audio_path) })
88
+
89
+ for source in sources:
90
+ source_path = source["path"]
91
+ source_name = source["name"]
92
+
93
+ result = transcriber.transcribe_file(model, source_path, temperature=temperature,
94
+ vad=vad, vadMergeWindow=vad_merge_window, vadMaxMergeSize=vad_max_merge_size,
95
+ vadPadding=vad_padding, vadPromptWindow=vad_prompt_window, **args)
96
+
97
+ transcriber.write_result(result, source_name, output_dir)
98
+
99
+ transcriber.clear_cache()
100
+
101
+ def uri_validator(x):
102
+ try:
103
+ result = urlparse(x)
104
+ return all([result.scheme, result.netloc])
105
+ except:
106
+ return False
107
+
108
+ if __name__ == '__main__':
109
+ cli()
dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM huggingface/transformers-pytorch-gpu
+ EXPOSE 7860
+
+ ADD . /opt/whisper-webui/
+
+ # Latest version of transformers-pytorch-gpu seems to lack tk.
+ # Further, pip install fails, so we must upgrade pip first.
+ RUN apt-get -y install python3-tk
+ RUN python3 -m pip install --upgrade pip && \
+     python3 -m pip install -r /opt/whisper-webui/requirements.txt
+
+ # Note: Models will be downloaded on demand to the directory /root/.cache/whisper.
+ # You can also bind this directory in the container to somewhere on the host.
+
+ # To be able to see logs in real time
+ ENV PYTHONUNBUFFERED=1
+
+ WORKDIR /opt/whisper-webui/
+ ENTRYPOINT ["python3"]
+ CMD ["app-network.py"]
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ git+https://github.com/openai/whisper.git
+ transformers
+ ffmpeg-python==0.2.0
+ gradio
+ yt-dlp
+ torchaudio