- README.md +66 -0
- app-local.py +3 -0
- app-network.py +3 -0
- app-shared.py +3 -0
- app.py +263 -0
- cli.py +109 -0
- dockerfile +20 -0
- requirements.txt +6 -0
README.md
ADDED
@@ -0,0 +1,66 @@
---
title: Whisper Webui
emoji: ⚡
colorFrom: pink
colorTo: purple
sdk: gradio
sdk_version: 3.3.1
app_file: app.py
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# Running Locally

To run this program locally, first install Python 3.9+ and Git. Then install PyTorch 1.10.1+ and the other dependencies:
```
pip install -r requirements.txt
```

Finally, run the full version (no audio length restrictions) of the app:
```
python app-local.py
```

You can also run the CLI interface, which is similar to Whisper's own CLI but also supports the following additional arguments:
```
python cli.py \
[--vad {none,silero-vad,silero-vad-skip-gaps,silero-vad-expand-into-gaps,periodic-vad}] \
[--vad_merge_window VAD_MERGE_WINDOW] \
[--vad_max_merge_size VAD_MAX_MERGE_SIZE] \
[--vad_padding VAD_PADDING] \
[--vad_prompt_window VAD_PROMPT_WINDOW]
```

You may also use URLs in addition to file paths as input:
```
python cli.py --model large --vad silero-vad --language Japanese "https://www.youtube.com/watch?v=4cICErqqRSM"
```

# Docker

To run it in Docker, first install Docker and, optionally, the NVIDIA Container Toolkit in order to use the GPU. Then
check out this repository and build an image:
```
sudo docker build -t whisper-webui:1 .
```

You can then start the WebUI with GPU support like so:
```
sudo docker run -d --gpus=all -p 7860:7860 whisper-webui:1
```

Leave out "--gpus=all" if you don't have access to a GPU with enough memory, and are fine with running it on the CPU only:
```
sudo docker run -d -p 7860:7860 whisper-webui:1
```

## Caching

Note that the models themselves are currently not included in the Docker images, and will be downloaded on demand.
To avoid this, bind the directory /root/.cache/whisper to a directory on the host (for instance /home/administrator/.cache/whisper),
which you can optionally prepopulate with the different Whisper models:
```
sudo docker run -d --gpus=all -p 7860:7860 --mount type=bind,source=/home/administrator/.cache/whisper,target=/root/.cache/whisper whisper-webui:1
```
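If you want to prepopulate the host directory before starting the container, a minimal sketch (the model name and host path here are just examples; `download_root` is Whisper's standard cache-directory override):
```
import whisper

# Download the chosen model into the directory that will be bind-mounted
# into the container at /root/.cache/whisper.
whisper.load_model("small", download_root="/home/administrator/.cache/whisper")
```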
app-local.py
ADDED
@@ -0,0 +1,3 @@
# Run the app with no audio file restrictions
from app import create_ui
create_ui(-1)
app-network.py
ADDED
@@ -0,0 +1,3 @@
# Run the app with no audio file restrictions, and make it available on the network
from app import create_ui
create_ui(-1, server_name="0.0.0.0")
app-shared.py
ADDED
@@ -0,0 +1,3 @@
# Run the app with no audio file restrictions, and share it via a public Gradio link
from app import create_ui
create_ui(-1, share=True)
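The three launcher scripts differ only in the arguments they pass to create_ui; its signature (see app.py below) also allows combining them. A hypothetical combined launcher, as a sketch:
```
# Hypothetical launcher: no length restriction, LAN-visible, and a public share link.
from app import create_ui

create_ui(-1, share=True, server_name="0.0.0.0")
```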
app.py
ADDED
@@ -0,0 +1,263 @@
from typing import Iterator

from io import StringIO
import os
import pathlib
import tempfile

# External programs
import whisper
import ffmpeg

# UI
import gradio as gr

from src.download import ExceededMaximumDuration, download_url
from src.utils import slugify, write_srt, write_vtt
from src.vad import NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, \
    VadSileroTranscription

# Limitations (set to -1 to disable)
DEFAULT_INPUT_AUDIO_MAX_DURATION = 600  # seconds

# Whether or not to automatically delete all uploaded files, to save disk space
DELETE_UPLOADED_FILES = True

# Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourselves
MAX_FILE_PREFIX_LENGTH = 17

LANGUAGES = [
    "English", "Hindi", "Tamil", "Urdu",
    "Malayalam", "Telugu", "Bengali", "Kannada",
    "Nepali", "Marathi", "Punjabi", "Sindhi",
    "Gujarati", "Sanskrit", "Assamese"]


class WhisperTranscriber:
    def __init__(self, inputAudioMaxDuration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION,
                 deleteUploadedFiles: bool = DELETE_UPLOADED_FILES):
        self.model_cache = dict()

        self.vad_model = None
        self.inputAudioMaxDuration = inputAudioMaxDuration
        self.deleteUploadedFiles = deleteUploadedFiles

    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow,
                         vadMaxMergeSize, vadPadding, vadPromptWindow):
        try:
            source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)

            try:
                selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                selectedModel = modelName if modelName is not None else "base"

                # Load the model on first use, then keep it cached in memory
                model = self.model_cache.get(selectedModel, None)

                if not model:
                    model = whisper.load_model(selectedModel)
                    self.model_cache[selectedModel] = model

                # Execute whisper
                result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow,
                                              vadMaxMergeSize, vadPadding, vadPromptWindow)

                # Write result
                downloadDirectory = tempfile.mkdtemp()

                filePrefix = slugify(sourceName, allow_unicode=True)
                download, text, vtt = self.write_result(result, filePrefix, downloadDirectory)

                return download, text, vtt

            finally:
                # Cleanup source
                if self.deleteUploadedFiles:
                    print("Deleting source file " + source)
                    os.remove(source)

        except ExceededMaximumDuration as e:
            return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(
                e.videoDuration) + "s"), "[ERROR]"

    def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
                        vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1,
                        vadPromptWindow: float = 1, **decodeOptions: dict):

        initial_prompt = decodeOptions.pop('initial_prompt', None)

        if ('task' in decodeOptions):
            task = decodeOptions.pop('task')

        # Callable for processing an audio file; the initial prompt is only prepended to the first segment
        whisperCallable = lambda audio, segment_index, prompt, detected_language: model.transcribe(
            audio,
            language=language if language else detected_language,
            task=task,
            initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
            **decodeOptions)

        # The results
        if (vad == 'silero-vad'):
            # Silero VAD where non-speech gaps are transcribed
            process_gaps = self._create_silero_config(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize,
                                                      vadPadding, vadPromptWindow)
            result = self.vad_model.transcribe(audio_path, whisperCallable, process_gaps)
        elif (vad == 'silero-vad-skip-gaps'):
            # Silero VAD where non-speech gaps are simply ignored
            skip_gaps = self._create_silero_config(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding,
                                                   vadPromptWindow)
            result = self.vad_model.transcribe(audio_path, whisperCallable, skip_gaps)
        elif (vad == 'silero-vad-expand-into-gaps'):
            # Use Silero VAD where speech segments are expanded into non-speech gaps
            expand_gaps = self._create_silero_config(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize,
                                                     vadPadding, vadPromptWindow)
            result = self.vad_model.transcribe(audio_path, whisperCallable, expand_gaps)
        elif (vad == 'periodic-vad'):
            # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an
            # infinite loop, but it may create a break in the middle of a sentence, causing some artifacts.
            periodic_vad = VadPeriodicTranscription()
            result = periodic_vad.transcribe(audio_path, whisperCallable,
                                             PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize,
                                                                         max_prompt_window=vadPromptWindow))
        else:
            # Default VAD
            result = whisperCallable(audio_path, 0, None, None)

        return result

    def _concat_prompt(self, prompt1, prompt2):
        if (prompt1 is None):
            return prompt2
        elif (prompt2 is None):
            return prompt1
        else:
            return prompt1 + " " + prompt2

    def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5,
                              vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
        # Use Silero VAD, loading it lazily on first use
        if (self.vad_model is None):
            self.vad_model = VadSileroTranscription()

        config = TranscriptionConfig(non_speech_strategy=non_speech_strategy,
                                     max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
                                     segment_padding_left=vadPadding, segment_padding_right=vadPadding,
                                     max_prompt_window=vadPromptWindow)

        return config

    def write_result(self, result: dict, source_name: str, output_dir: str):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        text = result["text"]
        language = result["language"]
        languageMaxLineWidth = self.__get_max_line_width(language)

        print("Max line width " + str(languageMaxLineWidth))
        vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
        srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)

        output_files = []
        output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"))
        output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"))
        output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"))

        return output_files, text, vtt

    def clear_cache(self):
        self.model_cache = dict()
        self.vad_model = None

    def __get_source(self, urlData, uploadFile, microphoneData):
        if urlData:
            # Download from YouTube
            source = download_url(urlData, self.inputAudioMaxDuration)[0]
        else:
            # File input
            source = uploadFile if uploadFile is not None else microphoneData

        if self.inputAudioMaxDuration > 0:
            # Calculate audio length
            audioDuration = ffmpeg.probe(source)["format"]["duration"]

            if float(audioDuration) > self.inputAudioMaxDuration:
                raise ExceededMaximumDuration(videoDuration=audioDuration, maxDuration=self.inputAudioMaxDuration,
                                              message="Video is too long")

        file_path = pathlib.Path(source)
        sourceName = file_path.stem[:MAX_FILE_PREFIX_LENGTH] + file_path.suffix

        return source, sourceName

    def __get_max_line_width(self, language: str) -> int:
        if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
            # Chinese characters and kana are wider, so limit line length to 40 characters
            return 40
        else:
            # TODO: Add more languages
            # 80 latin characters should fit on a 1080p/720p screen
            return 80

    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
        segmentStream = StringIO()

        if format == 'vtt':
            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
        elif format == 'srt':
            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
        else:
            raise Exception("Unknown format " + format)

        segmentStream.seek(0)
        return segmentStream.read()

    def __create_file(self, text: str, directory: str, fileName: str) -> str:
        # Write the text to a file
        with open(os.path.join(directory, fileName), 'w+', encoding="utf-8") as file:
            file.write(text)

        return file.name


def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
    ui = WhisperTranscriber(inputAudioMaxDuration)

    ui_description = "NS AI LABS customised Whisper with custom ASR layers. "
    ui_description += "Speak in any Indian language, in any condition and at any pace, " \
                      "and we will give you the best contextual ASR in English. "
    ui_description += "Supports Hin-English too."

    ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."

    if inputAudioMaxDuration > 0:
        ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"

    ui_article = "Choose the small model (we customised this one only)."

    demo = gr.Interface(fn=ui.transcribe_webui, description=ui_description, article=ui_article, inputs=[
        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
        gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
        gr.Text(label="URL (CloudFront URL, etc.)"),
        gr.Audio(source="upload", type="filepath", label="Upload Audio"),
        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
        gr.Dropdown(
            choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"],
            label="VAD"),
        gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
        gr.Number(label="VAD - Padding (s)", precision=None, value=1),
        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
    ], outputs=[
        gr.File(label="Download"),
        gr.Text(label="Transcription"),
        gr.Text(label="Segments")
    ])

    demo.launch(share=share, server_name=server_name)


if __name__ == '__main__':
    create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)
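For reference, the transcriber can also be driven programmatically, mirroring what cli.py below does. A minimal sketch (the model name and audio path are examples):
```
import whisper
from app import WhisperTranscriber

# Load a model once and reuse it; keep the input file, since we own it.
model = whisper.load_model("small")
transcriber = WhisperTranscriber(deleteUploadedFiles=False)

# Transcribe with Silero VAD, then write the .srt/.vtt/.txt files to the current directory.
result = transcriber.transcribe_file(model, "audio.mp3", language="english", task="transcribe", vad="silero-vad")
transcriber.write_result(result, "audio", ".")
```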
cli.py
ADDED
@@ -0,0 +1,109 @@
import argparse
import os
from urllib.parse import urlparse
import warnings
import numpy as np

import whisper

import torch
from app import LANGUAGES, WhisperTranscriber
from src.download import download_url

from src.utils import optional_float, optional_int, str2bool


def cli():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large"], help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")

    parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="the voice activity detection algorithm to use")
    parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="the window size (in seconds) to merge voice segments")
    parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="the maximum size (in seconds) of a voice segment")
    parser.add_argument("--vad_padding", type=optional_float, default=1, help="the padding (in seconds) to add to each voice segment")
    parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="the window size of the prompt to pass to Whisper")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)

    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
        args["language"] = "en"

    temperature = args.pop("temperature")
    temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
    if temperature_increment_on_fallback is not None:
        # Build the fallback schedule, e.g. (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
    else:
        temperature = [temperature]

    vad = args.pop("vad")
    vad_merge_window = args.pop("vad_merge_window")
    vad_max_merge_size = args.pop("vad_max_merge_size")
    vad_padding = args.pop("vad_padding")
    vad_prompt_window = args.pop("vad_prompt_window")

    model = whisper.load_model(model_name, device=device, download_root=model_dir)
    transcriber = WhisperTranscriber(deleteUploadedFiles=False)

    for audio_path in args.pop("audio"):
        sources = []

        # Detect URL and download the audio
        if (uri_validator(audio_path)):
            # Download from YouTube/URL directly
            for source_path in download_url(audio_path, maxDuration=-1, destinationDirectory=output_dir, playlistItems=None):
                source_name = os.path.basename(source_path)
                sources.append({ "path": source_path, "name": source_name })
        else:
            sources.append({ "path": audio_path, "name": os.path.basename(audio_path) })

        for source in sources:
            source_path = source["path"]
            source_name = source["name"]

            result = transcriber.transcribe_file(model, source_path, temperature=temperature,
                                                 vad=vad, vadMergeWindow=vad_merge_window, vadMaxMergeSize=vad_max_merge_size,
                                                 vadPadding=vad_padding, vadPromptWindow=vad_prompt_window, **args)

            transcriber.write_result(result, source_name, output_dir)

    transcriber.clear_cache()


def uri_validator(x):
    try:
        result = urlparse(x)
        return all([result.scheme, result.netloc])
    except:
        return False


if __name__ == '__main__':
    cli()
dockerfile
ADDED
@@ -0,0 +1,20 @@
FROM huggingface/transformers-pytorch-gpu
EXPOSE 7860

ADD . /opt/whisper-webui/

# Latest version of transformers-pytorch-gpu seems to lack tk.
# Further, pip install fails, so we must upgrade pip first.
RUN apt-get update && apt-get -y install python3-tk
RUN python3 -m pip install --upgrade pip &&\
    python3 -m pip install -r /opt/whisper-webui/requirements.txt

# Note: Models will be downloaded on demand to the directory /root/.cache/whisper.
# You can also bind this directory in the container to somewhere on the host.

# To be able to see logs in real time
ENV PYTHONUNBUFFERED=1

WORKDIR /opt/whisper-webui/
ENTRYPOINT ["python3"]
CMD ["app-network.py"]
requirements.txt
ADDED
@@ -0,0 +1,6 @@
git+https://github.com/openai/whisper.git
transformers
ffmpeg-python==0.2.0
gradio
yt-dlp
torchaudio