switch to whisperx
- app.py +41 -14
- requirements.txt +2 -1
app.py
CHANGED
@@ -11,7 +11,8 @@ from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
-
+import whisperx
+import gc
 
 @contextmanager
 def float32_high_matmul_precision():
@@ -173,23 +174,49 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
 
 
-# Initialize Whisper model
-whisper = pipeline(
-    task="automatic-speech-recognition",
-    model="openai/whisper-large-v3",
-    chunk_length_s=30,
-    device="cuda" if torch.cuda.is_available() else "cpu",
-)
-
-
 def transcribe(audio, task="transcribe"):
     if audio is None:
         raise gr.Error("No audio file submitted!")
 
-
-
-
-
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float16"  # can be changed to "int8" if low on GPU memory
+    batch_size = 8  # reduced batch size to be conservative with memory
+
+    # 1. Load model and transcribe
+    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+    audio_input = whisperx.load_audio(audio)
+    result = model.transcribe(audio_input, batch_size=batch_size)
+
+    # Clear GPU memory
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+
+    # Clear GPU memory
+    del model_a
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(device=device)
+    diarize_segments = diarize_model(audio_input)
+
+    # Combine transcription with speaker diarization
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+
+    # Format output with speaker labels and timestamps
+    formatted_text = ""
+    for segment in result["segments"]:
+        speaker = f"[Speaker {segment['speaker']}]" if "speaker" in segment else ""
+        start_time = f"{segment.get('start', 0):.2f}"
+        end_time = f"{segment.get('end', 0):.2f}"
+        formatted_text += f"[{start_time}s - {end_time}s] {speaker}: {segment['text']}\n"
+
+    return formatted_text
 
 
 @spaces.GPU(duration=120)
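Note on the diarization step: the pyannote models behind whisperX's DiarizationPipeline are gated on the Hugging Face Hub, so loading them usually requires an access token available to the Space. A minimal sketch of passing one explicitly, assuming the token is exposed as an HF_TOKEN secret (the variable name and the use_auth_token argument are assumptions, not part of this commit):

    import os
    import whisperx

    # Hypothetical: forward a Hub token so the gated pyannote diarization model can be fetched
    diarize_model = whisperx.DiarizationPipeline(
        use_auth_token=os.environ.get("HF_TOKEN"),  # assumed secret name
        device="cuda",
    )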
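For a quick sanity check outside Gradio, the new function can be called directly on an audio file; the file name and the example output below are purely illustrative:

    # Hypothetical smoke test (not part of this commit)
    print(transcribe("sample.wav"))
    # Each line of the returned string looks roughly like:
    # [0.00s - 3.52s] [Speaker SPEAKER_00]:  Hello and welcome.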
requirements.txt
CHANGED
@@ -21,4 +21,5 @@ sentencepiece
 einops
 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
-simple-lama-inpainting
+simple-lama-inpainting
+git+https://github.com/m-bain/whisperX.git
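For local testing, the new dependency installs straight from the pinned Git URL, the same way the Space build resolves it:

    pip install git+https://github.com/m-bain/whisperX.git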
|