not-lain committed
Commit ad44b76 · 1 Parent(s): 4c8ba03

switch to whisperx

Files changed (2)
  1. app.py +41 -14
  2. requirements.txt +2 -1
app.py CHANGED
@@ -11,7 +11,8 @@ from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
-
+import whisperx
+import gc
 
 @contextmanager
 def float32_high_matmul_precision():
@@ -173,23 +174,49 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
 
 
-# Initialize Whisper model
-whisper = pipeline(
-    task="automatic-speech-recognition",
-    model="openai/whisper-large-v3",
-    chunk_length_s=30,
-    device="cuda" if torch.cuda.is_available() else "cpu",
-)
-
-
 def transcribe(audio, task="transcribe"):
     if audio is None:
         raise gr.Error("No audio file submitted!")
 
-    text = whisper(
-        audio, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True
-    )["text"]
-    return text
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float16"  # can be changed to "int8" if low on GPU memory
+    batch_size = 8  # reduced batch size to be conservative with memory
+
+    # 1. Load model and transcribe
+    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+    audio_input = whisperx.load_audio(audio)
+    result = model.transcribe(audio_input, batch_size=batch_size)
+
+    # Clear GPU memory
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+
+    # Clear GPU memory
+    del model_a
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(device=device)
+    diarize_segments = diarize_model(audio_input)
+
+    # Combine transcription with speaker diarization
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+
+    # Format output with speaker labels and timestamps
+    formatted_text = ""
+    for segment in result["segments"]:
+        speaker = f"[Speaker {segment['speaker']}]" if "speaker" in segment else ""
+        start_time = f"{segment.get('start', 0):.2f}"
+        end_time = f"{segment.get('end', 0):.2f}"
+        formatted_text += f"[{start_time}s - {end_time}s] {speaker}: {segment['text']}\n"
+
+    return formatted_text
 
 
 @spaces.GPU(duration=120)
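
One practical note on the diarization step above: the pyannote checkpoints behind whisperx.DiarizationPipeline are gated on the Hugging Face Hub, so in many setups the pipeline also needs an access token. The committed code omits it; a minimal sketch, assuming the token is exposed through a hypothetical HF_TOKEN environment variable:

import os

import whisperx

# Sketch only: the commit calls DiarizationPipeline(device=device) with no token.
# If the gated pyannote models require authentication, the token can be passed
# explicitly; HF_TOKEN is a placeholder variable name, not part of this commit.
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=os.environ.get("HF_TOKEN"),
    device="cuda",
)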
requirements.txt CHANGED
@@ -21,4 +21,5 @@ sentencepiece
 einops
 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
-simple-lama-inpainting
+simple-lama-inpainting
+git+https://github.com/m-bain/whisperX.git
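
With the new dependency installed, the updated code path can be exercised by calling transcribe() directly; a minimal sketch, assuming app.py is importable locally, a CUDA GPU is available, and sample.wav is a placeholder audio file:

# Minimal smoke test for the whisperx-backed transcribe() introduced in app.py.
# Assumes requirements.txt (including whisperX from GitHub) is installed.
from app import transcribe

text = transcribe("sample.wav")  # "sample.wav" is a placeholder path
print(text)
# Each output line follows the format built in transcribe():
# [<start>s - <end>s] [Speaker <label>]: <segment text>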