not-lain committed
Commit b21d0d9 · Parent: b5d1281
Files changed (2):
  1. app.py +52 -52
  2. requirements.txt +1 -1
app.py CHANGED
@@ -11,7 +11,7 @@ from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
-import whisperx
+# import whisperx
 import gc

 @contextmanager
@@ -174,61 +174,61 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)


-def transcribe(audio):
-    if audio is None:
-        raise gr.Error("No audio file submitted!")
+# def transcribe(audio):
+#     if audio is None:
+#         raise gr.Error("No audio file submitted!")

-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float16"
-    batch_size = 8  # reduced batch size to be conservative with memory
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     compute_type = "float16"
+#     batch_size = 8  # reduced batch size to be conservative with memory

-    try:
-        # 1. Load model and transcribe
-        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-        audio_input = whisperx.load_audio(audio)
-        result = model.transcribe(audio_input, batch_size=batch_size)
+#     try:
+#         # 1. Load model and transcribe
+#         model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+#         audio_input = whisperx.load_audio(audio)
+#         result = model.transcribe(audio_input, batch_size=batch_size)

-        # Clear GPU memory
-        del model
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        # 2. Align whisper output
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-        result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
-
-        # Clear GPU memory
-        del model_a
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        # 3. Assign speaker labels
-        diarize_model = whisperx.DiarizationPipeline(device=device)
-        diarize_segments = diarize_model(audio_input)
+#         # Clear GPU memory
+#         del model
+#         gc.collect()
+#         torch.cuda.empty_cache()
+
+#         # 2. Align whisper output
+#         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+#         result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+
+#         # Clear GPU memory
+#         del model_a
+#         gc.collect()
+#         torch.cuda.empty_cache()
+
+#         # 3. Assign speaker labels
+#         diarize_model = whisperx.DiarizationPipeline(device=device)
+#         diarize_segments = diarize_model(audio_input)

-        # Combine transcription with speaker diarization
-        result = whisperx.assign_word_speakers(diarize_segments, result)
-
-        # Format output with speaker labels and timestamps
-        formatted_text = []
-        for segment in result["segments"]:
-            if not isinstance(segment, dict):
-                continue
+#         # Combine transcription with speaker diarization
+#         result = whisperx.assign_word_speakers(diarize_segments, result)
+
+#         # Format output with speaker labels and timestamps
+#         formatted_text = []
+#         for segment in result["segments"]:
+#             if not isinstance(segment, dict):
+#                 continue

-            speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
-            start_time = f"{float(segment.get('start', 0)):.2f}"
-            end_time = f"{float(segment.get('end', 0)):.2f}"
-            text = segment.get('text', '').strip()
-            formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")
+#             speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
+#             start_time = f"{float(segment.get('start', 0)):.2f}"
+#             end_time = f"{float(segment.get('end', 0)):.2f}"
+#             text = segment.get('text', '').strip()
+#             formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")

-        return "\n".join(formatted_text)
+#         return "\n".join(formatted_text)

-    except Exception as e:
-        raise gr.Error(f"Transcription failed: {str(e)}")
-    finally:
-        # Ensure GPU memory is cleared even if an error occurs
-        gc.collect()
-        torch.cuda.empty_cache()
+#     except Exception as e:
+#         raise gr.Error(f"Transcription failed: {str(e)}")
+#     finally:
+#         # Ensure GPU memory is cleared even if an error occurs
+#         gc.collect()
+#         torch.cuda.empty_cache()


 @spaces.GPU(duration=120)
@@ -245,8 +245,8 @@ def main(*args):
     # return mask_generation(*args)
     elif api_num == 5:
         return erase(*args)
-    elif api_num == 6:
-        return transcribe(*args)
+    # elif api_num == 6:
+    #     return transcribe(*args)


 rmbg_tab = gr.Interface(
@@ -367,7 +367,7 @@ demo = gr.TabbedInterface(
         "inpainting",
         # "sam2",
         "erase",
-        "transcribe",
+        # "transcribe",
     ],
     title="Utilities that require GPU",
 )
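This commit disables the WhisperX transcription path end to end: the import, the transcribe function, its api_num == 6 dispatch branch, and the "transcribe" tab. The removed function ran a three-stage pipeline (transcribe, align word timestamps, diarize speakers) and freed GPU memory between stages so only one large model was resident at a time. For reference, below is a self-contained sketch assembled from the removed lines; the free_gpu helper, the function name, and the CPU fallback for compute_type are illustrative additions, while the whisperx calls themselves mirror the removed code.

import gc

import torch
import whisperx


def free_gpu() -> None:
    # Release Python garbage and cached CUDA blocks between stages,
    # so the transcription, alignment, and diarization models never
    # sit on the GPU at the same time.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def transcribe_with_speakers(audio_path: str, batch_size: int = 8) -> str:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # The removed code hardcoded "float16"; int8 is a common CPU fallback.
    compute_type = "float16" if device == "cuda" else "int8"

    # 1. Load model and transcribe.
    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_path)
    result = model.transcribe(audio, batch_size=batch_size)
    del model
    free_gpu()

    # 2. Align Whisper output to word-level timestamps.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device,
        return_char_alignments=False,
    )
    del model_a
    free_gpu()

    # 3. Assign speaker labels via diarization.
    diarize_model = whisperx.DiarizationPipeline(device=device)
    diarize_segments = diarize_model(audio)
    result = whisperx.assign_word_speakers(diarize_segments, result)

    # Format one "[start - end] [Speaker X]: text" line per segment.
    lines = []
    for segment in result["segments"]:
        if not isinstance(segment, dict):
            continue
        speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
        start = float(segment.get("start", 0))
        end = float(segment.get("end", 0))
        text = segment.get("text", "").strip()
        lines.append(f"[{start:.2f}s - {end:.2f}s] {speaker}: {text}")
    return "\n".join(lines)

Note that whisperx.DiarizationPipeline wraps gated pyannote models and may require a Hugging Face token in practice; the removed code, like this sketch, passed only device.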
requirements.txt CHANGED
@@ -22,4 +22,4 @@ einops
 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
 simple-lama-inpainting
-git+https://github.com/m-bain/whisperX.git
+# git+https://github.com/m-bain/whisperX.git
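If the intent is to keep transcription optional rather than remove it, an alternative to commenting everything out is to guard the import and register the tab only when the dependency is installed. A minimal sketch (not what this commit does; tab_names stands in for the list passed to gr.TabbedInterface above):

# Keep whisperx optional: the app still starts when it is absent.
try:
    import whisperx  # needs: git+https://github.com/m-bain/whisperX.git
except ImportError:
    whisperx = None

tab_names = ["inpainting", "erase"]
if whisperx is not None:
    tab_names.append("transcribe")  # expose the tab only when usable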