Spaces:
Running
on
Zero
Running
on
Zero
Nithin Rao Koluguri
commited on
Commit
·
7e3fe0c
1
Parent(s):
025dfc0
Add support for longer audio inference
Browse filesSigned-off-by: Nithin Rao Koluguri <nithinraok>
app.py
CHANGED
@@ -90,6 +90,7 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
|
|
90 |
try:
|
91 |
gr.Info(f"Loading audio: {original_path_name}", duration=2)
|
92 |
audio = AudioSegment.from_file(audio_path)
|
|
|
93 |
except Exception as load_e:
|
94 |
gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
|
95 |
# Return an update to hide the button
|
@@ -137,9 +138,27 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
|
|
137 |
transcribe_path = audio_path
|
138 |
info_path_name = original_path_name
|
139 |
|
|
|
|
|
140 |
try:
|
141 |
model.to(device)
|
|
|
142 |
gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
output = model.transcribe([transcribe_path], timestamps=True)
|
144 |
|
145 |
if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
|
@@ -194,7 +213,20 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
|
|
194 |
# Return an update to hide the button
|
195 |
return vis_data, raw_times_data, audio_path, gr.DownloadButton(visible=False)
|
196 |
finally:
|
|
|
197 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
if 'model' in locals() and hasattr(model, 'cpu'):
|
199 |
if device == 'cuda':
|
200 |
model.cpu()
|
@@ -204,6 +236,7 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
|
|
204 |
except Exception as cleanup_e:
|
205 |
print(f"Error during model cleanup: {cleanup_e}")
|
206 |
gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
|
|
|
207 |
|
208 |
finally:
|
209 |
if processed_audio_path and os.path.exists(processed_audio_path):
|
@@ -253,7 +286,7 @@ article = (
|
|
253 |
"<ul style='font-size: 1.1em;'>"
|
254 |
" <li>Automatic punctuation and capitalization</li>"
|
255 |
" <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
|
256 |
-
" <li>Efficiently transcribes long audio segments (
|
257 |
" <li>Robust performance on spoken numbers, and song lyrics transcription </li>"
|
258 |
"</ul>"
|
259 |
"<p style='font-size: 1.1em;'>"
|
|
|
90 |
try:
|
91 |
gr.Info(f"Loading audio: {original_path_name}", duration=2)
|
92 |
audio = AudioSegment.from_file(audio_path)
|
93 |
+
duration_sec = audio.duration_seconds
|
94 |
except Exception as load_e:
|
95 |
gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
|
96 |
# Return an update to hide the button
|
|
|
138 |
transcribe_path = audio_path
|
139 |
info_path_name = original_path_name
|
140 |
|
141 |
+
# Flag to track if long audio settings were applied
|
142 |
+
long_audio_settings_applied = False
|
143 |
try:
|
144 |
model.to(device)
|
145 |
+
model.to(torch.float32)
|
146 |
gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
|
147 |
+
|
148 |
+
# Check duration and apply specific settings for long audio
|
149 |
+
if duration_sec > 900: # 15 minutes
|
150 |
+
try:
|
151 |
+
gr.Info("Audio longer than 15 minutes. Applying optimized settings for long transcription.", duration=3)
|
152 |
+
print("Applying long audio settings: Local Attention and Chunking.")
|
153 |
+
model.change_attention_model("rel_pos_local_attn", [256,256])
|
154 |
+
model.change_subsampling_conv_chunking_factor(1) # 1 = auto select
|
155 |
+
long_audio_settings_applied = True
|
156 |
+
except Exception as setting_e:
|
157 |
+
gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
|
158 |
+
print(f"Warning: Failed to apply long audio settings: {setting_e}")
|
159 |
+
# Proceed without long audio settings if applying them failed
|
160 |
+
|
161 |
+
model.to(torch.bfloat16)
|
162 |
output = model.transcribe([transcribe_path], timestamps=True)
|
163 |
|
164 |
if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
|
|
|
213 |
# Return an update to hide the button
|
214 |
return vis_data, raw_times_data, audio_path, gr.DownloadButton(visible=False)
|
215 |
finally:
|
216 |
+
# --- Model Cleanup ---
|
217 |
try:
|
218 |
+
# Revert settings if they were applied for long audio
|
219 |
+
if long_audio_settings_applied:
|
220 |
+
try:
|
221 |
+
print("Reverting long audio settings.")
|
222 |
+
model.change_attention_model("rel_pos", [-1,-1])
|
223 |
+
model.change_subsampling_conv_chunking_factor(-1)
|
224 |
+
long_audio_settings_applied = False # Reset flag
|
225 |
+
except Exception as revert_e:
|
226 |
+
print(f"Warning: Failed to revert long audio settings: {revert_e}")
|
227 |
+
gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
|
228 |
+
|
229 |
+
# Original cleanup
|
230 |
if 'model' in locals() and hasattr(model, 'cpu'):
|
231 |
if device == 'cuda':
|
232 |
model.cpu()
|
|
|
236 |
except Exception as cleanup_e:
|
237 |
print(f"Error during model cleanup: {cleanup_e}")
|
238 |
gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
|
239 |
+
# --- End Model Cleanup ---
|
240 |
|
241 |
finally:
|
242 |
if processed_audio_path and os.path.exists(processed_audio_path):
|
|
|
286 |
"<ul style='font-size: 1.1em;'>"
|
287 |
" <li>Automatic punctuation and capitalization</li>"
|
288 |
" <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
|
289 |
+
" <li>Efficiently transcribes long audio segments (<strong>updated to support upto 3 hours</strong>) <small>(For even longer audios, see <a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py' target='_blank'>this script</a>)</small></li>"
|
290 |
" <li>Robust performance on spoken numbers, and song lyrics transcription </li>"
|
291 |
"</ul>"
|
292 |
"<p style='font-size: 1.1em;'>"
|