Update app.py
app.py CHANGED
@@ -1,31 +1,37 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torchaudio
+import torchaudio.transforms as T
 import torch
 import os
 import gradio as gr
 from pydub import AudioSegment
-
-# Load Whisper model and processor
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-
-# Get decoder prompts for translation to Spanish
-forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
-
-# Function to process and translate audio
 import traceback
 
+# Lazy-loaded globals
+model = None
+processor = None
+forced_decoder_ids = None
+
 def translate_audio(filepath):
+    global model, processor, forced_decoder_ids
     try:
         print("Received filepath:", filepath)
 
         if filepath is None or not os.path.exists(filepath):
             return "No audio file received or file does not exist."
 
+        # Lazy load model and processor to reduce startup load time
+        if model is None:
+            print("Loading Whisper model...")
+            model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+            processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+            forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
+            print("Model loaded.")
+
         audio = AudioSegment.from_file(filepath)
         print("Audio loaded. Duration (ms):", len(audio))
 
-        chunk_length_ms = 30 * 1000
+        chunk_length_ms = 30 * 1000 # 30 seconds
         chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
         print(f"Audio split into {len(chunks)} chunks.")
 
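Note: the lazy-loading change defers the heavy from_pretrained() calls from module import to the first request, so the Space can start up before the model weights are downloaded. A minimal sketch of the pattern in isolation; expensive_load() here is a hypothetical stand-in for the real constructor, not code from app.py:

model = None

def expensive_load():
    print("Loading model...")
    return object()  # placeholder for the real model object

def get_model():
    global model
    if model is None:            # first call pays the loading cost
        model = expensive_load()
    return model                 # later calls reuse the cached object

get_model()  # prints "Loading model..."
get_model()  # no output: already cached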
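The 30-second chunk length matches Whisper's fixed input window (its feature extractor pads or trims each example to 30 s), and pydub slices by milliseconds, so audio[i:i + chunk_length_ms] is a time window rather than a sample range. A standalone sketch of the same split, assuming a hypothetical local file speech.wav:

from pydub import AudioSegment

audio = AudioSegment.from_file("speech.wav")  # hypothetical input file
chunk_length_ms = 30 * 1000
chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
print(len(chunks))  # a 75 s file yields 3 chunks: 30 s, 30 s, 15 s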
@@ -37,7 +43,13 @@ def translate_audio(filepath):
             print(f"Exported chunk {i} to {chunk_path}")
 
             waveform, sample_rate = torchaudio.load(chunk_path)
-
+
+            # Resample if needed
+            if sample_rate != 16000:
+                print(f"Resampling from {sample_rate} Hz to 16000 Hz")
+                resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
+                waveform = resampler(waveform)
+                sample_rate = 16000
 
             inputs = processor(waveform[0], sampling_rate=sample_rate, return_tensors="pt")
 
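Note: the resampling step matters because Whisper's processor assumes 16 kHz input, while exported chunks typically keep the source rate (often 44.1 or 48 kHz). A self-contained sketch of the same conversion on synthetic audio, so it runs without any file:

import torch
import torchaudio.transforms as T

sample_rate = 44100
waveform = torch.randn(1, sample_rate)  # one second of mono noise, shape (channels, samples)

if sample_rate != 16000:
    resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)
    sample_rate = 16000

print(waveform.shape)  # torch.Size([1, 16000])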
@@ -76,9 +88,8 @@ file_transcribe = gr.Interface(
     allow_flagging="never"
 )
 
-#
+# Create tabbed demo
 demo = gr.Blocks()
-
 with demo:
     gr.TabbedInterface(
         [mic_transcribe, file_transcribe],