Spaces:

Tamerstito
/

translate-audio

Sleeping

App Files Files Community

Tamerstito committed on Apr 7

Commit

df7a732

verified ·

1 Parent(s): 283d3e6

Upload 2 files

Browse files

Files changed (2) hide show

app.py +7 -24
requirements.txt +2 -2

app.py CHANGED Viewed

@@ -20,60 +20,44 @@ def translate_audio(filepath):
         if filepath is None or not os.path.exists(filepath):
             return "No audio file received or file does not exist."
-        # Lazy-load model and processor
         if model is None:
             print("Loading Whisper model...")
             model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
             processor = WhisperProcessor.from_pretrained("openai/whisper-small")
             forced_decoder_ids = processor.get_decoder_prompt_ids(
-                task="translate", language="en"
             )
-            print("Model loaded and decoder ids set.")
         audio = AudioSegment.from_file(filepath).set_channels(1)
-        print("Audio loaded. Duration (ms):", len(audio))
         chunk_length_ms = 30 * 1000
         chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
-        print(f"Audio split into {len(chunks)} chunks.")
         full_translation = ""
         for i, chunk in enumerate(chunks):
             chunk_path = f"chunk_{i}.wav"
             chunk.export(chunk_path, format="wav")
-            print(f"Exported chunk {i} to {chunk_path}")
             waveform, sample_rate = torchaudio.load(chunk_path)
-            # Resample if necessary
             if sample_rate != 16000:
-                print(f"Resampling from {sample_rate} Hz to 16000 Hz")
-                resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
-                waveform = resampler(waveform)
-            # Convert to mono
             waveform = waveform.mean(dim=0)
-            inputs = processor(
-                waveform,
-                sampling_rate=16000,
-                return_tensors="pt"
-            )
             with torch.no_grad():
                 generated_ids = model.generate(
                     inputs["input_features"],
-                    forced_decoder_ids=forced_decoder_ids
                 )
             translation = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            print(f"Chunk {i} translation:", translation)
             full_translation += translation + " "
             os.remove(chunk_path)
-        print("Full translation done.")
         return full_translation.strip()
     except Exception as e:
@@ -81,7 +65,6 @@ def translate_audio(filepath):
         traceback.print_exc()
         return f"An error occurred: {str(e)}"
-# Gradio UI
 mic_transcribe = gr.Interface(
     fn=translate_audio,
     inputs=gr.Audio(sources="microphone", type="filepath"),

         if filepath is None or not os.path.exists(filepath):
             return "No audio file received or file does not exist."
+        # Load Whisper
         if model is None:
             print("Loading Whisper model...")
             model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
             processor = WhisperProcessor.from_pretrained("openai/whisper-small")
             forced_decoder_ids = processor.get_decoder_prompt_ids(
+                task="translate", language="es"
             )
+            print("Model and processor ready.")
         audio = AudioSegment.from_file(filepath).set_channels(1)
         chunk_length_ms = 30 * 1000
         chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
         full_translation = ""
         for i, chunk in enumerate(chunks):
             chunk_path = f"chunk_{i}.wav"
             chunk.export(chunk_path, format="wav")
             waveform, sample_rate = torchaudio.load(chunk_path)
             if sample_rate != 16000:
+                waveform = T.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
             waveform = waveform.mean(dim=0)
+            inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
             with torch.no_grad():
                 generated_ids = model.generate(
                     inputs["input_features"],
+                    forced_decoder_ids=forced_decoder_ids,
+                    suppress_tokens=[]
                 )
             translation = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
             full_translation += translation + " "
             os.remove(chunk_path)
         return full_translation.strip()
     except Exception as e:
         traceback.print_exc()
         return f"An error occurred: {str(e)}"
 mic_transcribe = gr.Interface(
     fn=translate_audio,
     inputs=gr.Audio(sources="microphone", type="filepath"),

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-transformers==4.36.2
 torch
 torchaudio
-pydub
 gradio

 --extra-index-url https://download.pytorch.org/whl/cpu
 torch
 torchaudio
+transformers==4.36.2
 gradio
+pydub