Spaces:

Tamerstito
/

translate-audio

Sleeping

App Files Files Community

Tamerstito committed on Apr 7

Commit

d07df4a

verified ·

1 Parent(s): df7a732

Upload 2 files

Browse files

Files changed (1) hide show

app.py +56 -76

app.py CHANGED Viewed

@@ -1,90 +1,70 @@
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
-import torchaudio
-import torchaudio.transforms as T
 import torch
-import os
 import gradio as gr
 from pydub import AudioSegment
-import traceback
-# Lazy-load components
-# All three stay None until the first translate_audio call fills them in.
-model = None
-processor = None
-forced_decoder_ids = None
 def translate_audio(filepath):
-    """Decode the audio at ``filepath`` with Whisper in 30-second chunks.
-
-    On the first call the "openai/whisper-small" model and processor are
-    loaded and cached in module globals. The audio is down-mixed to mono,
-    split into 30-second chunks, and each chunk is written to
-    ``chunk_<i>.wav`` in the working directory, reloaded, resampled to
-    16 kHz when needed, and decoded.
-
-    Args:
-        filepath: Path to the audio file, or ``None``.
-
-    Returns:
-        The chunk texts joined by spaces; a "no audio" message when the
-        path is missing; or an "An error occurred: ..." message, since
-        every exception is caught, printed with a traceback, and reported
-        as text instead of being raised.
-    """
-    global model, processor, forced_decoder_ids
-    try:
-        print("Received filepath:", filepath)
-        if filepath is None or not os.path.exists(filepath):
-            return "No audio file received or file does not exist."
-        # Load Whisper
-        if model is None:
-            print("Loading Whisper model...")
-            model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-            processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-            forced_decoder_ids = processor.get_decoder_prompt_ids(
-                task="translate", language="es"
-            )
-            print("Model and processor ready.")
-        audio = AudioSegment.from_file(filepath).set_channels(1)
-        chunk_length_ms = 30 * 1000
-        # pydub slices by milliseconds, so each slice is at most 30 s long
-        chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
-        full_translation = ""
-        for i, chunk in enumerate(chunks):
-            # Round-trip through a WAV file so torchaudio can load the chunk
-            chunk_path = f"chunk_{i}.wav"
-            chunk.export(chunk_path, format="wav")
-            waveform, sample_rate = torchaudio.load(chunk_path)
-            if sample_rate != 16000:
-                waveform = T.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
-            # Average over the first dimension, leaving a 1-D waveform
-            waveform = waveform.mean(dim=0)
-            inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
-            with torch.no_grad():
-                generated_ids = model.generate(
-                    inputs["input_features"],
-                    forced_decoder_ids=forced_decoder_ids,
-                    suppress_tokens=[]
-                )
-            translation = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            full_translation += translation + " "
-            # NOTE(review): only reached on success; a failure above leaves the chunk file behind
-            os.remove(chunk_path)
-        return full_translation.strip()
-    except Exception as e:
-        print("ERROR:", str(e))
-        traceback.print_exc()
-        return f"An error occurred: {str(e)}"
-# Interface fed by a microphone recording; the audio reaches translate_audio as a file path
-mic_transcribe = gr.Interface(
     fn=translate_audio,
     inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.Textbox(label="Translation (English to Spanish)", lines=3),
-    allow_flagging="never"
 )
-# Interface fed by an uploaded audio file; same handler and output box layout
-file_transcribe = gr.Interface(
     fn=translate_audio,
     inputs=gr.Audio(sources="upload", type="filepath"),
-    outputs=gr.Textbox(label="Translation (English to Spanish)", lines=3),
-    allow_flagging="never"
 )
-# Present both interfaces as tabs inside a single Blocks app
-demo = gr.Blocks()
-with demo:
-    gr.TabbedInterface(
-        [mic_transcribe, file_transcribe],
-        ["Translate Microphone", "Translate Audio File"]
-    )
-# Port comes from the PORT environment variable, falling back to 7860
-server_port = int(os.environ.get("PORT", 7860))
-demo.launch(share=True, server_port=server_port)

 import torch
+import torchaudio
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import gradio as gr
 from pydub import AudioSegment
+import os
+# Load the Whisper checkpoint and its processor once, when the module is imported
+model_id = "openai/whisper-small"
+model = WhisperForConditionalGeneration.from_pretrained(model_id)
+processor = WhisperProcessor.from_pretrained(model_id)
+# Inference only: put the model in eval mode and turn autograd off for the whole process
+model.eval()
+torch.set_grad_enabled(False)
+# Decoder prompt ids that force task="translate" with language="es"
+# NOTE(review): Whisper's translate task is documented as speech -> English, with
+# `language` naming the spoken language; confirm this matches the intended
+# "English to Spanish" direction shown in the UI labels.
+forced_decoder_ids = processor.get_decoder_prompt_ids(task="translate", language="es")
 def translate_audio(filepath):
+    """Decode the speech in an audio file with Whisper, 30 seconds at a time.
+
+    The audio is down-mixed to mono and cut into 30-second chunks. Each
+    chunk is resampled to 16 kHz when needed and decoded with the
+    module-level ``model`` and ``processor`` using ``forced_decoder_ids``.
+
+    Args:
+        filepath: Path to the audio file handed over by the Gradio audio
+            component, or ``None`` when nothing was recorded or uploaded.
+
+    Returns:
+        The decoded chunk texts joined by single spaces, or the message
+        "No audio file received." when ``filepath`` is ``None`` or does
+        not exist.
+    """
+    import tempfile
+
+    if filepath is None or not os.path.exists(filepath):
+        return "No audio file received."
+    audio = AudioSegment.from_file(filepath).set_channels(1)
+    chunk_length_ms = 30 * 1000
+    chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
+    # Keep prompt + generated tokens within the decoder's position limit; a
+    # flat 448 on top of the forced prompt can exceed it. The prompt is the
+    # start token followed by the forced ids.
+    prompt_length = len(forced_decoder_ids) + 1
+    max_new_tokens = min(448, model.config.max_target_positions - prompt_length)
+    pieces = []
+    # A private temporary directory keeps concurrent requests from overwriting
+    # each other's chunk files and removes them even when decoding fails.
+    with tempfile.TemporaryDirectory() as workdir:
+        for idx, chunk in enumerate(chunks):
+            chunk_path = os.path.join(workdir, f"chunk_{idx}.wav")
+            chunk.export(chunk_path, format="wav")
+            waveform, sr = torchaudio.load(chunk_path)
+            # Resample if needed
+            if sr != 16000:
+                resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+                waveform = resampler(waveform)
+            waveform = waveform.mean(dim=0)  # collapse the channel dimension
+            inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
+            predicted_ids = model.generate(
+                inputs["input_features"],
+                forced_decoder_ids=forced_decoder_ids,
+                max_new_tokens=max_new_tokens
+            )
+            result = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            pieces.append(result.strip())
+    return " ".join(pieces).strip()
+# Microphone tab: the recording is passed to translate_audio as a file path
+mic_input = gr.Audio(sources="microphone", type="filepath")
+mic_output = gr.Textbox(label="Translated Text (English to Spanish)")
+mic_ui = gr.Interface(fn=translate_audio, inputs=mic_input, outputs=mic_output)
+# Upload tab: same handler, fed by an uploaded file instead
+file_input = gr.Audio(sources="upload", type="filepath")
+file_output = gr.Textbox(label="Translated Text (English to Spanish)")
+file_ui = gr.Interface(fn=translate_audio, inputs=file_input, outputs=file_output)
+# Show both interfaces as tabs and start the app with default launch settings
+tab_interfaces = [mic_ui, file_ui]
+tab_titles = ["Microphone Input", "Upload File"]
+app = gr.TabbedInterface(tab_interfaces, tab_titles)
+app.launch()