# translate-audio / app.py
# Speech-to-translation demo: transcribes English audio with Whisper and
# translates the transcript to Spanish with MarianMT, served via Gradio.
# Standard library
import os
import tempfile

# Third-party
import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
# Load Whisper model for transcription
# (openai/whisper-small checkpoint; processor handles feature extraction
# and token decoding for the same checkpoint).
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model.eval()  # inference mode (disables dropout etc.)
# Inference-only app: disabling autograd globally avoids building graphs.
torch.set_grad_enabled(False)
# Load MarianMT model for translation (English → Spanish)
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
translator = MarianMTModel.from_pretrained(translation_model_name)
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
def translate_text(text):
    """Translate *text* from English to Spanish using the MarianMT model.

    Input longer than the model's maximum length is truncated by the
    tokenizer before generation.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    output_ids = translator.generate(**encoded)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
def transcribe_and_translate(filepath):
    """Transcribe an audio file with Whisper, then translate it to Spanish.

    Parameters
    ----------
    filepath : str | None
        Path to an audio file readable by pydub/ffmpeg (as provided by the
        Gradio Audio component with type="filepath").

    Returns
    -------
    str
        Spanish translation of the full English transcript, or a message
        when no usable file was provided.
    """
    if filepath is None or not os.path.exists(filepath):
        return "No audio file provided."
    # Force mono; Whisper expects a single channel.
    audio = AudioSegment.from_file(filepath).set_channels(1)
    chunk_length_ms = 30 * 1000  # Whisper is trained on 30-second windows
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    transcript_parts = []
    for chunk in chunks:
        # Unique temp file per chunk: the previous fixed "chunk_{i}.wav"
        # name in the CWD collided between concurrent Gradio requests.
        # try/finally guarantees cleanup even if load/inference raises.
        fd, chunk_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            chunk.export(chunk_path, format="wav")
            waveform, sr = torchaudio.load(chunk_path)
        finally:
            os.remove(chunk_path)
        if sr != 16000:
            # Whisper's feature extractor requires 16 kHz input.
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
        waveform = waveform.mean(dim=0)  # (channels, time) -> 1-D mono tensor
        inputs = whisper_processor(waveform, sampling_rate=16000, return_tensors="pt")
        predicted_ids = whisper_model.generate(inputs["input_features"])
        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        transcript_parts.append(transcription.strip())
    full_transcript = " ".join(transcript_parts).strip()
    # Don't feed an empty string to MarianMT — it can hallucinate output.
    if not full_transcript:
        return ""
    return translate_text(full_transcript)
def _build_audio_interface(source):
    # Both tabs run the identical pipeline; only the audio source differs.
    return gr.Interface(
        fn=transcribe_and_translate,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Textbox(label="Translation (English Speech → Spanish Text)"),
    )

mic_ui = _build_audio_interface("microphone")
file_ui = _build_audio_interface("upload")

# Two tabs: record live from the microphone, or upload an existing file.
app = gr.TabbedInterface([mic_ui, file_ui], ["Microphone Input", "Upload File"])
app.launch()