"""Gradio app: transcribe English speech with Whisper, translate it to Spanish.

Audio from the microphone or an uploaded file is cut into 30-second pieces,
each piece is transcribed with ``openai/whisper-small``, and the joined
transcript is translated with ``Helsinki-NLP/opus-mt-en-es``.
"""

import os
import re
import tempfile

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

WHISPER_MODEL_NAME = "openai/whisper-small"
TARGET_SAMPLE_RATE = 16000  # sampling rate passed to the Whisper processor
CHUNK_LENGTH_MS = 30 * 1000  # audio is transcribed in 30-second pieces
TRANSLATION_BATCH_SIZE = 8  # segments translated per generate() call
# Upper bound on words per translated segment, chosen to stay well below the
# tokenizer's truncation limit even when the transcript has no punctuation.
MAX_WORDS_PER_SEGMENT = 100
OUTPUT_LABEL = "Translation (English Speech → Spanish Text)"

# Split after sentence-ending punctuation followed by whitespace.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

# Load Whisper model for transcription
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
whisper_model.eval()
# Inference only: disable autograd globally for this process.
torch.set_grad_enabled(False)

# Load MarianMT model for translation (English → Spanish)
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
translator = MarianMTModel.from_pretrained(translation_model_name)
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)


def _split_for_translation(text, max_words=MAX_WORDS_PER_SEGMENT):
    """Split *text* into sentence-sized segments of at most *max_words* words."""
    segments = []
    for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
        words = sentence.split()
        # A sentence without punctuation can be arbitrarily long; window it so
        # no single segment gets clipped by the tokenizer's truncation.
        for start in range(0, len(words), max_words):
            segments.append(" ".join(words[start:start + max_words]))
    return segments


def translate_text(text):
    """Translate English *text* to Spanish.

    The text is translated segment by segment rather than as one sequence:
    with ``truncation=True`` a single long sequence would be silently cut at
    the tokenizer's maximum length, dropping the tail of long transcripts.

    Args:
        text: English text; may contain several sentences.

    Returns:
        The Spanish translation, or an empty string when *text* is blank.
    """
    segments = _split_for_translation(text)
    if not segments:
        return ""

    translated = []
    for start in range(0, len(segments), TRANSLATION_BATCH_SIZE):
        batch = segments[start:start + TRANSLATION_BATCH_SIZE]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        outputs = translator.generate(**inputs)
        translated.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return " ".join(translated)


def _transcribe_waveform(waveform):
    """Return the Whisper transcription of a mono 16 kHz waveform tensor."""
    inputs = whisper_processor(
        waveform.numpy(), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    )
    predicted_ids = whisper_model.generate(inputs["input_features"])
    decoded = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return decoded[0].strip()


def transcribe_and_translate(filepath):
    """Transcribe the audio file at *filepath* and translate it to Spanish.

    Args:
        filepath: Path to an audio file as supplied by the Gradio ``Audio``
            component, or ``None`` when nothing was recorded or uploaded.

    Returns:
        The Spanish translation, or a short message when no file was given
        or no speech could be transcribed.
    """
    if filepath is None or not os.path.exists(filepath):
        return "No audio file provided."

    audio = AudioSegment.from_file(filepath).set_channels(1)

    transcripts = []
    resampler = None  # built once; every chunk shares the source sample rate
    # A private temporary directory keeps concurrent requests from clobbering
    # each other's chunk files and guarantees cleanup even if decoding fails.
    with tempfile.TemporaryDirectory() as workdir:
        chunk_path = os.path.join(workdir, "chunk.wav")
        for start_ms in range(0, len(audio), CHUNK_LENGTH_MS):
            chunk = audio[start_ms:start_ms + CHUNK_LENGTH_MS]
            # export() returns the open output handle; close it so the data
            # is flushed and the file can be removed afterwards.
            chunk.export(chunk_path, format="wav").close()
            waveform, sr = torchaudio.load(chunk_path)

            if sr != TARGET_SAMPLE_RATE:
                if resampler is None:
                    resampler = torchaudio.transforms.Resample(
                        orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
                    )
                waveform = resampler(waveform)

            # Collapse the channel dimension to a 1-D signal.
            transcripts.append(_transcribe_waveform(waveform.mean(dim=0)))

    full_transcript = " ".join(piece for piece in transcripts if piece)
    if not full_transcript:
        return "No speech detected in the audio."
    return translate_text(full_transcript)


def _build_interface(source):
    """Create the translation interface for one audio *source*."""
    return gr.Interface(
        fn=transcribe_and_translate,
        inputs=gr.Audio(sources=[source], type="filepath"),
        outputs=gr.Textbox(label=OUTPUT_LABEL),
    )


mic_ui = _build_interface("microphone")
file_ui = _build_interface("upload")

app = gr.TabbedInterface([mic_ui, file_ui], ["Microphone Input", "Upload File"])

if __name__ == "__main__":
    app.launch()