import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from deep_translator import GoogleTranslator  # replaced googletrans with deep-translator
from gtts import gTTS
from transformers import WhisperForConditionalGeneration, WhisperProcessor


class RealTimeTranslator:
    """Speech-to-speech translation pipeline.

    Pipeline stages: microphone audio -> Whisper transcription ->
    Google Translate (via deep-translator) -> gTTS synthesis.
    """

    def __init__(self):
        # Whisper "tiny" keeps memory/CPU usage low at some accuracy cost.
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        # Use GPU when available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)

        # Supported languages: ISO 639-1 code -> display name.
        self.languages = {
            'en': 'English',
            'fr': 'French',
            'hi': 'Hindi',
            'es': 'Spanish',
            'de': 'German',
            'ja': 'Japanese'
        }

    def speech_to_text(self, audio_path, source_lang):
        """Transcribe the audio file at ``audio_path`` using Whisper.

        Args:
            audio_path: Path to an audio file readable by librosa.
            source_lang: ISO 639-1 code of the spoken language; used to
                force Whisper's decoder language instead of relying on
                auto-detection (which is unreliable on short clips).

        Returns:
            The transcription string, or an "Error in speech-to-text: ..."
            message on failure (callers check for the "Error" substring).
        """
        try:
            # Whisper expects 16 kHz mono input.
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(
                audio, sampling_rate=16000, return_tensors="pt"
            ).input_features
            input_features = input_features.to(self.device)

            # Fix: the original ignored source_lang entirely; force the
            # decoder to transcribe in the declared source language.
            forced_decoder_ids = self.processor.get_decoder_prompt_ids(
                language=source_lang, task="transcribe"
            )
            predicted_ids = self.model.generate(
                input_features, forced_decoder_ids=forced_decoder_ids
            )

            transcription = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )
            return transcription[0]
        except Exception as e:
            return f"Error in speech-to-text: {str(e)}"

    def translate_text(self, text, source_lang, target_lang):
        """Translate ``text`` from ``source_lang`` to ``target_lang``.

        Returns the translated string, or an "Error in translation: ..."
        message on failure.
        """
        try:
            translation = GoogleTranslator(
                source=source_lang, target=target_lang
            ).translate(text)
            return translation
        except Exception as e:
            return f"Error in translation: {str(e)}"

    def text_to_speech(self, text, target_lang):
        """Synthesize ``text`` to an audio file via gTTS.

        Returns the path of the generated file (caller is responsible for
        deleting it), or an "Error in text-to-speech: ..." message.
        """
        try:
            # Fix: gTTS always writes MP3 data, so the temp file must carry
            # an .mp3 suffix (the original's '.wav' suffix was misleading).
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as fp:
                tts = gTTS(text=text, lang=target_lang)
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            return f"Error in text-to-speech: {str(e)}"

    def process_audio(self, audio, source_lang, target_lang):
        """Complete pipeline: Speech → Text → Translation → Speech.

        Args:
            audio: Gradio numpy audio, a ``(sample_rate, data)`` tuple,
                or None when no input was provided.
            source_lang: ISO 639-1 code of the input speech.
            target_lang: ISO 639-1 code for the translated output.

        Returns:
            ``((sample_rate, samples), original_text, translated_text)`` on
            success; ``(None, message, message)`` on failure.
        """
        audio_path = None
        output_audio_path = None
        try:
            if audio is None:
                return None, "No audio input received", "Please provide audio input"

            # Persist the input so librosa can read it from disk.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
                sf.write(fp.name, audio[1], audio[0])
                audio_path = fp.name

            # Speech to text.
            text = self.speech_to_text(audio_path, source_lang)
            if "Error" in text:
                return None, text, ""

            # Translate text.
            translated_text = self.translate_text(text, source_lang, target_lang)
            if "Error" in translated_text:
                return None, text, translated_text

            # Text to speech.
            output_audio_path = self.text_to_speech(translated_text, target_lang)
            if "Error" in output_audio_path:
                result = None, text, translated_text
                output_audio_path = None  # not a real file path; don't unlink it
                return result

            # Load the generated audio back as a numpy array for Gradio.
            output_audio, sr = librosa.load(output_audio_path)
            return (sr, output_audio), text, translated_text
        except Exception as e:
            return None, f"Error: {str(e)}", f"Error: {str(e)}"
        finally:
            # Fix: the original leaked temp files on every error path;
            # always clean up whatever was actually created.
            for path in (audio_path, output_audio_path):
                if path and os.path.exists(path):
                    os.unlink(path)


def create_gradio_interface():
    """Build the Gradio UI wrapping a RealTimeTranslator instance."""
    translator = RealTimeTranslator()

    demo = gr.Interface(
        fn=translator.process_audio,
        inputs=[
            gr.Audio(sources=["microphone"], type="numpy", label="Input Audio"),
            gr.Dropdown(choices=list(translator.languages.keys()), value="en",
                        label="Source Language"),
            gr.Dropdown(choices=list(translator.languages.keys()), value="fr",
                        label="Target Language")
        ],
        outputs=[
            gr.Audio(label="Translated Audio"),
            gr.Textbox(label="Original Text"),
            gr.Textbox(label="Translated Text")
        ],
        title="Real-time Language Translator",
        description="Speak in your language and get instant translation in the target language. Please ensure your device is set to speakerphone mode for best results.",
        examples=[
            [None, "en", "fr"],
            [None, "hi", "en"],
            [None, "es", "ja"]
        ]
    )
    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(share=True, debug=True)