|
import os |
|
import torch |
|
import gradio as gr |
|
import numpy as np |
|
from transformers import WhisperProcessor, WhisperForConditionalGeneration |
|
from deep_translator import GoogleTranslator |
|
from gtts import gTTS |
|
import librosa |
|
import tempfile |
|
import soundfile as sf |
|
|
|
class RealTimeTranslator:
    """Speech-to-speech translator pipeline.

    Stages: Whisper (speech -> text), Google Translate (text -> text),
    gTTS (text -> speech). Methods return user-facing ``"Error ..."``
    strings rather than raising, so the Gradio UI can display failures.
    """

    def __init__(self):
        """Load the Whisper model/processor and define supported languages."""
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        # Run inference on GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)

        # ISO 639-1 code -> display name. These codes are shared by
        # Whisper, Google Translate and gTTS.
        self.languages = {
            'en': 'English',
            'fr': 'French',
            'hi': 'Hindi',
            'es': 'Spanish',
            'de': 'German',
            'ja': 'Japanese'
        }

    def speech_to_text(self, audio_path, source_lang):
        """Convert speech to text using Whisper.

        Args:
            audio_path: Path to an audio file readable by librosa.
            source_lang: ISO 639-1 code of the spoken language.

        Returns:
            The transcription string, or an ``"Error in speech-to-text: ..."``
            message on failure.
        """
        try:
            # Whisper expects 16 kHz mono input.
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(audio, sampling_rate=16000, return_tensors="pt").input_features
            input_features = input_features.to(self.device)

            # Fix: the original ignored source_lang entirely, leaving Whisper
            # to auto-detect the language (unreliable on short clips).
            # Force the decoder to transcribe in the selected language.
            forced_decoder_ids = self.processor.get_decoder_prompt_ids(
                language=source_lang, task="transcribe"
            )
            predicted_ids = self.model.generate(
                input_features, forced_decoder_ids=forced_decoder_ids
            )

            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
            return transcription[0]
        except Exception as e:
            return f"Error in speech-to-text: {str(e)}"

    def translate_text(self, text, source_lang, target_lang):
        """Translate text using Google Translate.

        Returns the translated string, or an ``"Error in translation: ..."``
        message on failure.
        """
        try:
            translation = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
            return translation
        except Exception as e:
            return f"Error in translation: {str(e)}"

    def text_to_speech(self, text, target_lang):
        """Convert text to speech using gTTS.

        Returns the path of the generated audio file, or an
        ``"Error in text-to-speech: ..."`` message on failure.
        """
        try:
            # Fix: gTTS always produces MP3 data; the original saved it with a
            # misleading '.wav' suffix. librosa loads MP3 fine downstream.
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as fp:
                tts = gTTS(text=text, lang=target_lang)
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            return f"Error in text-to-speech: {str(e)}"

    def process_audio(self, audio, source_lang, target_lang):
        """Complete pipeline: Speech β Text β Translation β Speech.

        Args:
            audio: Gradio numpy audio tuple ``(sample_rate, samples)`` or None.
            source_lang: ISO 639-1 code of the input speech.
            target_lang: ISO 639-1 code for the translated output.

        Returns:
            ``((sample_rate, samples), original_text, translated_text)`` on
            success; ``(None, message, message)`` on failure.
        """
        try:
            if audio is None:
                return None, "No audio input received", "Please provide audio input"

            input_path = None
            output_path = None
            try:
                # Gradio 'numpy' audio is (sample_rate, samples).
                sample_rate, samples = audio
                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
                    sf.write(fp.name, samples, sample_rate)
                    input_path = fp.name

                # Speech -> text. Error strings short-circuit the pipeline.
                text = self.speech_to_text(input_path, source_lang)
                if "Error" in text:
                    return None, text, ""

                # Text -> translated text.
                translated_text = self.translate_text(text, source_lang, target_lang)
                if "Error" in translated_text:
                    return None, text, translated_text

                # Translated text -> speech.
                output_path = self.text_to_speech(translated_text, target_lang)
                if "Error" in output_path:
                    return None, text, translated_text

                output_audio, sr = librosa.load(output_path)
                return (sr, output_audio), text, translated_text
            finally:
                # Fix: the original only deleted the temp files on the success
                # path, leaking them on every early return. Always clean up.
                # (output_path may hold an error message; exists() is False then.)
                for path in (input_path, output_path):
                    if path and os.path.exists(path):
                        os.unlink(path)

        except Exception as e:
            return None, f"Error: {str(e)}", f"Error: {str(e)}"
|
|
|
def create_gradio_interface():
    """Build and return the Gradio Interface wrapping the translator pipeline."""
    translator = RealTimeTranslator()
    lang_codes = list(translator.languages.keys())

    # Microphone in; translated audio plus both text stages out.
    input_widgets = [
        gr.Audio(sources=["microphone"], type="numpy", label="Input Audio"),
        gr.Dropdown(choices=lang_codes, value="en", label="Source Language"),
        gr.Dropdown(choices=lang_codes, value="fr", label="Target Language"),
    ]
    output_widgets = [
        gr.Audio(label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
    ]

    return gr.Interface(
        fn=translator.process_audio,
        inputs=input_widgets,
        outputs=output_widgets,
        title="Real-time Language Translator",
        description="Speak in your language and get instant translation in the target language. Please ensure your device is set to speakerphone mode for best results.",
        examples=[
            [None, "en", "fr"],
            [None, "hi", "en"],
            [None, "es", "ja"],
        ],
    )
|
|
|
if __name__ == "__main__":

    # Build the UI and serve it; share=True exposes a public Gradio link,
    # debug=True keeps the process attached with live error output.
    demo = create_gradio_interface()

    demo.launch(share=True, debug=True)