File size: 5,063 Bytes
56df39f
 
 
 
 
5c73c9b
56df39f
 
 
 
 
 
 
7dd66b8
 
 
56df39f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dd66b8
 
 
 
 
56df39f
7dd66b8
 
56df39f
7dd66b8
 
 
 
 
56df39f
 
7dd66b8
6ee873f
5c73c9b
 
6ee873f
7dd66b8
56df39f
 
 
7dd66b8
 
 
 
 
 
 
56df39f
 
 
 
 
 
 
 
 
 
 
 
 
 
7dd66b8
 
56df39f
 
 
7dd66b8
 
56df39f
 
 
7dd66b8
 
56df39f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dd66b8
56df39f
 
 
7dd66b8
56df39f
 
 
 
 
 
 
 
 
7dd66b8
56df39f
 
 
 
 
 
 
 
 
 
7dd66b8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import torch
import gradio as gr
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from deep_translator import GoogleTranslator  # Replaced googletrans with deep-translator
from gtts import gTTS
import librosa
import tempfile
import soundfile as sf

class RealTimeTranslator:
    """Speech-to-speech translator: Whisper ASR -> Google Translate -> gTTS."""

    def __init__(self):
        # Whisper "tiny" keeps memory/CPU usage low (at some accuracy cost).
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        # Use GPU if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)

        # ISO 639-1 codes accepted by Whisper, Google Translate and gTTS alike.
        self.languages = {
            'en': 'English',
            'fr': 'French',
            'hi': 'Hindi',
            'es': 'Spanish',
            'de': 'German',
            'ja': 'Japanese'
        }

    def speech_to_text(self, audio_path, source_lang):
        """Transcribe the audio file at ``audio_path`` using Whisper.

        Args:
            audio_path: Path to an audio file readable by librosa.
            source_lang: ISO 639-1 code of the spoken language; forces the
                Whisper decoder language instead of auto-detection.

        Returns:
            The transcription string, or an ``"Error in speech-to-text: ..."``
            message on failure.
        """
        try:
            # Whisper expects 16 kHz mono input.
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(audio, sampling_rate=16000, return_tensors="pt").input_features
            input_features = input_features.to(self.device)

            # BUG FIX: source_lang was previously ignored, letting Whisper
            # auto-detect the language — unreliable on short/ambiguous clips.
            # Force the user-selected language and the "transcribe" task.
            forced_ids = self.processor.get_decoder_prompt_ids(
                language=source_lang, task="transcribe"
            )
            predicted_ids = self.model.generate(input_features, forced_decoder_ids=forced_ids)

            # Decode token ids to text
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
            return transcription[0]
        except Exception as e:
            return f"Error in speech-to-text: {str(e)}"

    def translate_text(self, text, source_lang, target_lang):
        """Translate ``text`` between the given ISO 639-1 language codes.

        Returns the translation, or an ``"Error in translation: ..."``
        message on failure.
        """
        try:
            translation = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
            return translation
        except Exception as e:
            return f"Error in translation: {str(e)}"

    def text_to_speech(self, text, target_lang):
        """Synthesize ``text`` in ``target_lang``; return the audio file path.

        Returns the path to a temp file (caller is responsible for deleting
        it), or an ``"Error in text-to-speech: ..."`` message on failure.
        """
        try:
            # BUG FIX: gTTS always emits MP3 data regardless of the filename;
            # the file was previously suffixed '.wav', mislabeling the
            # container for downstream decoders.
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as fp:
                tts = gTTS(text=text, lang=target_lang)
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            return f"Error in text-to-speech: {str(e)}"

    def process_audio(self, audio, source_lang, target_lang):
        """Complete pipeline: Speech → Text → Translation → Speech.

        Args:
            audio: Gradio numpy audio tuple ``(sample_rate, samples)``, or
                None when no recording was provided.
            source_lang: ISO 639-1 code of the spoken language.
            target_lang: ISO 639-1 code of the desired output language.

        Returns:
            ``(translated_audio, original_text, translated_text)`` where
            ``translated_audio`` is ``(sample_rate, samples)`` or None on
            failure; error details are carried in the text fields.
        """
        try:
            if audio is None:
                return None, "No audio input received", "Please provide audio input"

            # Save input audio temporarily (gradio numpy audio is (sr, data)).
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
                sf.write(fp.name, audio[1], audio[0])
                audio_path = fp.name

            # BUG FIX: temp files previously leaked on every early-error
            # return; finally-blocks guarantee cleanup on all paths.
            try:
                # Speech to text
                text = self.speech_to_text(audio_path, source_lang)
                if "Error" in text:
                    return None, text, ""

                # Translate text
                translated_text = self.translate_text(text, source_lang, target_lang)
                if "Error" in translated_text:
                    return None, text, translated_text

                # Text to speech
                output_audio_path = self.text_to_speech(translated_text, target_lang)
                if "Error" in output_audio_path:
                    return None, text, translated_text

                # Load the generated audio for the Gradio output component.
                try:
                    output_audio, sr = librosa.load(output_audio_path)
                finally:
                    os.unlink(output_audio_path)
            finally:
                os.unlink(audio_path)

            return (sr, output_audio), text, translated_text

        except Exception as e:
            return None, f"Error: {str(e)}", f"Error: {str(e)}"

def create_gradio_interface():
    """Build and return the Gradio demo wired to a RealTimeTranslator."""
    translator = RealTimeTranslator()
    lang_codes = list(translator.languages.keys())

    return gr.Interface(
        fn=translator.process_audio,
        inputs=[
            gr.Audio(sources=["microphone"], type="numpy", label="Input Audio"),
            gr.Dropdown(choices=lang_codes, value="en", label="Source Language"),
            gr.Dropdown(choices=lang_codes, value="fr", label="Target Language"),
        ],
        outputs=[
            gr.Audio(label="Translated Audio"),
            gr.Textbox(label="Original Text"),
            gr.Textbox(label="Translated Text"),
        ],
        title="Real-time Language Translator",
        description=(
            "Speak in your language and get instant translation in the target "
            "language. Please ensure your device is set to speakerphone mode "
            "for best results."
        ),
        examples=[
            [None, "en", "fr"],
            [None, "hi", "en"],
            [None, "es", "ja"],
        ],
    )

if __name__ == "__main__":
    # share=True publishes a temporary public Gradio link; debug=True keeps
    # the process attached and prints tracebacks to the console.
    create_gradio_interface().launch(share=True, debug=True)