File size: 5,063 Bytes
56df39f 5c73c9b 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 6ee873f 5c73c9b 6ee873f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 56df39f 7dd66b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import os
import torch
import gradio as gr
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from deep_translator import GoogleTranslator # Replaced googletrans with deep-translator
from gtts import gTTS
import librosa
import tempfile
import soundfile as sf
class RealTimeTranslator:
    """Speech-to-speech translation pipeline.

    Stages: Whisper (speech -> text) -> GoogleTranslator (text -> text)
    -> gTTS (text -> speech). Each stage returns an "Error ..." string on
    failure instead of raising, and process_audio() propagates those to the UI.
    """

    def __init__(self):
        # "tiny" checkpoint keeps memory/CPU usage low at some accuracy cost.
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
        # Use GPU when available; Whisper tiny also runs acceptably on CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        # ISO 639-1 code -> display name; keys double as gTTS/translator codes.
        self.languages = {
            'en': 'English',
            'fr': 'French',
            'hi': 'Hindi',
            'es': 'Spanish',
            'de': 'German',
            'ja': 'Japanese'
        }

    def speech_to_text(self, audio_path, source_lang):
        """Transcribe the audio file at *audio_path* using Whisper.

        Args:
            audio_path: path to an audio file readable by librosa.
            source_lang: ISO 639-1 code of the spoken language.

        Returns:
            The transcription string, or an "Error in speech-to-text: ..."
            string on failure.
        """
        try:
            # Whisper expects 16 kHz mono input.
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.processor(
                audio, sampling_rate=16000, return_tensors="pt"
            ).input_features
            input_features = input_features.to(self.device)
            # FIX: the original ignored source_lang and let Whisper auto-detect,
            # so the user's explicit language choice had no effect. Force the
            # decoder to transcribe in the selected language.
            forced_decoder_ids = self.processor.get_decoder_prompt_ids(
                language=source_lang, task="transcribe"
            )
            predicted_ids = self.model.generate(
                input_features, forced_decoder_ids=forced_decoder_ids
            )
            transcription = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )
            return transcription[0]
        except Exception as e:
            return f"Error in speech-to-text: {str(e)}"

    def translate_text(self, text, source_lang, target_lang):
        """Translate *text* from source_lang to target_lang via Google Translate.

        Returns the translated string, or an "Error in translation: ..."
        string on failure.
        """
        try:
            translation = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
            return translation
        except Exception as e:
            return f"Error in translation: {str(e)}"

    def text_to_speech(self, text, target_lang):
        """Synthesize *text* as speech in *target_lang* with gTTS.

        Returns the path of the generated audio file, or an
        "Error in text-to-speech: ..." string on failure.
        """
        try:
            # FIX: gTTS always emits MP3 data; the original saved it with a
            # misleading '.wav' suffix. librosa loads it either way, but the
            # correct extension avoids confusing downstream consumers.
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as fp:
                tts = gTTS(text=text, lang=target_lang)
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            return f"Error in text-to-speech: {str(e)}"

    def process_audio(self, audio, source_lang, target_lang):
        """Complete pipeline: speech -> text -> translation -> speech.

        Args:
            audio: Gradio numpy audio tuple (sample_rate, samples), or None.
            source_lang / target_lang: ISO 639-1 language codes.

        Returns:
            ((sample_rate, samples) or None, original_text, translated_text).
        """
        try:
            if audio is None:
                return None, "No audio input received", "Please provide audio input"
            # Persist the Gradio (sample_rate, samples) tuple so librosa/Whisper
            # can read it from disk.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
                sf.write(fp.name, audio[1], audio[0])
                audio_path = fp.name
            output_audio_path = None
            try:
                text = self.speech_to_text(audio_path, source_lang)
                if "Error" in text:
                    return None, text, ""
                translated_text = self.translate_text(text, source_lang, target_lang)
                if "Error" in translated_text:
                    return None, text, translated_text
                output_audio_path = self.text_to_speech(translated_text, target_lang)
                if "Error" in output_audio_path:
                    return None, text, translated_text
                # Load the synthesized speech back as numpy for the Gradio output.
                output_audio, sr = librosa.load(output_audio_path)
                return (sr, output_audio), text, translated_text
            finally:
                # FIX: the original only cleaned up on full success, leaking
                # temp files on every error path.
                for path in (audio_path, output_audio_path):
                    if path and not path.startswith("Error") and os.path.exists(path):
                        os.unlink(path)
        except Exception as e:
            return None, f"Error: {str(e)}", f"Error: {str(e)}"
def create_gradio_interface():
    """Build and return the Gradio Interface wired to a RealTimeTranslator."""
    translator = RealTimeTranslator()

    # Dropdowns expose the supported ISO 639-1 codes directly.
    lang_codes = list(translator.languages.keys())
    microphone_input = gr.Audio(sources=["microphone"], type="numpy", label="Input Audio")
    source_dropdown = gr.Dropdown(choices=lang_codes, value="en", label="Source Language")
    target_dropdown = gr.Dropdown(choices=lang_codes, value="fr", label="Target Language")

    return gr.Interface(
        fn=translator.process_audio,
        inputs=[microphone_input, source_dropdown, target_dropdown],
        outputs=[
            gr.Audio(label="Translated Audio"),
            gr.Textbox(label="Original Text"),
            gr.Textbox(label="Translated Text"),
        ],
        title="Real-time Language Translator",
        description="Speak in your language and get instant translation in the target language. Please ensure your device is set to speakerphone mode for best results.",
        examples=[
            [None, "en", "fr"],
            [None, "hi", "en"],
            [None, "es", "ja"],
        ],
    )
if __name__ == "__main__":
    # share=True publishes a temporary public Gradio URL; debug=True surfaces
    # pipeline errors in the console.
    create_gradio_interface().launch(share=True, debug=True)