import gradio as gr
import os
import torch
import torchaudio
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    MarianMTModel, MarianTokenizer,
    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan,
)
from datasets import load_dataset
import ffmpeg
import soundfile as sf
import numpy as np
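
# Assumed environment: the ffmpeg CLI on PATH, plus gradio, torch, torchaudio,
# transformers, datasets, soundfile, ffmpeg-python, sentencepiece (needed by
# the Marian tokenizer) and numpy installed; recent transformers is assumed.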
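
# Working directories for uploaded media, intermediate audio, and final videos.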
UPLOAD_FOLDER = "uploads"
OUTPUT_FOLDER = "outputs"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
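
# Run on GPU when available; cudnn autotuning helps fixed-size workloads.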
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cudnn.benchmark = True
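
# Model checkpoints (Hugging Face Hub IDs).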
WHISPER_MODEL = "openai/whisper-tiny"
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-pt"
TTS_MODEL = "microsoft/speecht5_tts"
VOCODER_MODEL = "microsoft/speecht5_hifigan"

print("Loading models...")
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL).to(device)

translation_tokenizer = MarianTokenizer.from_pretrained(TRANSLATION_MODEL)
translation_model = MarianMTModel.from_pretrained(TRANSLATION_MODEL).to(device)

tts_processor = SpeechT5Processor.from_pretrained(TTS_MODEL)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL).to(device)
# generate_speech() returns a raw spectrogram unless a vocoder is supplied,
# so load the companion HiFi-GAN vocoder as well.
tts_vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_MODEL).to(device)
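
# SpeechT5 has no default voice: generate_speech() needs a 512-dim speaker
# x-vector. Following the Hugging Face SpeechT5 example, borrow one from the
# CMU ARCTIC x-vector dataset (index 7306 is a US English female speaker).
speaker_ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(speaker_ds[7306]["xvector"]).unsqueeze(0).to(device)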


def transcribe_audio(audio_path):
    waveform, sample_rate = torchaudio.load(audio_path)

    # Whisper is trained on 16 kHz audio; resample anything else.
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000

    # The feature extractor pads/truncates to 30 s of log-mel features, so
    # only the first 30 s of audio are transcribed without extra chunking.
    inputs = whisper_processor(
        waveform.squeeze().numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).to(device)

    with torch.inference_mode():
        # The checkpoint is multilingual; pin it to English transcription,
        # since the pipeline translates en -> pt downstream.
        predicted_ids = whisper_model.generate(**inputs, language="en", task="transcribe")

    return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
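

# Machine translation (English -> Portuguese) with Marian.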
def translate_text(text):
    # opus-mt-tc-big-en-pt is a multi-target model; the >>por<< language tag
    # selects generic Portuguese output.
    inputs = translation_tokenizer(">>por<< " + text, return_tensors="pt", truncation=True).to(device)
    with torch.inference_mode():
        translated_ids = translation_model.generate(**inputs)
    return translation_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
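

# Text-to-speech with SpeechT5 + HiFi-GAN.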
def synthesize_speech(text, output_path):
    inputs = tts_processor(text=text, return_tensors="pt").to(device)
    with torch.inference_mode():
        speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=tts_vocoder
        )

    # SpeechT5 emits 16 kHz mono; duplicate the channel for a stereo file and
    # write at the native rate (44.1 kHz here would play back sped up).
    sf.write(output_path, np.tile(speech.cpu().numpy(), (2, 1)).T, 16000, subtype='PCM_16')
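

# Remux helper: copy the video stream, re-encode audio to AAC.
# (Not called by the Gradio flow below; kept as a utility.)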
def process_video(video_path, output_path):
    (
        ffmpeg
        .input(video_path)
        .output(output_path, vcodec='copy', acodec='aac', strict='experimental')
        .global_args('-loglevel', 'error')
        .run(overwrite_output=True, cmd='ffmpeg')
    )
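

# End-to-end pipeline: extract audio -> transcribe -> translate -> synthesize
# speech -> mux the new audio back onto the original video stream.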
def translate_video(video, progress=gr.Progress()):
    try:
        video_path = video

        progress(0.1, "Extracting audio...")
        audio_path = os.path.join(UPLOAD_FOLDER, "audio.wav")
        # 16 kHz mono WAV is what the Whisper front end expects.
        (
            ffmpeg
            .input(video_path)
            .output(audio_path, ac=1, ar=16000)
            .global_args('-loglevel', 'error')
            .run(overwrite_output=True)
        )

        progress(0.3, "Transcribing...")
        transcription = transcribe_audio(audio_path)

        progress(0.5, "Translating...")
        translated_text = translate_text(transcription)

        progress(0.7, "Synthesizing speech...")
        synthesized_audio = os.path.join(UPLOAD_FOLDER, "synthesized_audio.wav")
        synthesize_speech(translated_text, synthesized_audio)

        progress(0.9, "Assembling video...")
        output_path = os.path.join(OUTPUT_FOLDER, "translated_video.mp4")
        # Map the original video stream and the synthesized audio into one
        # container: two separate inputs, copied video, AAC-encoded audio.
        video_in = ffmpeg.input(video_path)
        audio_in = ffmpeg.input(synthesized_audio)
        (
            ffmpeg
            .output(video_in.video, audio_in.audio, output_path, vcodec='copy', acodec='aac')
            .global_args('-loglevel', 'error')
            .run(overwrite_output=True)
        )

        return output_path

    except Exception as e:
        # Raise instead of returning a string: gr.Video cannot render text.
        raise gr.Error(f"Error: {str(e)}") from e
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Input Video"),
    outputs=gr.Video(label="Translated Video"),
    title="🚀 Ultra-Fast Video Translator",
    description="Upload a video and get the Portuguese version with translated audio!",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch(server_port=7860, show_error=True)