import gradio as gr
import os
import torch
import torchaudio
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    MarianMTModel, MarianTokenizer,
    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
)
from datasets import load_dataset
import ffmpeg
import soundfile as sf
import numpy as np
# Configuration
UPLOAD_FOLDER = "uploads"
OUTPUT_FOLDER = "outputs"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# Select the device (GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cudnn.benchmark = True  # let cuDNN pick the fastest kernels on NVIDIA GPUs
# Load the models once at startup so every request reuses them
WHISPER_MODEL = "openai/whisper-tiny"  # smallest, fastest Whisper checkpoint
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-pt"  # English -> Portuguese Marian model
TTS_MODEL = "microsoft/speecht5_tts"
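# Caveat: microsoft/speecht5_tts is trained on English speech, so synthesizing
# the Portuguese translation with it will carry a noticeable English accent; a
# Portuguese-capable TTS checkpoint would be a drop-in improvement here.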
# Eager model initialization
print("Loading models...")
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL).to(device)
translation_tokenizer = MarianTokenizer.from_pretrained(TRANSLATION_MODEL)
translation_model = MarianMTModel.from_pretrained(TRANSLATION_MODEL).to(device)
tts_processor = SpeechT5Processor.from_pretrained(TTS_MODEL)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL).to(device)
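# SpeechT5 needs a separate HiFi-GAN vocoder to turn its spectrograms into a
# waveform, plus a 512-dim speaker x-vector embedding. A minimal sketch,
# assuming the Matthijs/cmu-arctic-xvectors dataset commonly paired with
# SpeechT5 in the Hugging Face docs (index 7306 is one of the stock US-English
# voices); any 512-dim x-vector would work.
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
speaker_embeddings = torch.tensor(
    load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
).unsqueeze(0).to(device)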
# Pipeline steps
def transcribe_audio(audio_path):
    waveform, sample_rate = torchaudio.load(audio_path)
    # Whisper expects 16 kHz mono; the extraction step already produces that,
    # but resample defensively in case this helper is reused elsewhere.
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    # The feature extractor pads/truncates to Whisper's fixed 30-second window,
    # so only the first 30 seconds are transcribed per call.
    inputs = whisper_processor(
        waveform.squeeze().numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt"
    ).to(device)
    with torch.inference_mode():
        predicted_ids = whisper_model.generate(**inputs)
    return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
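# For audio longer than 30 seconds, one option (a sketch, not wired into the
# app) is the transformers ASR pipeline, which chunks and stitches for you:
#
#   from transformers import pipeline
#   asr = pipeline("automatic-speech-recognition", model=WHISPER_MODEL,
#                  chunk_length_s=30, device=0 if device == "cuda" else -1)
#   transcription = asr(audio_path)["text"]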
def translate_text(text):
    # truncation=True caps the input at the model's 512-token limit, so very
    # long transcripts are cut off (see the batching sketch below).
    inputs = translation_tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.inference_mode():
        translated_ids = translation_model.generate(**inputs)
    return translation_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
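# A hedged sketch of sentence-level batching for transcripts that exceed the
# 512-token limit (naive split on ". "; a proper sentence splitter would be
# more robust):
#
#   def translate_long(text):
#       sentences = [s for s in text.split(". ") if s]
#       batch = translation_tokenizer(sentences, return_tensors="pt",
#                                     padding=True, truncation=True).to(device)
#       with torch.inference_mode():
#           out = translation_model.generate(**batch)
#       return ". ".join(
#           translation_tokenizer.batch_decode(out, skip_special_tokens=True))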
def synthesize_speech(text, output_path):
    inputs = tts_processor(text=text, return_tensors="pt").to(device)
    with torch.inference_mode():
        speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=tts_vocoder
        )
    # SpeechT5 + HiFi-GAN output 16 kHz mono; duplicate the channel to get a
    # stereo track and keep the native rate (FFmpeg re-encodes to AAC later).
    sf.write(output_path, np.tile(speech.cpu().numpy(), (2, 1)).T, 16000, subtype='PCM_16')
def process_video(video_path, output_path):
    # Stream-copy the video track and re-encode the audio to AAC.
    # (Kept as a standalone helper; the main flow below muxes directly.)
    (
        ffmpeg
        .input(video_path)
        .output(output_path, vcodec='copy', acodec='aac', strict='experimental')
        .global_args('-loglevel', 'error')  # keep FFmpeg quiet
        .run(overwrite_output=True, cmd='ffmpeg')
    )
# Main flow
def translate_video(video, progress=gr.Progress()):
    try:
        # Gradio passes the uploaded file's path as a string
        video_path = video
        # Step 1: extract the audio track as 16 kHz mono WAV (what Whisper expects)
        progress(0.1, "Extracting audio...")
        audio_path = os.path.join(UPLOAD_FOLDER, "audio.wav")
        (
            ffmpeg
            .input(video_path)
            .output(audio_path, ac=1, ar=16000)
            .global_args('-loglevel', 'error')
            .run(overwrite_output=True)
        )
        # Step 2: transcription
        progress(0.3, "Transcribing...")
        transcription = transcribe_audio(audio_path)
        # Step 3: translation
        progress(0.5, "Translating...")
        translated_text = translate_text(transcription)
        # Step 4: speech synthesis
        progress(0.7, "Synthesizing speech...")
        synthesized_audio = os.path.join(UPLOAD_FOLDER, "synthesized_audio.wav")
        synthesize_speech(translated_text, synthesized_audio)
        # Step 5: mux the synthesized audio over the original video track
        progress(0.9, "Assembling video...")
        output_path = os.path.join(OUTPUT_FOLDER, "video_traduzido.mp4")
        video_in = ffmpeg.input(video_path)
        audio_in = ffmpeg.input(synthesized_audio)
        (
            ffmpeg
            # stream-copy the video, encode the new audio to AAC, and stop at
            # the shorter of the two streams
            .output(video_in.video, audio_in.audio, output_path,
                    vcodec='copy', acodec='aac', shortest=None)
            .global_args('-loglevel', 'error')
            .run(overwrite_output=True)
        )
        return output_path
    except Exception as e:
        # Surface the failure in the UI instead of returning a bogus file path
        raise gr.Error(str(e))
# Gradio interface
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Input Video"),
    outputs=gr.Video(label="Translated Video"),
    title="🚀 Ultra-Fast Video Translator",
    description="Upload a video and get the Portuguese version with translated audio!",
    allow_flagging="never"
)
if __name__ == "__main__":
    iface.queue()  # gr.Progress updates are only delivered when the queue is enabled
    iface.launch(server_port=7860, show_error=True)