File size: 4,494 Bytes
6c3da4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963a155
 
6c3da4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133d81e
6c3da4f
c7ace10
 
 
133d81e
 
6c3da4f
 
 
133d81e
 
6c3da4f
 
 
133d81e
 
6c3da4f
 
 
133d81e
 
6c3da4f
 
 
133d81e
 
6c3da4f
 
 
133d81e
 
6c3da4f
 
 
 
 
 
 
 
 
 
 
 
 
 
39d7fcf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import os
import torch
import torchaudio
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    SpeechT5Processor, SpeechT5ForTextToSpeech,
    MarianMTModel, MarianTokenizer
)
import ffmpeg
import soundfile as sf

# Settings: working directories for intermediate files and final videos
UPLOAD_FOLDER = "uploads"
OUTPUT_FOLDER = "outputs"

# exist_ok=True replaces the check-then-create pattern, which can fail when
# another process creates the directory between the check and the makedirs call.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# Select the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Usando dispositivo: {device}")

# Load every pretrained model once, at module import, and move it to `device`.

# Speech-to-text (Whisper)
_whisper_checkpoint = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(_whisper_checkpoint)
whisper_model = WhisperForConditionalGeneration.from_pretrained(_whisper_checkpoint)
whisper_model = whisper_model.to(device)

# Text-to-speech (SpeechT5)
_tts_checkpoint = "microsoft/speecht5_tts"
tts_processor = SpeechT5Processor.from_pretrained(_tts_checkpoint)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(_tts_checkpoint)
tts_model = tts_model.to(device)

# Text translation (Marian); the original comment calls this an alternative translation model
translation_model_name = "Helsinki-NLP/opus-mt-tc-big-en-pt"
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name)
translation_model = translation_model.to(device)

# Funções auxiliares
def transcribe_audio(audio_path):
    """Transcribe the speech contained in an audio file with Whisper.

    The audio is down-mixed to mono, resampled to the rate expected by the
    Whisper feature extractor, and processed window by window so that
    recordings longer than a single Whisper window are transcribed in full
    rather than being cut off after the first window.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        The transcribed text, with the per-window transcriptions joined by a
        single space. An empty string is returned when the file has no samples
        or no text is recognised.
    """
    feature_extractor = whisper_processor.feature_extractor
    target_rate = feature_extractor.sampling_rate
    # Number of samples in one Whisper input window (chunk length * rate).
    window_size = feature_extractor.n_samples

    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio returns (channels, frames); average the channels so the
    # processor always receives a single 1-D signal.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # The feature extractor rejects audio that is not at its own rate.
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    # Feature extraction works on NumPy/CPU data; only the resulting features
    # are moved to the model device.
    samples = waveform.numpy()

    pieces = []
    for start in range(0, len(samples), window_size):
        window = samples[start:start + window_size]
        inputs = whisper_processor(window, sampling_rate=target_rate, return_tensors="pt")
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs.input_features.to(device))
        decoded = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
        window_text = decoded[0].strip()
        if window_text:
            pieces.append(window_text)

    return " ".join(pieces)

def synthesize_speech(text, output_path):
    """Synthesize speech for ``text`` with SpeechT5 and write it to a WAV file.

    Args:
        text: Text to be spoken.
        output_path: Destination path of the audio file written by soundfile.

    Notes:
        * ``generate_speech`` only returns a waveform when a vocoder is
          supplied, so the HiFi-GAN vocoder is loaded on first use and cached
          on the function object.
        * The model object carries no speaker embedding of its own. If a
          ``speaker_embeddings`` attribute has been attached to ``tts_model``
          it is used; otherwise an all-zero x-vector is used as a placeholder.
          NOTE(review): replace the placeholder with a real speaker x-vector
          for natural-sounding output.
    """
    # Imported locally so this block does not depend on the file-level import list.
    from transformers import SpeechT5HifiGan

    vocoder = getattr(synthesize_speech, "_vocoder", None)
    if vocoder is None:
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
        synthesize_speech._vocoder = vocoder

    # The SpeechT5 processor selects its tokenizer through the `text` keyword.
    inputs = tts_processor(text=text, return_tensors="pt").to(device)

    speaker_embeddings = getattr(tts_model, "speaker_embeddings", None)
    if speaker_embeddings is None:
        speaker_embeddings = torch.zeros(
            (1, tts_model.config.speaker_embedding_dim), device=device
        )

    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    # Write at the vocoder's own output rate instead of a hard-coded value.
    sf.write(output_path, speech.cpu().numpy(), samplerate=vocoder.config.sampling_rate)

def translate_text(text, target_language="pt"):
    """Translate text to Portuguese with the Marian translation model.

    The text is split into sentences that are translated in small batches, so
    long inputs are not silently cut at the tokenizer's maximum length. Each
    sentence is prefixed with the ``>>id<<`` target-language token that
    multi-target OPUS-MT models use to select the output language.

    Args:
        text: Source text. ``None``, empty or whitespace-only input yields
            an empty string.
        target_language: ``"pt"``/``"pt-pt"`` for European Portuguese or
            ``"pt-br"`` for Brazilian Portuguese. Unknown values fall back to
            ``"pt"``.

    Returns:
        The translated sentences joined by single spaces.
    """
    import re

    if not text or not text.strip():
        return ""

    # NOTE(review): language labels taken from the model card — confirm.
    language_tokens = {"pt": ">>por<<", "pt-pt": ">>por<<", "pt-br": ">>pob<<"}
    prefix = language_tokens.get(str(target_language).lower(), language_tokens["pt"])

    # Split after sentence-ending punctuation; text without punctuation stays
    # as one segment (and is then still subject to truncation).
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]
    prepared = [f"{prefix} {sentence}" for sentence in sentences]

    batch_size = 8
    translated = []
    for start in range(0, len(prepared), batch_size):
        batch = prepared[start:start + batch_size]
        inputs = translation_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        ).to(device)
        with torch.no_grad():
            translated_ids = translation_model.generate(**inputs)
        translated.extend(
            translation_tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
        )

    return " ".join(piece.strip() for piece in translated)

def extract_audio(video_path, audio_path):
    """Extract the audio of a video into ``audio_path`` using ffmpeg.

    The output is requested with one audio channel (``ac=1``) and a sample
    rate of 16000 (``ar="16000"``). An existing output file is overwritten.
    """
    source = ffmpeg.input(video_path)
    job = source.output(audio_path, ac=1, ar="16000")
    job.run(overwrite_output=True)

def replace_audio_in_video(video_path, audio_path, output_path):
    """Combine the video stream of one file with the audio stream of another.

    The video stream is taken from ``video_path`` with ``vcodec="copy"`` and
    the audio stream from ``audio_path`` is encoded with ``acodec="aac"``.
    The result is written to ``output_path``, overwriting any existing file.
    """
    picture = ffmpeg.input(video_path).video
    sound = ffmpeg.input(audio_path).audio
    codec_options = {"vcodec": "copy", "acodec": "aac"}
    muxed = ffmpeg.output(picture, sound, output_path, **codec_options)
    muxed.run(overwrite_output=True)

# Função principal para Gradio
def translate_video(video, progress=gr.Progress()):
    """Gradio handler: dub an uploaded video with synthesized translated speech.

    Steps: extract the audio track, transcribe it, translate the transcript,
    synthesize the translation and mux the new audio into the original video.

    Args:
        video: Path of the uploaded video file, as passed by Gradio.
        progress: Gradio progress tracker, updated before each step.

    Returns:
        Path of the generated video inside ``OUTPUT_FOLDER``.

    Raises:
        gr.Error: When no video was provided or any step fails. The output
            component is a video, so a returned error string would be treated
            as a file path; raising shows the message in the UI instead.
    """
    import uuid

    if not video:
        raise gr.Error("Erro: nenhum vídeo foi enviado.")

    # Gradio passes the file path as a string
    video_path = video

    # Unique names per request so concurrent calls do not overwrite each
    # other's intermediate or output files.
    job_id = uuid.uuid4().hex
    audio_path = os.path.join(UPLOAD_FOLDER, f"audio_{job_id}.wav")
    synthesized_audio_path = os.path.join(UPLOAD_FOLDER, f"synthesized_audio_{job_id}.wav")
    output_video_path = os.path.join(OUTPUT_FOLDER, f"translated_video_{job_id}.mp4")

    try:
        progress(0, desc="Extraindo áudio do vídeo...")
        extract_audio(video_path, audio_path)

        progress(0.25, desc="Transcrevendo áudio...")
        transcribed_text = transcribe_audio(audio_path)
        print("Texto transcrito:", transcribed_text)

        progress(0.5, desc="Traduzindo texto...")
        translated_text = translate_text(transcribed_text, target_language="pt")
        print("Texto traduzido:", translated_text)

        progress(0.75, desc="Sintetizando áudio em português...")
        synthesize_speech(translated_text, synthesized_audio_path)

        progress(0.9, desc="Substituindo áudio no vídeo...")
        replace_audio_in_video(video_path, synthesized_audio_path, output_video_path)

        progress(1.0, desc="Concluído!")
        return output_video_path

    except Exception as e:
        # Top-level boundary of the request: surface the failure in the UI.
        raise gr.Error(f"Erro: {str(e)}") from e

    finally:
        # Intermediate audio files are not needed once the request ends.
        for temp_path in (audio_path, synthesized_audio_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass

# Gradio interface: one video input, one video output, handled by translate_video.
# NOTE(review): the description advertises any source language, while the
# translation checkpoint named in this file is "en-pt" — confirm the claim.
_interface_options = {
    "fn": translate_video,
    "inputs": gr.Video(),
    "outputs": gr.Video(),
    "title": "Tradutor de Vídeo",
    "description": "Carregue um vídeo em qualquer idioma e ele será traduzido para português.",
}
iface = gr.Interface(**_interface_options)

# Start the web server when the module is executed or imported.
iface.launch()