import gradio as gr
import os
import torch
import torchaudio
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan,
    MarianMTModel, MarianTokenizer
)
from datasets import load_dataset
import ffmpeg
import soundfile as sf
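
# Pipeline overview: extract the audio track with ffmpeg, transcribe it with
# Whisper, translate the transcription to Portuguese with MarianMT, synthesize
# Portuguese speech with SpeechT5, and mux the new audio track back into the video.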

# Configuration
UPLOAD_FOLDER = "uploads"
OUTPUT_FOLDER = "outputs"

os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Check whether a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# Initialize models
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# SpeechT5 needs a vocoder to turn spectrograms into waveforms, plus a speaker
# embedding; here we use an x-vector from the CMU ARCTIC dataset (speaker 7306).
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(speaker_embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

# Use an alternative translation model (English -> Portuguese)
translation_model_name = "Helsinki-NLP/opus-mt-tc-big-en-pt"
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name).to(device)

# Helper functions
def transcribe_audio(audio_path):
    # Whisper expects 16 kHz mono audio; extract_audio below already produces
    # that, but downmix/resample defensively for other inputs.
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    inputs = whisper_processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt").to(device)
    with torch.no_grad():
        predicted_ids = whisper_model.generate(inputs.input_features)
    # Note: Whisper transcribes in the source language and covers ~30 s per pass;
    # the en->pt translation model below assumes an English transcription.
    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def synthesize_speech(text, output_path):
    inputs = tts_processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        # generate_speech needs a speaker embedding and a vocoder to return a waveform
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=tts_vocoder)
    # SpeechT5 produces audio at 16 kHz
    sf.write(output_path, speech.cpu().numpy(), samplerate=16000)

def translate_text(text, target_language="por"):
    # The opus-mt-tc-big multilingual models expect a target-language token at
    # the start of the input, e.g. ">>por<<" for Portuguese.
    inputs = translation_tokenizer(f">>{target_language}<< {text}", return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        translated_ids = translation_model.generate(**inputs)
    translated_text = translation_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

def extract_audio(video_path, audio_path):
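    # Mono, 16 kHz WAV: the format Whisper expects downstream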
    ffmpeg.input(video_path).output(audio_path, ac=1, ar="16000").run(overwrite_output=True)

def replace_audio_in_video(video_path, audio_path, output_path):
    # Read the sample rate of the original audio track (fall back to 44.1 kHz
    # if the video has no audio stream)
    probe = ffmpeg.probe(video_path)
    audio_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'audio'), None)
    original_sample_rate = int(audio_stream['sample_rate']) if audio_stream else 44100

    # Resample the synthesized audio to match the original video's sample rate
    converted_audio_path = os.path.join(UPLOAD_FOLDER, "converted_audio.wav")
    ffmpeg.input(audio_path).output(converted_audio_path, ar=original_sample_rate).run(overwrite_output=True)

    # Replace the audio track, copying the video stream untouched
    video = ffmpeg.input(video_path)
    audio = ffmpeg.input(converted_audio_path)
    ffmpeg.output(video.video, audio.audio, output_path, vcodec="copy", acodec="aac").run(overwrite_output=True)

# Main function for Gradio
def translate_video(video, progress=gr.Progress()):
    try:
        # Gradio passes the uploaded file path as a string
        video_path = video

        # Extract audio
        progress(0.1, desc="Extraindo áudio do vídeo...")
        audio_path = os.path.join(UPLOAD_FOLDER, "audio.wav")
        extract_audio(video_path, audio_path)

        # Transcribe audio
        progress(0.3, desc="Transcrevendo áudio...")
        transcribed_text = transcribe_audio(audio_path)
        print("Texto transcrito:", transcribed_text)

        # Translate text
        progress(0.5, desc="Traduzindo texto...")
        translated_text = translate_text(transcribed_text, target_language="por")
        print("Texto traduzido:", translated_text)

        # Synthesize audio
        progress(0.7, desc="Sintetizando áudio em português...")
        synthesized_audio_path = os.path.join(UPLOAD_FOLDER, "synthesized_audio.wav")
        synthesize_speech(translated_text, synthesized_audio_path)

        # Replace the audio in the video
        progress(0.9, desc="Substituindo áudio no vídeo...")
        output_video_path = os.path.join(OUTPUT_FOLDER, "translated_video.mp4")
        replace_audio_in_video(video_path, synthesized_audio_path, output_video_path)

        # Finish progress
        progress(1.0, desc="Concluído!")
        return output_video_path

    except Exception as e:
        # Surface the error in the UI instead of returning a string where a video path is expected
        raise gr.Error(f"Erro: {str(e)}")

# Gradio interface
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(),
    outputs=gr.Video(),
    title="Tradutor de Vídeo",
    description="Carregue um vídeo em qualquer idioma e ele será traduzido para português."
)

iface.launch()