RXTIME committed on
Commit
6c3da4f
·
verified ·
1 Parent(s): 6b5ece4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import shutil

import gradio as gr
import soundfile as sf
import torch
import torchaudio
import ffmpeg
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan,
    MarianMTModel, MarianTokenizer
)

# Configuration: working directories for uploaded inputs and rendered output.
UPLOAD_FOLDER = "uploads"
OUTPUT_FOLDER = "outputs"

# exist_ok=True replaces the original check-then-create pair, which was
# both more verbose and racy if two workers started at once.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Select the compute device once, up front: CUDA when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# Speech-to-text: Whisper (small checkpoint).
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

# Text-to-speech: Microsoft SpeechT5.
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)

# Machine translation: MarianMT, English -> Portuguese only.
translation_model_name = "Helsinki-NLP/opus-mt-en-pt"
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name).to(device)

# Helper functions
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with Whisper.

    Returns the decoded transcription of the single batch item as a string.

    Fixes over the original:
    - multichannel input is mixed down to mono (``squeeze()`` on a stereo
      tensor left a 2-D array that the feature extractor rejects);
    - input is resampled to 16 kHz, the only rate Whisper's feature
      extractor accepts (files from ``extract_audio`` are already 16 kHz);
    - the waveform is no longer moved to ``device`` only to be pulled back
      to CPU for the processor — only the model inputs go to ``device``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    inputs = whisper_processor(
        waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        predicted_ids = whisper_model.generate(inputs.input_features)
    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def synthesize_speech(text, output_path):
    """Synthesize speech for *text* and write it as WAV to *output_path*.

    BUG FIX: the original passed ``tts_model.speaker_embeddings`` — an
    attribute that does not exist on SpeechT5ForTextToSpeech — and called
    ``generate_speech`` without a vocoder, which returns a spectrogram
    rather than a waveform; it then wrote that at 22050 Hz although
    SpeechT5 operates at a fixed 16 kHz.
    """
    global _tts_vocoder, _tts_speaker_embedding
    # Lazily build the vocoder and speaker embedding on first use so module
    # import stays cheap.
    if "_tts_vocoder" not in globals():
        _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
        # TODO(review): a zero x-vector produces a generic/neutral voice;
        # load a real speaker embedding (e.g. CMU Arctic x-vectors) for a
        # more natural result.
        _tts_speaker_embedding = torch.zeros((1, 512), device=device)
    inputs = tts_processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"], _tts_speaker_embedding, vocoder=_tts_vocoder
        )
    # SpeechT5's output sampling rate is fixed at 16 kHz.
    sf.write(output_path, speech.cpu().numpy(), samplerate=16000)

def translate_text(text, target_language="pt"):
    """Translate *text* from English to Portuguese with MarianMT.

    ``target_language`` is accepted for interface compatibility but is not
    consulted: the loaded checkpoint (opus-mt-en-pt) only produces
    Portuguese.
    """
    encoded = translation_tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        output_ids = translation_model.generate(**encoded)
    return translation_tokenizer.decode(output_ids[0], skip_special_tokens=True)

def extract_audio(video_path, audio_path):
    """Extract the soundtrack of *video_path* to *audio_path* as mono 16 kHz audio."""
    stream = ffmpeg.input(video_path)
    stream = stream.output(audio_path, ac=1, ar="16000")
    stream.run(overwrite_output=True)

def replace_audio_in_video(video_path, audio_path, output_path):
    """Mux the video track of *video_path* with the audio of *audio_path*.

    The video stream is copied untouched; the audio is re-encoded as AAC.
    """
    video_in = ffmpeg.input(video_path)
    audio_in = ffmpeg.input(audio_path)
    muxed = ffmpeg.output(
        video_in.video, audio_in.audio, output_path, vcodec="copy", acodec="aac"
    )
    muxed.run(overwrite_output=True)

# Main entry point for Gradio
def translate_video(video):
    """Dub a video's speech into Portuguese and return the output file path.

    Pipeline: extract audio -> Whisper transcription -> MarianMT
    translation -> SpeechT5 synthesis -> remux audio into the video.

    ``video`` may be a file path (what modern ``gr.Video`` delivers) or raw
    bytes. On failure the error is returned as a string (kept for
    compatibility; NOTE(review): a string will not render in a gr.Video
    output — raising ``gr.Error`` would surface it to the user).
    """
    video_path = os.path.join(UPLOAD_FOLDER, "input_video.mp4")
    # BUG FIX: gr.Video passes a filepath (str), not bytes; the original
    # unconditionally did f.write(video), which raises TypeError on a str.
    if isinstance(video, (bytes, bytearray)):
        with open(video_path, "wb") as f:
            f.write(video)
    else:
        shutil.copy(str(video), video_path)

    try:
        # 1. Extract the soundtrack (mono, 16 kHz) from the video.
        audio_path = os.path.join(UPLOAD_FOLDER, "audio.wav")
        extract_audio(video_path, audio_path)

        # 2. Speech -> text.
        transcribed_text = transcribe_audio(audio_path)
        print("Texto transcrito:", transcribed_text)

        # 3. Translate to Portuguese.
        translated_text = translate_text(transcribed_text, target_language="pt")
        print("Texto traduzido:", translated_text)

        # 4. Text -> speech.
        synthesized_audio_path = os.path.join(UPLOAD_FOLDER, "synthesized_audio.wav")
        synthesize_speech(translated_text, synthesized_audio_path)

        # 5. Remux the synthesized audio into the original video.
        output_video_path = os.path.join(OUTPUT_FOLDER, "translated_video.mp4")
        replace_audio_in_video(video_path, synthesized_audio_path, output_video_path)

        return output_video_path

    except Exception as e:
        return f"Erro: {str(e)}"

# Gradio interface: one video in, one dubbed video out.
iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(),
    outputs=gr.Video(),
    title="Tradutor de Vídeo",
    description="Carregue um vídeo em qualquer idioma e ele será traduzido para português."
)

# Launch only when executed as a script, so the module can also be imported
# (e.g. by a hosting runtime or tests) without starting a server.
if __name__ == "__main__":
    iface.launch()