import os

import ffmpeg
import numpy as np
import torch
from pydub import AudioSegment
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

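# Pipeline overview: extract the audio track from the input video, transcribe it with
# Whisper, translate the transcript from English to Portuguese with MarianMT, synthesize
# Portuguese speech with SpeechT5, and mux the synthesized audio back into the video.
# Requires the ffmpeg binary on PATH (used by ffmpeg-python) plus the transformers,
# torch, numpy, ffmpeg-python, and pydub packages.
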
def transcribe_audio(audio_path):
    print("Transcribing audio...")

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")

    # chunk_length_s=30 lets the pipeline transcribe long audio in 30-second windows.
    result = asr(audio_path, chunk_length_s=30)
    text = result["text"]
    print(f"Transcribed text: {text}")
    return text


def translate_text(text):
    print("Translating text into Portuguese...")

    model_name = "Helsinki-NLP/opus-mt-en-pt"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # truncation=True keeps the input within the model's 512-token limit; very long
    # transcripts are cut off here (see the sentence-by-sentence sketch below).
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    translated_ids = model.generate(**inputs)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print(f"Translated text: {translated_text}")
    return translated_text


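# Optional sketch, not called from main(): because the tokenizer above truncates long
# inputs, a long transcript can instead be split into sentences and translated as a
# batch. The naive split on ". " is an illustrative assumption, not part of the
# original pipeline.
def translate_text_by_sentence(text):
    model_name = "Helsinki-NLP/opus-mt-en-pt"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    sentences = [s.strip() for s in text.split(". ") if s.strip()]
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated_ids = model.generate(**inputs)
    return " ".join(
        tokenizer.decode(ids, skip_special_tokens=True) for ids in translated_ids
    )

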
def synthesize_speech(text, output_path="output_speech.wav"):
    print("Synthesizing Portuguese speech...")

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # A random 512-dimensional x-vector yields an arbitrary, unnatural voice; see the
    # load_speaker_embedding sketch below for a more natural alternative.
    speaker_embeddings = torch.randn((1, 512))

    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # generate_speech returns float samples in [-1, 1] at 16 kHz; convert them to
    # 16-bit PCM so that sample_width=2 matches the raw bytes handed to pydub.
    speech_np = (speech.numpy() * 32767).astype(np.int16)
    AudioSegment(
        speech_np.tobytes(), frame_rate=16000, sample_width=2, channels=1
    ).export(output_path, format="wav")
    print(f"Synthesized audio saved to: {output_path}")
    return output_path


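# Optional sketch, not called from main(): load a real speaker x-vector instead of the
# torch.randn placeholder above. This assumes the `datasets` package and the public
# "Matthijs/cmu-arctic-xvectors" dataset are available; neither is part of the
# original script.
def load_speaker_embedding(index=7306):
    from datasets import load_dataset

    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[index]["xvector"]).unsqueeze(0)

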
def replace_audio_in_video(input_video, new_audio, output_video):
    print("Replacing the audio track in the video...")
    try:
        # Take the video stream from the original file and the audio stream from the
        # synthesized file, copying the video and re-encoding the audio as AAC.
        video_stream = ffmpeg.input(input_video).video
        audio_stream = ffmpeg.input(new_audio).audio
        (
            ffmpeg
            .output(video_stream, audio_stream, output_video, vcodec="copy", acodec="aac")
            .run(overwrite_output=True)
        )
        print(f"Video with replaced audio saved to: {output_video}")
    except Exception as e:
        print(f"Error replacing audio: {e}")


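# For reference, the stream mapping above corresponds roughly to:
#   ffmpeg -i input_video.mp4 -i synthesized_audio.wav -map 0:v -map 1:a \
#          -c:v copy -c:a aac output_video.mp4
# Note that the synthesized track will usually not match the original video's duration,
# so the end of the result may be silent (or the audio may run past the video).
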
def main():
    input_video_path = "input_video.mp4"
    temp_audio_path = "temp_audio.wav"
    output_video_path = "output_video.mp4"

    print("Extracting audio from the video...")
    (
        ffmpeg
        .input(input_video_path)
        .output(temp_audio_path)
        .run(overwrite_output=True)
    )

    transcribed_text = transcribe_audio(temp_audio_path)

    translated_text = translate_text(transcribed_text)

    synthesized_audio_path = synthesize_speech(translated_text, output_path="synthesized_audio.wav")

    replace_audio_in_video(input_video_path, synthesized_audio_path, output_video_path)


if __name__ == "__main__":
    main()