Commit fde8fc4 by Farit Shamardanov
Parent(s): ec3a891

Add application file

Files changed:
- app.py +246 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,246 @@
import gradio as gr
from transformers import pipeline
from moviepy.editor import VideoFileClip, AudioFileClip
from TTS.api import TTS  # Coqui TTS
import os
import nltk
import torch
from pydub import AudioSegment

nltk.download('punkt')
nltk.download('punkt_tab')

device = 0 if torch.cuda.is_available() else -1  # Use GPU (0) or CPU (-1)
print("Device in use:", "GPU" if device == 0 else "CPU")
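# Note: transformers pipelines take device=-1 for CPU and a CUDA device index
# (0, 1, ...) for GPU, which is what the `device` variable above encodes.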
# Remove profanity from the text
def detect_profanity_with_transformer(text):
    # The pipeline is rebuilt on every call and each word is classified
    # individually; caching the pipeline would make this much faster.
    profanity_detector = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-offensive", device=device)
    words = text.split()
    cleaned_words = []

    for word in words:
        result = profanity_detector(word)
        # Label casing depends on the model's id2label config, so compare case-insensitively
        if any(label["label"].lower() == "offensive" and label["score"] > 0.8 for label in result):
            cleaned_words.append("***")  # Replace profanity with asterisks
        else:
            cleaned_words.append(word)

    return " ".join(cleaned_words)
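# Illustrative example (assumed label name): for an offensive token the
# classifier returns something like [{"label": "offensive", "score": 0.97}],
# in which case the word is replaced with "***".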
# Extract the audio track from a video
def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path)
    return audio_path
# Get the transcription and timestamps
def get_transcription_with_timestamps(audio_path):
    asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
    result = asr(audio_path, return_timestamps=True)
    transcription = result["text"]
    timestamps = result["chunks"]  # Timestamps for each recognized segment
    return transcription, timestamps
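# For reference, `result["chunks"]` looks roughly like
# [{"timestamp": (0.0, 5.4), "text": " first segment"}, ...];
# the final chunk's end timestamp can be None, which
# split_text_by_timestamps() below filters out.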
# Split the text into fragments by timestamps
def split_text_by_timestamps(timestamps):
    text_fragments = []
    for chunk in timestamps:
        # Check that the 'timestamp' and 'text' keys are present
        if "timestamp" in chunk and "text" in chunk:
            start_time, end_time = chunk["timestamp"]

            # Skip fragments with missing timestamps
            if start_time is None or end_time is None:
                continue

            fragment_text = chunk["text"]

            # Keep only non-empty text fragments
            if fragment_text.strip():
                text_fragments.append({
                    "start": start_time,
                    "end": end_time,
                    "text": fragment_text.strip()
                })

    return text_fragments
# Translate the text
def translate_text_with_transformer(text, source_lang="ru", target_lang="en"):
    translator = pipeline("translation", model="facebook/m2m100_418M", device=device)
    translated_result = translator(text, src_lang=source_lang, tgt_lang=target_lang)
    return translated_result[0]["translation_text"]
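# Illustrative example: translate_text_with_transformer("Привет, мир", "ru", "en")
# should return something like "Hello, world".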
# Synthesize audio following the timestamps and keep it in sync with the video
def synthesize_audio_with_timestamps(original_audio_path, text_fragments, output_audio_path):
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
    generated_clips = []

    for fragment in text_fragments:
        temp_audio_path = "temp_fragment.wav"
        tts.tts_to_file(
            text=fragment["text"],
            file_path=temp_audio_path,
            speaker_wav=original_audio_path,
            language="en"
        )
        audio_segment = AudioSegment.from_file(temp_audio_path)

        # Fit the audio fragment into its time slot
        duration = fragment["end"] - fragment["start"]

        # Skip fragments with zero or negative slot duration
        if duration <= 0:
            print(f"Warning: duration is zero or negative for fragment: {fragment['text']}")
            os.remove(temp_audio_path)
            continue

        audio_duration = len(audio_segment) / 1000  # Duration in seconds

        # Skip fragments whose synthesized audio is empty
        if audio_duration <= 0:
            print(f"Warning: audio duration is zero or negative for fragment: {fragment['text']}")
            os.remove(temp_audio_path)
            continue

        # Adjust the audio length to the slot. pydub's speedup() only supports
        # factors > 1, so shorten long audio by speeding it up and fill short
        # audio with trailing silence instead of slowing it down.
        if audio_duration > duration:
            speed_factor = audio_duration / duration
            audio_segment = audio_segment.speedup(playback_speed=speed_factor)
        elif audio_duration < duration:
            padding_ms = int((duration - audio_duration) * 1000)
            audio_segment += AudioSegment.silent(duration=padding_ms)

        # Guard against audio that became empty after the speed adjustment
        if len(audio_segment) == 0:
            print(f"Warning: audio segment became empty after speed adjustment for fragment: {fragment['text']}")
            os.remove(temp_audio_path)
            continue

        generated_clips.append(audio_segment)
        os.remove(temp_audio_path)

    # Concatenate all fragments
    if generated_clips:
        final_audio = sum(generated_clips)
        final_audio.export(output_audio_path, format="wav")
    else:
        print("No valid audio fragments to process.")
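# Note: sum(generated_clips) concatenates the fragments back to back, so any
# silence between fragments in the original timeline is not reproduced.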
# Synthesize audio following the timestamps, without tempo adjustment
def synthesize_audio_with_timestamps_simple(original_audio_path, text_fragments, output_audio_path):
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
    generated_clips = []

    for fragment in text_fragments:
        temp_audio_path = "temp_fragment.wav"
        tts.tts_to_file(
            text=fragment["text"],
            file_path=temp_audio_path,
            speaker_wav=original_audio_path,
            language="en"
        )
        audio_segment = AudioSegment.from_file(temp_audio_path)

        # Fit the audio fragment into its time slot by trimming
        duration = fragment["end"] - fragment["start"]
        audio_segment = audio_segment[:int(duration * 1000)]  # Convert to milliseconds
        generated_clips.append(audio_segment)
        os.remove(temp_audio_path)

    # Concatenate all fragments
    final_audio = sum(generated_clips)
    final_audio.export(output_audio_path, format="wav")
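# Note: this variant truncates fragments that are longer than their slot and
# does not pad shorter ones, so the audio can drift relative to the video.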
# Merge the video with the new audio
def synchronize_video_with_audio(video_path, audio_path, output_path):
    video = VideoFileClip(video_path)
    audio = AudioFileClip(audio_path)
    video = video.set_audio(audio)
    video.write_videofile(output_path, codec="libx264", audio_codec="aac")
# Main pipeline
def translate_video_with_sync(video_path, output_path, source_lang="ru", target_lang="en"):
    # Extract the audio from the video
    audio_path = extract_audio_from_video(video_path)

    # Get the transcription and timestamps
    transcription, timestamps = get_transcription_with_timestamps(audio_path)
    print("Recognized text:", transcription)

    # Remove profanity from the text
    cleaned_transcription = detect_profanity_with_transformer(transcription)
    print("Cleaned text:", cleaned_transcription)

    # Translate the full transcript (for logging; fragments are translated below)
    translated_text = translate_text_with_transformer(cleaned_transcription, source_lang, target_lang)
    print("Translated text:", translated_text)

    # Split the text by timestamps
    text_fragments = split_text_by_timestamps(timestamps)

    # Replace each fragment's text with its cleaned translation
    for fragment in text_fragments:
        cleaned_text = detect_profanity_with_transformer(fragment["text"])
        fragment["text"] = translate_text_with_transformer(cleaned_text, source_lang, target_lang)

    # Generate the synchronized audio
    synthesized_audio_path = "synchronized_audio.wav"
    synthesize_audio_with_timestamps_simple(audio_path, text_fragments, synthesized_audio_path)

    # Merge the video with the new audio
    synchronize_video_with_audio(video_path, synthesized_audio_path, output_path)

    # Remove temporary files
    os.remove(audio_path)
    os.remove(synthesized_audio_path)

    print(f"Translated video saved to {output_path}")
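# Illustrative standalone usage (outside the Gradio app):
# translate_video_with_sync("input.mp4", "translated.mp4", source_lang="ru", target_lang="en")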
# Wrapper around `translate_video_with_sync` so it works with Gradio
def process_video(video_file, source_lang, target_lang):
    # gr.File passes a tempfile-like object in older Gradio versions and a
    # plain file path in newer ones; handle both.
    input_path = video_file.name if hasattr(video_file, "name") else video_file
    output_path = "translated_video.mp4"

    # Run the main pipeline
    translate_video_with_sync(video_path=input_path, output_path=output_path, source_lang=source_lang, target_lang=target_lang)

    # Return the result
    return output_path
# Gradio interface
interface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.File(label="Upload Video", file_types=[".mp4", ".mkv", ".avi"]),  # Video upload
        gr.Textbox(label="Source Language (e.g., 'ru')", value="ru"),  # Source language
        gr.Textbox(label="Target Language (e.g., 'en')", value="en"),  # Target language
    ],
    outputs=gr.File(label="Translated Video"),  # Processed video output
    title="Video Translation with Audio Sync",
    description="Upload a video, specify the source and target languages, and generate a translated video with synchronized audio."
)

# Launch the interface
interface.launch()
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
gtts
sacremoses
TTS
kenlm
pyctcdecode
espeakng
# Packages imported directly by app.py
transformers
moviepy
nltk
torch
pydub