Spaces:

farbot16
/

video_translation

Running

File size: 10,993 Bytes

fde8fc4

from shutil import which
import gradio as gr
from transformers import pipeline
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
from TTS.api import TTS  # Coqui TTS
import librosa
import soundfile as sf
import os
import nltk
import torch
from pydub import AudioSegment

# Download the NLTK 'punkt' and 'punkt_tab' resources when the module is loaded.
nltk.download('punkt')
nltk.download('punkt_tab')

# Device index handed to the transformers pipelines in this file:
# 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1  # Use the GPU (0) or the CPU (-1)
print("Используемый девайс:", "GPU" if device == 0 else "CPU")

# Profanity masking
_PROFANITY_DETECTOR = None  # Lazily created classifier shared by all calls.


def _get_profanity_detector():
    """Return the shared offensive-language classifier, building it on first use."""
    global _PROFANITY_DETECTOR
    if _PROFANITY_DETECTOR is None:
        _PROFANITY_DETECTOR = pipeline(
            "text-classification",
            model="cardiffnlp/twitter-roberta-base-offensive",
            device=device,
        )
    return _PROFANITY_DETECTOR


def detect_profanity_with_transformer(text):
    """Replace every word the classifier flags as offensive with ``***``.

    The text is split on whitespace and each word is classified on its own.
    A word is masked when the classifier reports the offensive label with a
    score above 0.8. The surviving words are re-joined with single spaces,
    so the original whitespace layout is not preserved.

    The classifier is loaded once and reused, because this function is called
    for every transcript fragment. Blank input returns ``""`` without loading
    the model at all.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text with offensive words replaced by ``***``.
    """
    words = text.split()
    if not words:
        return ""

    profanity_detector = _get_profanity_detector()
    cleaned_words = []

    for word in words:
        result = profanity_detector(word)
        # Compare the label case-insensitively: the exact casing of the
        # model's label strings is not visible from this file.
        is_offensive = any(
            str(label["label"]).upper() == "OFFENSIVE" and label["score"] > 0.8
            for label in result
        )
        cleaned_words.append("***" if is_offensive else word)

    return " ".join(cleaned_words)

# Extract the audio track from a video
def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    """Write the audio track of a video to an audio file.

    Args:
        video_path: Path of the source video.
        audio_path: Where the extracted audio is written.

    Returns:
        ``audio_path``, unchanged.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Always release the reader behind the clip, even when writing fails.
        video.close()
    return audio_path

# Transcription with timestamps
def get_transcription_with_timestamps(audio_path):
    """Run speech recognition on an audio file and return text plus chunks.

    A fresh ``openai/whisper-large-v2`` pipeline is built on the module-level
    ``device`` and invoked with ``return_timestamps=True``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(transcription, timestamps)`` tuple: the pipeline's ``"text"``
        value and its ``"chunks"`` value (the per-chunk timestamp entries).
    """
    recognizer = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v2",
        device=device,
    )
    output = recognizer(audio_path, return_timestamps=True)
    return output["text"], output["chunks"]

# Split the transcript into fragments using the timestamps
def split_text_by_timestamps(timestamps):
    """Convert timestamped chunks into ``{"start", "end", "text"}`` fragments.

    A chunk is kept only when it has both a ``"timestamp"`` and a ``"text"``
    key, neither end of the timestamp pair is ``None``, and the text is not
    blank. The stored text has surrounding whitespace removed.

    Args:
        timestamps: Iterable of chunk dicts; ``"timestamp"`` must unpack into
            a ``(start, end)`` pair.

    Returns:
        A list of fragment dicts in the same order as the kept chunks.
    """
    fragments = []
    for entry in timestamps:
        # Skip entries that lack either of the required keys.
        if "timestamp" not in entry or "text" not in entry:
            continue

        begin, finish = entry["timestamp"]
        # Skip fragments with a missing boundary.
        if begin is None or finish is None:
            continue

        stripped = entry["text"].strip()
        # Skip fragments without any visible text.
        if not stripped:
            continue

        fragments.append({"start": begin, "end": finish, "text": stripped})

    return fragments

# Text translation
_TRANSLATOR = None  # Lazily created translation pipeline shared by all calls.


def _get_translator():
    """Return the shared M2M100 translation pipeline, building it on first use."""
    global _TRANSLATOR
    if _TRANSLATOR is None:
        _TRANSLATOR = pipeline("translation", model="facebook/m2m100_418M", device=device)
    return _TRANSLATOR


def translate_text_with_transformer(text, source_lang="ru", target_lang="en"):
    """Translate ``text`` from ``source_lang`` to ``target_lang``.

    The translation pipeline is loaded once and reused, because this function
    is called for every transcript fragment. The language pair is supplied
    per call, so a single pipeline serves any pair. Empty or blank input
    returns ``""`` without loading the model.

    Args:
        text: Text to translate.
        source_lang: Source language code passed to the pipeline as ``src_lang``.
        target_lang: Target language code passed to the pipeline as ``tgt_lang``.

    Returns:
        The ``"translation_text"`` of the first pipeline result.
    """
    if not text or not text.strip():
        return ""

    translator = _get_translator()
    translated_result = translator(text, src_lang=source_lang, tgt_lang=target_lang)
    return translated_result[0]["translation_text"]

# Speech synthesis aligned to the timestamps (with speed adjustment)
# Below this factor the clip is simply trimmed instead of being sped up.
# NOTE(review): pydub's speedup appears unreliable for factors this close to
# 1 (it removes under a millisecond per chunk) - confirm against pydub docs.
_MIN_SPEEDUP_FACTOR = 1.01


def _fit_clip_to_slot(clip, slot_ms):
    """Return ``clip`` shortened so that it lasts at most ``slot_ms`` milliseconds."""
    clip_ms = len(clip)
    if clip_ms <= slot_ms:
        # Short clips are left untouched; the caller pads the timeline instead.
        return clip

    speed = clip_ms / slot_ms
    if speed >= _MIN_SPEEDUP_FACTOR:
        try:
            clip = clip.speedup(playback_speed=speed)
        except Exception as exc:  # pydub raises a bare Exception for clips it cannot chunk
            print(f"Warning: could not speed up fragment ({exc}); trimming instead.")

    # Whatever the speed-up produced, never overrun the slot.
    return clip[:slot_ms]


def synthesize_audio_with_timestamps(original_audio_path, text_fragments, output_audio_path, language="en"):
    """Synthesize speech for each fragment and lay it out on the video timeline.

    Each fragment is voiced with XTTS v2 using ``original_audio_path`` as the
    speaker reference. A clip that is longer than its ``start``..``end`` slot
    is sped up (and trimmed if needed) to fit. A clip that is shorter is kept
    at natural speed. Silence is inserted before a fragment whenever the
    timeline is still behind the fragment's start time, so gaps in the
    original speech and short clips do not make later fragments play early.

    Fragments with a non-positive slot, or whose audio is empty, are skipped
    with a warning. If nothing usable is produced, a message is printed and
    no file is written.

    Args:
        original_audio_path: Audio file used as the speaker reference.
        text_fragments: Dicts with ``"start"`` and ``"end"`` (seconds) and
            ``"text"``.
        output_audio_path: Destination WAV file.
        language: Language code passed to the TTS model. Defaults to ``"en"``.

    Returns:
        None.
    """
    import tempfile

    if not text_fragments:
        # Nothing to voice: skip loading the TTS model altogether.
        print("No valid audio fragments to process.")
        return

    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
    generated_clips = []
    cursor_ms = 0  # Current end of the assembled timeline.

    # A private temporary directory avoids clashes on a shared file name and
    # is removed even when synthesis raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_audio_path = os.path.join(temp_dir, "temp_fragment.wav")

        for fragment in text_fragments:
            start_ms = int(fragment["start"] * 1000)
            slot_ms = int(fragment["end"] * 1000) - start_ms

            if slot_ms <= 0:
                print(f"Warning: duration is zero or negative for fragment: {fragment['text']}")
                continue

            tts.tts_to_file(
                text=fragment["text"],
                file_path=temp_audio_path,
                speaker_wav=original_audio_path,
                language=language
            )
            audio_segment = AudioSegment.from_file(temp_audio_path)

            if len(audio_segment) == 0:
                print(f"Warning: audio duration is zero or negative for fragment: {fragment['text']}")
                continue

            audio_segment = _fit_clip_to_slot(audio_segment, slot_ms)

            if len(audio_segment) == 0:
                print(f"Warning: Audio segment became empty after speed adjustment for fragment: {fragment['text']}")
                continue

            # Pad with silence up to the fragment's start time.
            gap_ms = start_ms - cursor_ms
            if gap_ms > 0:
                generated_clips.append(
                    AudioSegment.silent(duration=gap_ms, frame_rate=audio_segment.frame_rate)
                )
                cursor_ms = start_ms

            generated_clips.append(audio_segment)
            cursor_ms += len(audio_segment)

    # Concatenate all pieces into the final track.
    if generated_clips:
        final_audio = sum(generated_clips)
        final_audio.export(output_audio_path, format="wav")
    else:
        print("No valid audio fragments to process.")

# Speech synthesis aligned to the timestamps, without speed adjustment
def synthesize_audio_with_timestamps_simple(original_audio_path, text_fragments, output_audio_path, language="en"):
    """Synthesize speech for each fragment and lay it out on the video timeline.

    Each fragment is voiced with XTTS v2 using ``original_audio_path`` as the
    speaker reference, then cut to at most the length of its
    ``start``..``end`` slot. Silence is inserted before a fragment whenever
    the timeline is still behind the fragment's start time, so gaps in the
    original speech and short clips do not make later fragments play early.

    Fragments with a non-positive slot, or whose audio is empty, are skipped.

    Args:
        original_audio_path: Audio file used as the speaker reference.
        text_fragments: Dicts with ``"start"`` and ``"end"`` (seconds) and
            ``"text"``.
        output_audio_path: Destination WAV file.
        language: Language code passed to the TTS model. Defaults to ``"en"``.

    Raises:
        ValueError: If there are no fragments or none yields usable audio.
            This fails loudly because nothing would be written to
            ``output_audio_path``.
    """
    import tempfile

    if not text_fragments:
        # Fail before loading the TTS model when there is nothing to voice.
        raise ValueError("No text fragments to synthesize.")

    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
    generated_clips = []
    cursor_ms = 0  # Current end of the assembled timeline.

    # A private temporary directory avoids clashes on a shared file name and
    # is removed even when synthesis raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_audio_path = os.path.join(temp_dir, "temp_fragment.wav")

        for fragment in text_fragments:
            start_ms = int(fragment["start"] * 1000)
            slot_ms = int(fragment["end"] * 1000) - start_ms
            if slot_ms <= 0:
                # A negative length would otherwise slice from the clip's end.
                continue

            tts.tts_to_file(
                text=fragment["text"],
                file_path=temp_audio_path,
                speaker_wav=original_audio_path,
                language=language
            )
            # Cut the clip to the slot length (milliseconds).
            audio_segment = AudioSegment.from_file(temp_audio_path)[:slot_ms]
            if len(audio_segment) == 0:
                continue

            # Pad with silence up to the fragment's start time.
            gap_ms = start_ms - cursor_ms
            if gap_ms > 0:
                generated_clips.append(
                    AudioSegment.silent(duration=gap_ms, frame_rate=audio_segment.frame_rate)
                )
                cursor_ms = start_ms

            generated_clips.append(audio_segment)
            cursor_ms += len(audio_segment)

    if not generated_clips:
        raise ValueError("No valid audio fragments to process.")

    # Concatenate all pieces into the final track.
    final_audio = sum(generated_clips)
    final_audio.export(output_audio_path, format="wav")

# Combine the video with the new audio track
def synchronize_video_with_audio(video_path, audio_path, output_path):
    """Write a copy of the video whose audio track is replaced by ``audio_path``.

    The result is encoded with the ``libx264`` video codec and the ``aac``
    audio codec.

    Args:
        video_path: Path of the source video.
        audio_path: Path of the audio file to use as the new track.
        output_path: Destination video file.
    """
    video = VideoFileClip(video_path)
    try:
        audio = AudioFileClip(audio_path)
        try:
            dubbed = video.set_audio(audio)
            dubbed.write_videofile(output_path, codec="libx264", audio_codec="aac")
        finally:
            # Release the audio reader even when encoding fails.
            audio.close()
    finally:
        # Release the video reader even when encoding fails.
        video.close()

# Main pipeline
def translate_video_with_sync(video_path, output_path, source_lang="ru", target_lang="en"):
    """Produce a dubbed copy of a video with translated, re-synthesized speech.

    Steps: extract the audio, transcribe it with timestamps, split the
    transcript into timed fragments, mask profanity in and translate each
    fragment, synthesize the translated speech, and mux it back onto the
    video. Progress is reported with ``print``.

    The cleaned and translated texts that are logged are assembled from the
    per-fragment results, so the models are not run a second time over the
    whole transcript just for logging.

    The two intermediate audio files are removed whether or not the process
    succeeds.

    NOTE(review): the synthesis step is called without a language argument,
    and the original synthesis function hard-codes ``"en"``, so
    ``target_lang`` only affects the translation step - confirm this is
    intended.

    Args:
        video_path: Path of the source video.
        output_path: Destination of the dubbed video.
        source_lang: Language code of the original speech.
        target_lang: Language code to translate into.

    Raises:
        ValueError: If no fragment with usable timestamps was recognized.
    """
    audio_path = "temp_audio.wav"
    synthesized_audio_path = "synchronized_audio.wav"

    try:
        # Extract the audio track from the video.
        extract_audio_from_video(video_path, audio_path)

        # Transcribe and collect timestamps.
        transcription, timestamps = get_transcription_with_timestamps(audio_path)
        print("Распознанный текст:", transcription)

        # Split the transcript into timed fragments.
        text_fragments = split_text_by_timestamps(timestamps)
        if not text_fragments:
            raise ValueError("No speech with usable timestamps was recognized in the video.")

        # Mask profanity in each fragment, then replace its text with the translation.
        cleaned_parts = []
        for fragment in text_fragments:
            cleaned_text = detect_profanity_with_transformer(fragment["text"])
            cleaned_parts.append(cleaned_text)
            fragment["text"] = translate_text_with_transformer(cleaned_text, source_lang, target_lang)

        print("Очищенный текст:", " ".join(cleaned_parts))
        print("Переведенный текст:", " ".join(fragment["text"] for fragment in text_fragments))

        # Generate the synchronized audio track.
        synthesize_audio_with_timestamps_simple(audio_path, text_fragments, synthesized_audio_path)

        # Combine the video with the new audio track.
        synchronize_video_with_audio(video_path, synthesized_audio_path, output_path)
    finally:
        # Remove the intermediate files, including after a failure part-way through.
        for temp_path in (audio_path, synthesized_audio_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print(f"Переведенное видео сохранено в {output_path}")

# Wrapper that lets `translate_video_with_sync` be used from Gradio
def process_video(video_file, source_lang, target_lang):
    """Translate an uploaded video and return the path of the result.

    Args:
        video_file: The uploaded file, either an object exposing the path as
            ``.name`` or the path itself as a string.
        source_lang: Language code of the original speech. Blank falls back
            to ``"ru"``.
        target_lang: Language code to translate into. Blank falls back to
            ``"en"``.

    Returns:
        Path of the translated video (``"translated_video.mp4"``).

    Raises:
        ValueError: If no file was uploaded.
    """
    if video_file is None:
        raise ValueError("No video file was uploaded.")

    # Accept both a file-like wrapper with `.name` and a plain path string.
    input_path = getattr(video_file, "name", video_file)
    output_path = "translated_video.mp4"

    # Tolerate stray whitespace and empty boxes in the language fields.
    source_lang = (source_lang or "").strip() or "ru"
    target_lang = (target_lang or "").strip() or "en"

    # Run the main pipeline.
    translate_video_with_sync(video_path=input_path, output_path=output_path, source_lang=source_lang, target_lang=target_lang)

    # Hand the result back to Gradio.
    return output_path

# Gradio interface: one file upload and two language text boxes feed
# `process_video`; the resulting video is offered back as a file.
interface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.File(label="Upload Video", file_types=[".mp4", ".mkv", ".avi"]),  # Video upload
        gr.Textbox(label="Source Language (e.g., 'ru')", value="ru"),  # Source language
        gr.Textbox(label="Target Language (e.g., 'en')", value="en"),  # Target language
    ],
    outputs=gr.File(label="Translated Video"),  # Processed video output
    title="Video Translation with Audio Sync",
    description="Upload a video, specify the source and target languages, and generate a translated video with synchronized audio."
)

# Start the interface (runs whenever this module is executed or imported).
interface.launch()