Spaces:

farbot16
/

video_translation

Sleeping

Farit Shamardanov

Add application file

fde8fc4 about 2 months ago

11 kB

	from shutil import which
	import gradio as gr
	from transformers import pipeline
	from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
	from TTS.api import TTS # Coqui TTS
	import librosa
	import soundfile as sf
	import os
	import nltk
	import torch
	from pydub import AudioSegment

	# Download the NLTK 'punkt' and 'punkt_tab' resources at import time.
	nltk.download('punkt')
	nltk.download('punkt_tab')

	# Device index handed to the transformers pipelines: 0 = GPU, -1 = CPU.
	device = -1
	if torch.cuda.is_available():
	    device = 0
	print("Используемый девайс:", "CPU" if device == -1 else "GPU")

	# Profanity masking
	def _get_profanity_detector():
	    """Return the offensive-language classifier, building it on first use.

	    The pipeline object is stored on this function so that repeated calls
	    reuse one loaded model instead of instantiating it every time.
	    """
	    detector = getattr(_get_profanity_detector, "_detector", None)
	    if detector is None:
	        detector = pipeline(
	            "text-classification",
	            model="cardiffnlp/twitter-roberta-base-offensive",
	            device=device,
	        )
	        _get_profanity_detector._detector = detector
	    return detector


	def detect_profanity_with_transformer(text):
	    """Replace every word classified as offensive with ``***``.

	    The text is split on whitespace and each word is classified on its own;
	    a word is masked when the classifier reports an offensive label with a
	    score above 0.8.

	    Args:
	        text: Input text.

	    Returns:
	        The words joined by single spaces, with offensive words replaced by
	        ``***``. Empty or whitespace-only input yields ``""`` without
	        loading the model.
	    """
	    words = text.split()
	    if not words:
	        # Nothing to classify; skip the (expensive) model load entirely.
	        return ""

	    profanity_detector = _get_profanity_detector()
	    cleaned_words = []
	    for word in words:
	        result = profanity_detector(word)
	        # NOTE(review): the exact label strings emitted by this checkpoint
	        # were not verified; comparing case-insensitively keeps the original
	        # "OFFENSIVE" match and also accepts "offensive" - confirm against
	        # the model's config.
	        is_offensive = any(
	            str(label["label"]).upper() == "OFFENSIVE" and label["score"] > 0.8
	            for label in result
	        )
	        cleaned_words.append("***" if is_offensive else word)

	    return " ".join(cleaned_words)

	# Extract the audio track from a video file
	def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
	    """Write the audio track of a video to an audio file.

	    Args:
	        video_path: Path of the video to read.
	        audio_path: Destination path for the extracted audio.

	    Returns:
	        ``audio_path``.

	    Raises:
	        ValueError: If the clip has no audio track.
	    """
	    video = VideoFileClip(video_path)
	    try:
	        if video.audio is None:
	            # Fail with a clear message instead of an AttributeError on None.
	            raise ValueError(f"Video has no audio track: {video_path}")
	        video.audio.write_audiofile(audio_path)
	    finally:
	        # Release the clip's underlying readers even when writing fails.
	        video.close()
	    return audio_path

	# Transcription with timestamps
	def get_transcription_with_timestamps(audio_path):
	    """Transcribe an audio file with the Whisper large-v2 ASR pipeline.

	    Args:
	        audio_path: Path of the audio file to transcribe.

	    Returns:
	        A ``(text, chunks)`` tuple taken from the pipeline result's
	        ``"text"`` and ``"chunks"`` entries; the chunks carry the timestamps
	        of the recognized words or fragments.
	    """
	    recognizer = pipeline(
	        "automatic-speech-recognition",
	        model="openai/whisper-large-v2",
	        device=device,
	    )
	    output = recognizer(audio_path, return_timestamps=True)
	    full_text = output["text"]
	    chunk_list = output["chunks"]
	    return full_text, chunk_list

	# Convert timestamped chunks into text fragments
	def split_text_by_timestamps(timestamps):
	    """Build a list of ``{"start", "end", "text"}`` fragments from chunks.

	    A chunk is kept only when it has both a ``"timestamp"`` and a ``"text"``
	    key, neither timestamp bound is ``None``, and the text is non-empty
	    after stripping whitespace.

	    Args:
	        timestamps: Iterable of chunk dicts whose ``"timestamp"`` value is a
	            ``(start, end)`` pair.

	    Returns:
	        List of dicts with the start time, end time and stripped text.
	    """
	    fragments = []
	    for entry in timestamps:
	        # Skip chunks that lack either of the required keys.
	        if not ("timestamp" in entry and "text" in entry):
	            continue

	        begin, finish = entry["timestamp"]
	        # Skip chunks with a missing time bound.
	        if begin is None or finish is None:
	            continue

	        stripped = entry["text"].strip()
	        # Skip chunks whose text is empty.
	        if not stripped:
	            continue

	        fragments.append({"start": begin, "end": finish, "text": stripped})

	    return fragments

	# Text translation
	def _get_translator():
	    """Return the M2M100 translation pipeline, building it on first use.

	    The pipeline object is stored on this function so that the model is
	    loaded once rather than on every translation call.
	    """
	    translator = getattr(_get_translator, "_translator", None)
	    if translator is None:
	        translator = pipeline(
	            "translation",
	            model="facebook/m2m100_418M",
	            device=device,
	        )
	        _get_translator._translator = translator
	    return translator


	def translate_text_with_transformer(text, source_lang="ru", target_lang="en"):
	    """Translate ``text`` from ``source_lang`` to ``target_lang``.

	    Args:
	        text: Text to translate.
	        source_lang: Source language code passed to the pipeline as
	            ``src_lang``.
	        target_lang: Target language code passed to the pipeline as
	            ``tgt_lang``.

	    Returns:
	        The ``"translation_text"`` of the first pipeline result, or ``""``
	        when ``text`` is empty or whitespace-only (the model is not invoked
	        in that case).
	    """
	    if not text or not text.strip():
	        # Nothing to translate; avoid loading or running the model.
	        return ""

	    translated_result = _get_translator()(
	        text, src_lang=source_lang, tgt_lang=target_lang
	    )
	    return translated_result[0]["translation_text"]

	# Speech synthesis fitted to each fragment's time slot
	def _fit_clip_to_slot(clip, slot_ms, min_speedup=1.05):
	    """Return ``clip`` adjusted to last ``slot_ms`` milliseconds.

	    A clip longer than the slot is sped up (and trimmed to the exact
	    length); a clip shorter than the slot is padded with trailing silence.

	    Args:
	        clip: pydub ``AudioSegment`` to adjust.
	        slot_ms: Target length in milliseconds (must be positive).
	        min_speedup: Smallest length ratio for which a speed-up is
	            attempted; below it the clip is simply trimmed.
	    """
	    if len(clip) > slot_ms:
	        ratio = len(clip) / slot_ms
	        # NOTE(review): pydub's speedup appears unreliable for factors very
	        # close to 1, hence the threshold - confirm against the installed
	        # pydub version.
	        if ratio >= min_speedup:
	            try:
	                clip = clip.speedup(playback_speed=ratio)
	            except Exception as exc:
	                # Broad catch is deliberate: speedup is expected to raise a
	                # plain Exception for clips too short for its chunking
	                # (TODO confirm). Fall back to trimming below.
	                print(f"Warning: could not speed up clip ({exc}); trimming instead.")
	        # Speed-up is approximate; trim any remainder to the exact slot.
	        clip = clip[:slot_ms]

	    shortfall = slot_ms - len(clip)
	    if shortfall > 0:
	        # Pad with silence so the clip fills its slot instead of shrinking.
	        clip = clip + AudioSegment.silent(
	            duration=shortfall, frame_rate=clip.frame_rate
	        )
	    return clip


	def synthesize_audio_with_timestamps(
	    original_audio_path, text_fragments, output_audio_path, language="en"
	):
	    """Synthesize speech for each fragment, fit it to its slot, and export.

	    Each fragment's text is spoken with XTTS v2 using ``original_audio_path``
	    as the speaker reference, adjusted to the fragment's ``end - start``
	    duration, and all clips are concatenated into one WAV file.

	    Args:
	        original_audio_path: Audio file used as the speaker reference.
	        text_fragments: List of dicts with ``"start"``, ``"end"`` (seconds)
	            and ``"text"`` keys.
	        output_audio_path: Destination path of the combined WAV file.
	        language: Language code passed to the TTS model.

	    Returns:
	        None. When there is nothing valid to synthesize a message is printed
	        and no file is written.
	    """
	    if not text_fragments:
	        # Skip loading the TTS model when there is no work to do.
	        print("No valid audio fragments to process.")
	        return

	    tts = TTS(
	        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
	        gpu=torch.cuda.is_available(),
	    )
	    generated_clips = []
	    temp_audio_path = "temp_fragment.wav"

	    for fragment in text_fragments:
	        # Length of the fragment's time slot, in milliseconds.
	        slot_ms = int(round((fragment["end"] - fragment["start"]) * 1000))
	        if slot_ms <= 0:
	            print(f"Warning: duration is zero or negative for fragment: {fragment['text']}")
	            continue

	        try:
	            tts.tts_to_file(
	                text=fragment["text"],
	                file_path=temp_audio_path,
	                speaker_wav=original_audio_path,
	                language=language,
	            )
	            audio_segment = AudioSegment.from_file(temp_audio_path)
	        finally:
	            # Always remove the temporary file, even if synthesis failed.
	            if os.path.exists(temp_audio_path):
	                os.remove(temp_audio_path)

	        if len(audio_segment) == 0:
	            print(f"Warning: audio duration is zero or negative for fragment: {fragment['text']}")
	            continue

	        generated_clips.append(_fit_clip_to_slot(audio_segment, slot_ms))

	    # NOTE(review): clips are joined back to back; any gap between one
	    # fragment's end and the next fragment's start is not reproduced.
	    if generated_clips:
	        final_audio = sum(generated_clips)
	        final_audio.export(output_audio_path, format="wav")
	    else:
	        print("No valid audio fragments to process.")

	# Speech synthesis with plain truncation (no speed change)
	def synthesize_audio_with_timestamps_simple(
	    original_audio_path, text_fragments, output_audio_path, language="en"
	):
	    """Synthesize speech per fragment, truncate to its slot, and export.

	    Each fragment's text is spoken with XTTS v2 using ``original_audio_path``
	    as the speaker reference. A clip longer than the fragment's
	    ``end - start`` duration is cut off at that length; shorter clips are
	    kept as they are. All clips are concatenated into one WAV file.

	    Args:
	        original_audio_path: Audio file used as the speaker reference.
	        text_fragments: List of dicts with ``"start"``, ``"end"`` (seconds)
	            and ``"text"`` keys.
	        output_audio_path: Destination path of the combined WAV file.
	        language: Language code passed to the TTS model.

	    Raises:
	        ValueError: If there are no fragments, or none of them has a
	            positive duration.
	    """
	    if not text_fragments:
	        # Fail early and clearly, before loading the TTS model.
	        raise ValueError("No text fragments to synthesize.")

	    tts = TTS(
	        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
	        gpu=torch.cuda.is_available(),
	    )
	    generated_clips = []
	    temp_audio_path = "temp_fragment.wav"

	    for fragment in text_fragments:
	        # Slot length in milliseconds.
	        slot_ms = int((fragment["end"] - fragment["start"]) * 1000)
	        if slot_ms <= 0:
	            # A negative slice bound would cut from the end of the clip.
	            print(f"Warning: duration is zero or negative for fragment: {fragment['text']}")
	            continue

	        try:
	            tts.tts_to_file(
	                text=fragment["text"],
	                file_path=temp_audio_path,
	                speaker_wav=original_audio_path,
	                language=language,
	            )
	            audio_segment = AudioSegment.from_file(temp_audio_path)
	        finally:
	            # Always remove the temporary file, even if synthesis failed.
	            if os.path.exists(temp_audio_path):
	                os.remove(temp_audio_path)

	        generated_clips.append(audio_segment[:slot_ms])

	    if not generated_clips:
	        raise ValueError("No valid audio fragments to synthesize.")

	    # Concatenate all clips and write the result.
	    final_audio = sum(generated_clips)
	    final_audio.export(output_audio_path, format="wav")

	# Replace a video's audio track
	def synchronize_video_with_audio(video_path, audio_path, output_path):
	    """Write a copy of the video whose audio track is the given audio file.

	    Args:
	        video_path: Path of the source video.
	        audio_path: Path of the audio file to attach.
	        output_path: Destination path; encoded with libx264 video and AAC
	            audio.
	    """
	    video = VideoFileClip(video_path)
	    try:
	        audio = AudioFileClip(audio_path)
	        try:
	            dubbed = video.set_audio(audio)
	            dubbed.write_videofile(output_path, codec="libx264", audio_codec="aac")
	        finally:
	            # Release the audio reader even when encoding fails.
	            audio.close()
	    finally:
	        # Release the video reader even when encoding fails.
	        video.close()

	# Main pipeline
	def translate_video_with_sync(video_path, output_path, source_lang="ru", target_lang="en"):
	    """Produce a copy of a video with its speech translated and re-voiced.

	    Steps: extract the audio, transcribe it with timestamps, mask profanity,
	    translate each timestamped fragment, synthesize the translated speech,
	    and attach it to the video. Intermediate results are printed.

	    Args:
	        video_path: Path of the input video.
	        output_path: Path of the translated video to write.
	        source_lang: Language code of the original speech.
	        target_lang: Language code to translate into.
	    """
	    synthesized_audio_path = "synchronized_audio.wav"
	    audio_path = None
	    try:
	        # Extract the audio track from the video.
	        audio_path = extract_audio_from_video(video_path)

	        # Transcribe and collect timestamps.
	        transcription, timestamps = get_transcription_with_timestamps(audio_path)
	        print("Распознанный текст:", transcription)

	        # Mask profanity in the full transcript.
	        cleaned_transcription = detect_profanity_with_transformer(transcription)
	        print("Очищенный текст:", cleaned_transcription)

	        # Translate the full transcript (used for the log output only).
	        translated_text = translate_text_with_transformer(cleaned_transcription, source_lang, target_lang)
	        print("Переведенный текст:", translated_text)

	        # Split into timestamped fragments.
	        text_fragments = split_text_by_timestamps(timestamps)

	        # Replace each fragment's text with its cleaned translation.
	        for fragment in text_fragments:
	            cleaned_text = detect_profanity_with_transformer(fragment["text"])
	            fragment["text"] = translate_text_with_transformer(cleaned_text, source_lang, target_lang)

	        # Synthesize the translated audio.
	        synthesize_audio_with_timestamps_simple(audio_path, text_fragments, synthesized_audio_path)

	        # Attach the new audio to the video.
	        synchronize_video_with_audio(video_path, synthesized_audio_path, output_path)
	    finally:
	        # Remove temporary files even when a step above fails.
	        for temp_path in (audio_path, synthesized_audio_path):
	            if temp_path and os.path.exists(temp_path):
	                os.remove(temp_path)

	    print(f"Переведенное видео сохранено в {output_path}")

	# Gradio wrapper around `translate_video_with_sync`
	def process_video(video_file, source_lang, target_lang):
	    """Translate an uploaded video and return the path of the result.

	    Args:
	        video_file: The uploaded file, either a path string or an object
	            exposing the path as ``.name``.
	        source_lang: Language code of the original speech.
	        target_lang: Language code to translate into.

	    Returns:
	        Path of the translated video (``"translated_video.mp4"``).

	    Raises:
	        ValueError: If no file was uploaded.
	    """
	    if video_file is None:
	        raise ValueError("No video file was uploaded.")

	    # The upload may arrive as a plain path or as a file-like wrapper.
	    input_path = video_file if isinstance(video_file, str) else video_file.name
	    output_path = "translated_video.mp4"

	    # Run the main pipeline.
	    translate_video_with_sync(
	        video_path=input_path,
	        output_path=output_path,
	        source_lang=source_lang,
	        target_lang=target_lang,
	    )

	    return output_path

	# Gradio interface
	_interface_inputs = [
	    # Video upload
	    gr.File(label="Upload Video", file_types=[".mp4", ".mkv", ".avi"]),
	    # Source language
	    gr.Textbox(label="Source Language (e.g., 'ru')", value="ru"),
	    # Target language
	    gr.Textbox(label="Target Language (e.g., 'en')", value="en"),
	]
	interface = gr.Interface(
	    fn=process_video,
	    inputs=_interface_inputs,
	    # Processed video offered as a file
	    outputs=gr.File(label="Translated Video"),
	    title="Video Translation with Audio Sync",
	    description="Upload a video, specify the source and target languages, and generate a translated video with synchronized audio.",
	)

	# Start the interface
	interface.launch()