# Space: Artificial-superintelligence/Aita (status: Running; file size: 5,681 bytes)

import streamlit as st
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
import whisper
from translate import Translator
from gtts import gTTS
import tempfile
import os
import numpy as np

# Initialize Whisper model
# NOTE(review): this runs on every execution of the script; if the installed
# Streamlit version offers resource caching, consider wrapping the load in it.
try:
    whisper_model = whisper.load_model("base")
except Exception as e:
    st.error(f"Error loading Whisper model: {e}")
    # Without a model nothing below can work and `whisper_model` would be
    # undefined; halt this run here instead of failing later with NameError.
    st.stop()

# Language options: display name -> language code.
# The names populate the target-language picker; the code is what gets handed
# to the translator and to the text-to-speech engine.
# Add more languages as needed.
LANGUAGES = dict(
    English="en",
    Tamil="ta",
    Sinhala="si",
    French="fr",
)

# Page heading
st.title("AI Video Translator with Whisper and GTTS")

# Step 1: ask for the source video (only the listed extensions are accepted)
video_file = st.file_uploader(
    "Upload a video file",
    type=["mp4", "mov", "avi", "mkv"],
)

if video_file:
    # Step 2: Select translation language
    target_language = st.selectbox("Select the target language for translation", list(LANGUAGES.keys()))

    # Process when user clicks translate
    if st.button("Translate Video"):
        # Save the upload to disk: the video/audio tooling below works on paths.
        # Keep the upload's own extension (mov/avi/mkv are accepted as well)
        # and fall back to .mp4 when the name carries none.
        upload_suffix = os.path.splitext(getattr(video_file, "name", "") or "")[1] or '.mp4'
        with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as temp_video:
            # getvalue() returns the full buffer regardless of the read position
            temp_video.write(video_file.getvalue())
            temp_video_path = temp_video.name

        # Extract audio from video
        # mkstemp creates the file securely; mktemp is deprecated and race-prone.
        audio_fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(audio_fd)

        extraction_error = None
        video = None
        try:
            video = VideoFileClip(temp_video_path)
            if video.audio is None:
                raise ValueError("the video has no audio track")
            video.audio.write_audiofile(audio_path)
        except Exception as e:
            extraction_error = e
        finally:
            # Release the clip before anyone tries to delete or reopen the file.
            if video is not None:
                video.close()

        if extraction_error is not None:
            st.error(f"Error extracting audio from video: {extraction_error}")
            for leftover in (temp_video_path, audio_path):
                if os.path.exists(leftover):
                    os.remove(leftover)
            st.stop()

        # Function to transcribe audio in chunks
        def transcribe_audio_in_chunks(audio_path, model, chunk_length=30):
            """Transcribe an audio file piece by piece and return the joined text.

            Args:
                audio_path: Path of the audio file; loaded with ``whisper.load_audio``.
                model: Object whose ``transcribe(samples)`` returns a mapping with a
                    ``'text'`` entry (the loaded Whisper model in this script).
                chunk_length: Length of each piece in seconds; must be positive.

            Returns:
                The non-empty chunk transcriptions, stripped and joined by single
                spaces. An empty string when nothing was transcribed.

            Raises:
                ValueError: If ``chunk_length`` is not positive.
            """
            if chunk_length <= 0:
                raise ValueError("chunk_length must be a positive number of seconds")

            samples = whisper.load_audio(audio_path)
            # Step in whole samples rather than float seconds so chunk boundaries
            # are exact and the last sample is never lost to rounding.
            samples_per_chunk = max(1, int(chunk_length * whisper.audio.SAMPLE_RATE))

            texts = []
            for offset in range(0, len(samples), samples_per_chunk):
                result = model.transcribe(samples[offset:offset + samples_per_chunk])
                # Strip each piece and drop empty ones so the joined transcript
                # has no doubled or stray spaces.
                text = result['text'].strip()
                if text:
                    texts.append(text)

            return ' '.join(texts)

        # Function to translate text in chunks
        def translate_in_chunks(text, translator, max_length=500):
            """Translate `text` in word-aligned chunks and join the results.

            Words (split on whitespace) are packed greedily into chunks of at most
            `max_length` characters. A single word longer than `max_length` is
            kept whole in a chunk of its own rather than being split.

            Args:
                text: The text to translate.
                translator: Object exposing ``translate(chunk) -> str``.
                max_length: Upper bound on chunk size in characters (presumably
                    sized for the translation provider's request limit - TODO confirm).

            Returns:
                The translated chunks joined by single spaces; an empty string when
                `text` contains no words (the translator is not called then).
            """
            chunks = []
            current_chunk = ""

            for word in text.split():
                candidate = f"{current_chunk} {word}" if current_chunk else word
                if current_chunk and len(candidate) > max_length:
                    # Only flush a non-empty chunk: an over-long first word must
                    # not emit an empty chunk to the translator.
                    chunks.append(current_chunk)
                    current_chunk = word
                else:
                    current_chunk = candidate

            if current_chunk:
                chunks.append(current_chunk)

            translated_chunks = [translator.translate(chunk) for chunk in chunks]
            return ' '.join(translated_chunks)

        # Transcribe -> translate -> synthesize speech -> put the new audio on the video.
        # Every temp file and media clip is registered as soon as it is created so
        # the `finally` clause can release it wherever a failure happens.
        def _split_for_tts(text, max_length):
            """Pack the words of `text` into pieces of at most `max_length` characters.

            A single word longer than `max_length` becomes a piece of its own.
            """
            pieces = []
            current = ""
            for word in text.split():
                candidate = f"{current} {word}" if current else word
                if current and len(candidate) > max_length:
                    pieces.append(current)
                    current = word
                else:
                    current = candidate
            if current:
                pieces.append(current)
            return pieces

        def _make_temp_path(suffix):
            """Create an empty temp file, register it for cleanup, return its path."""
            # mkstemp creates the file securely; mktemp is deprecated and race-prone.
            handle, path = tempfile.mkstemp(suffix=suffix)
            os.close(handle)
            temp_paths.append(path)
            return path

        temp_paths = [temp_video_path, audio_path]
        clips_to_close = []
        tts_clips = []

        try:
            # Transcribe audio using Whisper
            original_text = transcribe_audio_in_chunks(audio_path, whisper_model)
            st.write("Original Transcription:", original_text)

            # Translate text to the target language
            # NOTE(review): no source language is passed to Translator; confirm the
            # library default matches the language of the transcription.
            language_code = LANGUAGES[target_language]
            translator = Translator(to_lang=language_code)
            translated_text = translate_in_chunks(original_text, translator)
            st.write(f"Translated Text ({target_language}):", translated_text)

            # Convert translated text to speech in chunks
            max_length = 200  # Adjust as needed
            tts_chunks = _split_for_tts(translated_text, max_length)
            if not tts_chunks:
                # Nothing to speak: fail with a clear message instead of letting
                # the audio concatenation choke on an empty clip list.
                raise ValueError("no speech was found to translate")

            for chunk in tts_chunks:
                tts_audio_path = _make_temp_path(".mp3")
                gTTS(text=chunk, lang=language_code).save(tts_audio_path)
                tts_clip = AudioFileClip(tts_audio_path)
                clips_to_close.append(tts_clip)
                tts_clips.append(tts_clip)

            # Concatenate all TTS audio chunks
            final_audio = concatenate_audioclips(tts_clips)
            translated_audio_path = _make_temp_path(".mp3")
            final_audio.write_audiofile(translated_audio_path)

            # Merge translated audio with the original video
            # NOTE(review): the synthesized audio is not fitted to the video's
            # duration; confirm the behavior when the two lengths differ.
            final_video_path = _make_temp_path(".mp4")
            original_video = VideoFileClip(temp_video_path)
            clips_to_close.append(original_video)
            translated_audio = AudioFileClip(translated_audio_path)
            clips_to_close.append(translated_audio)
            final_video = original_video.set_audio(translated_audio)
            final_video.write_videofile(final_video_path, codec='libx264', audio_codec='aac')

            # Read the result once; the temp file is deleted below, so the player
            # and the download button are both given the bytes.
            with open(final_video_path, "rb") as f:
                video_bytes = f.read()

            # Display success message and provide download link
            st.success("Translation successful! Download your translated video below:")
            st.video(video_bytes)
            st.download_button("Download Translated Video", video_bytes, file_name="translated_video.mp4")

        except Exception as e:
            st.error(f"Error during transcription/translation: {e}")

        finally:
            # Clean up: close clips first so no reader still holds a file open
            # when it is deleted.
            for clip in clips_to_close:
                try:
                    clip.close()
                except Exception as close_error:
                    st.warning(f"Could not release a media clip: {close_error}")
            for path in temp_paths:
                if os.path.exists(path):
                    os.remove(path)