import gradio as gr
import moviepy.editor as mp
import librosa
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor

# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")

# M2M100 for multilingual translation
translator = pipeline("translation", model="facebook/m2m100_418M")

# Supported languages with their codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru"
}


def transcribe_audio(chunk):
    """Transcribe a single audio chunk."""
    # Pass the raw waveform together with its sampling rate so the ASR
    # pipeline does not have to guess the rate of the array.
    return asr({"raw": chunk, "sampling_rate": 16000})["text"]


def generate_subtitles(video_file, language_name):
    try:
        # Extract the target language code from the selected language name
        target_language = languages[language_name]

        # Gradio may pass either a file path string or a file-like object
        if isinstance(video_file, str):
            video_path = video_file        # It's a file path
        else:
            video_path = video_file.name   # It's a file object

        print(f"Processing video from path: {video_path}")

        # Extract audio from the video using moviepy
        video = mp.VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path, codec="pcm_s16le")
        video.close()

        print("Starting speech-to-text transcription")

        # Load the audio file as a waveform; Whisper expects 16 kHz audio
        waveform, sr = librosa.load(audio_path, sr=16000)

        # Split the waveform into fixed-length chunks
        chunk_duration = 15               # seconds
        chunk_size = sr * chunk_duration  # number of samples per chunk
        chunks = [
            waveform[i:i + chunk_size]
            for i in range(0, len(waveform), chunk_size)
            if len(waveform[i:i + chunk_size]) > 0
        ]

        # Transcribe the chunks in parallel
        with ThreadPoolExecutor() as executor:
            transcriptions = list(executor.map(transcribe_audio, chunks))

        # Combine all transcriptions into a single string
        full_transcription = " ".join(transcriptions)

        print("Starting translation")

        # Translate the transcription into the target language using M2M100
        translated_subtitles = translator(
            full_transcription,
            src_lang="en",              # Source language is English
            tgt_lang=target_language    # Target language from user selection
        )[0]["translation_text"]

        # Return subtitles
        return f"Original: {full_transcription}\nTranslated: {translated_subtitles}"

    except Exception as e:
        # Catch and log the error
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"


# Wrapper used by the Gradio interface
def subtitle_video(video_file, language_name):
    try:
        # Handle both file-like objects and file paths
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"


# Gradio app layout
interface = gr.Interface(
    fn=subtitle_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(                         # Dropdown for language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # Display language names in the dropdown
            value="Persian (fa)"             # Default language
        )
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator"
)

interface.launch()