import gradio as gr
import moviepy.editor as mp
import librosa
import numpy as np
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
import tempfile
# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
# M2M100 for multilingual translation
translator = pipeline("translation", model="facebook/m2m100_418M")
# Supported languages with their codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru"
}
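
# Optional sanity check (a sketch added here, not part of the original app):
# the M2M100 tokenizer exposes every language code it supports, so a typo in
# the `languages` table fails fast at startup instead of at translation time.
unsupported = [code for code in languages.values() if code not in translator.tokenizer.lang_code_to_id]
assert not unsupported, f"Language codes not supported by M2M100: {unsupported}"
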
def transcribe_audio(chunk):
    """Transcribe a single audio chunk (raw waveform samples)."""
    # Pass the sampling rate explicitly so the pipeline does not have to guess it
    return asr({"raw": chunk, "sampling_rate": 16000})["text"]

def generate_subtitles(video_file, language_name):
    try:
        # Map the selected language name to its M2M100 language code
        target_language = languages[language_name]

        # Gradio may pass a file path string or a file-like object
        if isinstance(video_file, str):
            video_path = video_file
        else:
            video_path = video_file.name
        print(f"Processing video from path: {video_path}")

        # Load the video and grab its audio track
        video = mp.VideoFileClip(video_path)
        audio = video.audio

        # Write the audio to a temporary WAV file; the .wav suffix lets moviepy
        # infer the container format for the pcm_s16le codec
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_audio_file:
            audio.write_audiofile(tmp_audio_file.name, codec="pcm_s16le")
            print("Starting speech-to-text transcription")
            # Load the audio as a mono waveform resampled to 16 kHz for Whisper
            waveform, sr = librosa.load(tmp_audio_file.name, sr=16000)
        video.close()

        # Split the waveform into fixed-length chunks
        chunk_duration = 15  # seconds
        chunk_size = sr * chunk_duration  # samples per chunk
        chunks = [waveform[i:i + chunk_size] for i in range(0, len(waveform), chunk_size)]

        # Transcribe the chunks in parallel; executor.map preserves chunk order
        with ThreadPoolExecutor() as executor:
            transcriptions = list(executor.map(transcribe_audio, chunks))

        # Combine the chunk transcriptions into a single string
        full_transcription = " ".join(transcriptions)
        print("Starting translation")

        # Translate with M2M100; the source speech is assumed to be English, and
        # very long transcripts may be truncated at the model's input length limit
        translated_subtitles = translator(
            full_transcription,
            src_lang="en",
            tgt_lang=target_language
        )[0]["translation_text"]

        return f"Original: {full_transcription}\nTranslated: {translated_subtitles}"
    except Exception as e:
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"
# Gradio callback: thin wrapper so any error surfaces in the text output
def subtitle_video(video_file, language_name):
    try:
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"

# Gradio app layout
interface = gr.Interface(
    fn=subtitle_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(  # Dropdown for target-language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # Show the human-readable names
            value="Persian (fa)"  # Default language
        )
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator"
)
interface.launch()
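
# Usage note (assumptions, not from the original file): on Hugging Face Spaces
# the script runs as-is; for a local test, install the dependencies first, e.g.
#   pip install gradio "moviepy<2" librosa transformers torch
# (moviepy is pinned below 2.0 because `moviepy.editor` was removed in 2.x),
# then run the script with Python. interface.launch(share=True) would also
# expose a temporary public URL when running outside Spaces.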