import gradio as gr
import moviepy.editor as mp
import librosa
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor

# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")

# M2M100 for multilingual translation
translator = pipeline("translation", model="facebook/m2m100_418M")

# Supported languages with their codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru"
}


def transcribe_audio(chunk):
    """Transcribe a single audio chunk."""
    # Pass the raw waveform together with its sampling rate so the ASR
    # pipeline does not have to guess the rate of the array.
    return asr({"raw": chunk, "sampling_rate": 16000})["text"]


def generate_subtitles(video_file, language_name):
    try:
        # Extract the target language code from the selected language name
        target_language = languages[language_name]

        # Gradio may pass either a file path string or a file-like object
        if isinstance(video_file, str):
            video_path = video_file        # It's a file path
        else:
            video_path = video_file.name   # It's a file object

        print(f"Processing video from path: {video_path}")

        # Extract audio from the video using moviepy
        video = mp.VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path, codec="pcm_s16le")
        video.close()

        print("Starting speech-to-text transcription")

        # Load the audio file as a waveform; Whisper expects 16 kHz audio
        waveform, sr = librosa.load(audio_path, sr=16000)

        # Split the waveform into fixed-length chunks
        chunk_duration = 15               # seconds
        chunk_size = sr * chunk_duration  # number of samples per chunk
        chunks = [
            waveform[i:i + chunk_size]
            for i in range(0, len(waveform), chunk_size)
            if len(waveform[i:i + chunk_size]) > 0
        ]

        # Transcribe the chunks in parallel
        with ThreadPoolExecutor() as executor:
            transcriptions = list(executor.map(transcribe_audio, chunks))

        # Combine all transcriptions into a single string
        full_transcription = " ".join(transcriptions)

        print("Starting translation")

        # Translate the transcription into the target language using M2M100
        translated_subtitles = translator(
            full_transcription,
            src_lang="en",              # Source language is English
            tgt_lang=target_language    # Target language from user selection
        )[0]["translation_text"]

        # Return subtitles
        return f"Original: {full_transcription}\nTranslated: {translated_subtitles}"

    except Exception as e:
        # Catch and log the error
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"


# Wrapper used by the Gradio interface
def subtitle_video(video_file, language_name):
    try:
        # Handle both file-like objects and file paths
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"


# Gradio app layout
interface = gr.Interface(
    fn=subtitle_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(                         # Dropdown for language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # Display language names in the dropdown
            value="Persian (fa)"             # Default language
        )
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator"
)

interface.launch()