import gradio as gr
import moviepy.editor as mp
import librosa
import numpy as np
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
import tempfile
# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
# M2M100 for multilingual translation
translator = pipeline("translation", model="facebook/m2m100_418M")
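# Note: whisper-large is a multi-GB checkpoint; on CPU-only or memory-constrained
# hardware a smaller checkpoint such as "openai/whisper-small" may be a more
# practical choice (a suggestion, not part of the original app).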
# Supported languages with their codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru",
}
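# Note: these are ISO 639-1 style codes, which is the format M2M100 expects
# for its src_lang/tgt_lang arguments.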
def transcribe_audio(chunk):
    """Transcribe a single audio chunk."""
    # Pass the sampling rate explicitly so the pipeline does not have to
    # assume one for the raw numpy array
    return asr({"raw": chunk, "sampling_rate": 16000})["text"]
def generate_subtitles(video_file, language_name):
    try:
        # Extract the target language code from the selected language name
        target_language = languages[language_name]
        # Gradio may pass a file path string or a file-like object
        if isinstance(video_file, str):
            video_path = video_file  # It's a file path
        else:
            video_path = video_file.name  # It's a file object
        print(f"Processing video from path: {video_path}")
        # Load the video and extract its audio track
        video = mp.VideoFileClip(video_path)
        audio = video.audio
        # Use a temporary WAV file to hold the audio data; the ".wav" suffix is
        # needed so write_audiofile can infer the output container
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_audio_file:
            audio.write_audiofile(tmp_audio_file.name, codec='pcm_s16le')
            print("Starting speech-to-text transcription")
            # Load the audio as a 16 kHz waveform, the rate Whisper expects
            waveform, sr = librosa.load(tmp_audio_file.name, sr=16000)
        video.close()
        # Split the waveform into fixed-length chunks
        chunk_duration = 15  # seconds
        chunk_size = sr * chunk_duration  # number of samples per chunk
        chunks = [waveform[i:i + chunk_size]
                  for i in range(0, len(waveform), chunk_size)
                  if len(waveform[i:i + chunk_size]) > 0]
        # Transcribe the chunks in parallel
        with ThreadPoolExecutor() as executor:
            transcriptions = list(executor.map(transcribe_audio, chunks))
        # Combine all transcriptions into a single string
        full_transcription = " ".join(transcriptions)
print("Starting translation") | |
# Translate transcription to the target language using M2M100 | |
translated_subtitles = translator( | |
full_transcription, | |
src_lang="en", # Source language is English | |
tgt_lang=target_language # Target language from user selection | |
)[0]["translation_text"] | |
# Return subtitles | |
subtitles = f"Original: {full_transcription}\nTranslated: {translated_subtitles}" | |
return subtitles | |
    except Exception as e:
        # Catch and log the error
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"
# Thin wrapper used as the Gradio callback
def subtitle_video(video_file, language_name):
    try:
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"
# Gradio app layout
interface = gr.Interface(
    fn=subtitle_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(  # Dropdown for language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # Display language names
            value="Persian (fa)"  # Default language
        )
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator"
)

interface.launch()
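# For long videos, transcription can take several minutes, so enabling Gradio's
# request queue, e.g. interface.queue().launch(), may be worthwhile
# (a suggestion, not part of the original app).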