import gradio as gr
import moviepy.editor as mp
import librosa
import numpy as np
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
import tempfile
# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
# M2M100 for multilingual translation
translator = pipeline("translation", model="facebook/m2m100_418M")
# Supported languages with their codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru"
}
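
# Optional sanity check (a sketch added here, not part of the original app):
# the M2M100 tokenizer exposes every language code it supports, so a typo in
# the `languages` table fails fast at startup instead of at translation time.
unsupported = [code for code in languages.values() if code not in translator.tokenizer.lang_code_to_id]
assert not unsupported, f"Language codes not supported by M2M100: {unsupported}"
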
def transcribe_audio(chunk):
    """Transcribe a single audio chunk (raw waveform samples)."""
    # Pass the sampling rate explicitly so the pipeline does not have to guess it
    return asr({"raw": chunk, "sampling_rate": 16000})["text"]

def generate_subtitles(video_file, language_name):
    try:
        # Map the selected language name to its M2M100 language code
        target_language = languages[language_name]

        # Gradio may pass a file path string or a file-like object
        if isinstance(video_file, str):
            video_path = video_file
        else:
            video_path = video_file.name
        print(f"Processing video from path: {video_path}")

        # Load the video and grab its audio track
        video = mp.VideoFileClip(video_path)
        audio = video.audio

        # Write the audio to a temporary WAV file; the .wav suffix lets moviepy
        # infer the container format for the pcm_s16le codec
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_audio_file:
            audio.write_audiofile(tmp_audio_file.name, codec="pcm_s16le")
            print("Starting speech-to-text transcription")
            # Load the audio as a mono waveform resampled to 16 kHz for Whisper
            waveform, sr = librosa.load(tmp_audio_file.name, sr=16000)
        video.close()

        # Split the waveform into fixed-length chunks
        chunk_duration = 15  # seconds
        chunk_size = sr * chunk_duration  # samples per chunk
        chunks = [waveform[i:i + chunk_size] for i in range(0, len(waveform), chunk_size)]

        # Transcribe the chunks in parallel; executor.map preserves chunk order
        with ThreadPoolExecutor() as executor:
            transcriptions = list(executor.map(transcribe_audio, chunks))

        # Combine the chunk transcriptions into a single string
        full_transcription = " ".join(transcriptions)
        print("Starting translation")

        # Translate with M2M100; the source speech is assumed to be English, and
        # very long transcripts may be truncated at the model's input length limit
        translated_subtitles = translator(
            full_transcription,
            src_lang="en",
            tgt_lang=target_language
        )[0]["translation_text"]

        return f"Original: {full_transcription}\nTranslated: {translated_subtitles}"
    except Exception as e:
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"
# Gradio callback: thin wrapper so any error surfaces in the text output
def subtitle_video(video_file, language_name):
    try:
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"

# Gradio app layout
interface = gr.Interface(
    fn=subtitle_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(  # Dropdown for target-language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # Show the human-readable names
            value="Persian (fa)"  # Default language
        )
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator"
)
interface.launch()
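
# Usage note (assumptions, not from the original file): on Hugging Face Spaces
# the script runs as-is; for a local test, install the dependencies first, e.g.
#   pip install gradio "moviepy<2" librosa transformers torch
# (moviepy is pinned below 2.0 because `moviepy.editor` was removed in 2.x),
# then run the script with Python. interface.launch(share=True) would also
# expose a temporary public URL when running outside Spaces.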