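"""Automatic video subtitler & translator (Gradio app).

Pipeline: extract the audio track from an uploaded video, transcribe it
with Whisper, translate the transcript with M2M100, and display both.
"""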
import gradio as gr
import moviepy.editor as mp
import librosa
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
import tempfile
# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
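# Note: whisper-large is a multi-GB download; a smaller checkpoint such as
# "openai/whisper-small" is a drop-in alternative on constrained hardware.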
# M2M100 for multilingual translation (language pair is set per call below)
translator = pipeline("translation", model="facebook/m2m100_418M")
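# M2M100 takes ISO 639-1 language codes (e.g. "fa", "zh"), which is what the
# `languages` dict below maps the display names onto.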
# Supported languages with their codes
languages = {
"Persian (fa)": "fa",
"French (fr)": "fr",
"Spanish (es)": "es",
"German (de)": "de",
"Chinese (zh)": "zh",
"Arabic (ar)": "ar",
"Hindi (hi)": "hi",
"Russian (ru)": "ru"
}
def transcribe_audio(chunk):
    """Transcribe a single 16 kHz mono waveform chunk."""
    # The ASR pipeline accepts a raw array plus its sampling rate; librosa
    # resamples everything to 16 kHz before the chunks are built.
    return asr({"raw": chunk, "sampling_rate": 16000})["text"]
def generate_subtitles(video_file, language_name):
try:
# Extract the target language code from the selected language name
target_language = languages[language_name]
# Check if video_file is a file object or a file path string
if isinstance(video_file, str):
video_path = video_file # It's a file path
else:
video_path = video_file.name # It's a file object
print(f"Processing video from path: {video_path}")
        # Load the video and extract its audio track
        video = mp.VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            return "Error: the uploaded video has no audio track."
        # Write the audio to a temporary 16-bit PCM WAV file
        with tempfile.NamedTemporaryFile(delete=True, suffix='.wav') as tmp_audio_file:
            audio.write_audiofile(tmp_audio_file.name, codec='pcm_s16le')
print("Starting speech-to-text transcription")
# Load the audio file as a waveform using librosa
waveform, sr = librosa.load(tmp_audio_file.name, sr=16000) # sr=16000 for Whisper
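            # librosa.load returns mono float32 audio and resamples to the
            # requested rate, matching Whisper's expected 16 kHz input.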
            # Split the waveform into fixed-length chunks for transcription
            # (hard 15 s cuts can split words at boundaries; an overlap or
            # silence-based split would be more robust)
            chunk_duration = 15  # seconds
            chunk_size = sr * chunk_duration  # samples per chunk
            chunks = [waveform[i:i + chunk_size] for i in range(0, len(waveform), chunk_size)]
            # Transcribe chunks in parallel; executor.map preserves chunk
            # order. All workers share one pipeline and CPU inference is
            # largely GIL-bound, so the speedup here is modest.
            with ThreadPoolExecutor() as executor:
                transcriptions = list(executor.map(transcribe_audio, chunks))
# Combine all transcriptions into a single string
full_transcription = " ".join(transcriptions)
print("Starting translation")
# Translate transcription to the target language using M2M100
translated_subtitles = translator(
full_transcription,
src_lang="en", # Source language is English
tgt_lang=target_language # Target language from user selection
)[0]["translation_text"]
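            # Caveat: M2M100 has a fixed maximum input length, so very long
            # transcriptions may be cut off; translating sentence by sentence
            # and re-joining the results is safer.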
# Return subtitles
subtitles = f"Original: {full_transcription}\nTranslated: {translated_subtitles}"
return subtitles
except Exception as e:
# Catch and log the error
print(f"Error occurred: {e}")
return f"Error occurred: {e}"
# Gradio callback: thin wrapper so any error surfaces in the text output
def subtitle_video(video_file, language_name):
    try:
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"
# Gradio app layout
interface = gr.Interface(
fn=subtitle_video,
inputs=[
gr.Video(label="Upload Video"),
gr.Dropdown( # Dropdown for language selection
label="Choose Target Language",
choices=list(languages.keys()), # Display language names in the dropdown
value="Persian (fa)" # Default language
)
],
outputs="text",
title="Automatic Video Subtitler & Translator"
)
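# launch() serves the app locally (Gradio's default is http://127.0.0.1:7860);
# on Hugging Face Spaces this same entry point is picked up automatically.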
interface.launch()