# DAI_Project / app.py
import gradio as gr
import moviepy.editor as mp
import librosa
import numpy as np
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
import tempfile
# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
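# Note: whisper-large is a heavy checkpoint; a smaller one such as
# "openai/whisper-small" could be swapped in for faster (e.g. CPU-only) inference.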
# M2M100 for multilingual translation (English transcription to the selected target language)
translator = pipeline("translation", model="facebook/m2m100_418M")
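# M2M100 covers ~100 languages; the translation pipeline takes ISO 639-1 codes
# via src_lang / tgt_lang (see the call in generate_subtitles below).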
# Supported languages with their codes
languages = {
"Persian (fa)": "fa",
"French (fr)": "fr",
"Spanish (es)": "es",
"German (de)": "de",
"Chinese (zh)": "zh",
"Arabic (ar)": "ar",
"Hindi (hi)": "hi",
"Russian (ru)": "ru"
}
def transcribe_audio(chunk):
"""Transcribe a single audio chunk."""
return asr(chunk)["text"]
def generate_subtitles(video_file, language_name):
try:
# Extract the target language code from the selected language name
target_language = languages[language_name]
# Check if video_file is a file object or a file path string
if isinstance(video_file, str):
video_path = video_file # It's a file path
else:
video_path = video_file.name # It's a file object
print(f"Processing video from path: {video_path}")
# Load the video and extract audio directly
video = mp.VideoFileClip(video_path)
audio = video.audio
# Use a temporary file to hold the audio data
        # Use a temporary WAV file to hold the extracted audio (the .wav suffix lets ffmpeg pick the container)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_audio_file:
            audio.write_audiofile(tmp_audio_file.name, codec='pcm_s16le')
print("Starting speech-to-text transcription")
# Load the audio file as a waveform using librosa
waveform, sr = librosa.load(tmp_audio_file.name, sr=16000) # sr=16000 for Whisper
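            # librosa.load also downmixes to mono float32, which matches the raw
            # input format the Whisper pipeline expects.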
# Process audio in chunks
chunk_duration = 15 # seconds
chunk_size = sr * chunk_duration # number of samples per chunk
chunks = [waveform[i:i + chunk_size] for i in range(0, len(waveform), chunk_size) if len(waveform[i:i + chunk_size]) > 0]
# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor() as executor:
transcriptions = list(executor.map(transcribe_audio, chunks))
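            # Caveat: all worker threads share the single `asr` pipeline instance, so on
            # one GPU (or a GIL-bound CPU) the speedup from threading may be limited.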
# Combine all transcriptions into a single string
full_transcription = " ".join(transcriptions)
print("Starting translation")
# Translate transcription to the target language using M2M100
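            # Note: src_lang="en" assumes the speech (and hence the transcription) is
            # English; audio in another language would need a different source code here.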
translated_subtitles = translator(
full_transcription,
src_lang="en", # Source language is English
tgt_lang=target_language # Target language from user selection
)[0]["translation_text"]
# Return subtitles
subtitles = f"Original: {full_transcription}\nTranslated: {translated_subtitles}"
return subtitles
except Exception as e:
# Catch and log the error
print(f"Error occurred: {e}")
return f"Error occurred: {e}"
# Define Gradio interface
def subtitle_video(video_file, language_name):
try:
        # Delegate to generate_subtitles, which handles both file paths and file objects
return generate_subtitles(video_file, language_name)
except Exception as e:
print(f"Error in processing video: {e}")
return f"Error in processing video: {e}"
# Gradio app layout
interface = gr.Interface(
fn=subtitle_video,
inputs=[
gr.Video(label="Upload Video"),
gr.Dropdown( # Dropdown for language selection
label="Choose Target Language",
choices=list(languages.keys()), # Display language names in the dropdown
value="Persian (fa)" # Default language
)
],
outputs="text",
title="Automatic Video Subtitler & Translator"
)
interface.launch()