import os

import gradio as gr
import moviepy.editor as mp
import librosa
import numpy as np
from transformers import pipeline
# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
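# Note: recent transformers releases can also chunk long inputs internally via
# pipeline(..., chunk_length_s=30), which would make the manual chunking loop
# below unnecessary; the manual loop is kept here for portability across versions.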
# M2M100 for multilingual translation (handles all target languages below)
translator = pipeline("translation", model="facebook/m2m100_418M")
# Supported languages with their codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru",
}
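# These two-letter codes are the ISO 639-1 identifiers that the M2M100
# tokenizer expects (e.g. tokenizer.get_lang_id("fa")).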
def generate_subtitles(video_file, language_name):
    try:
        # Extract the target language code from the selected language name
        target_language = languages[language_name]

        # Gradio may hand us a file path string or a file-like object
        if isinstance(video_file, str):
            video_path = video_file  # it's a file path
        else:
            video_path = video_file.name  # it's a file object
        print(f"Processing video from path: {video_path}")
        # Extract audio from the video using moviepy
        video = mp.VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        audio = video.audio
        audio.write_audiofile(audio_path, codec='pcm_s16le')
        video.close()  # release the file handle once the audio is written
        print("Starting speech-to-text transcription")

        # Load the audio as a 16 kHz mono waveform -- the rate Whisper expects
        waveform, sr = librosa.load(audio_path, sr=16000)
        # Process the audio in 30-second chunks to keep each ASR call bounded
        chunk_duration = 30  # seconds
        chunk_size = sr * chunk_duration  # number of samples per chunk
        transcriptions = []
        for i in range(0, len(waveform), chunk_size):
            chunk = waveform[i:i + chunk_size]
            if len(chunk) == 0:
                break  # avoid processing empty chunks
            # Pass the raw samples together with their sampling rate so the
            # pipeline does not have to guess the audio format
            transcription = asr({"raw": chunk, "sampling_rate": sr})["text"]
            transcriptions.append(transcription)

        # Combine all chunk transcriptions into a single string
        full_transcription = " ".join(transcriptions)
print("Starting translation")
# Translate transcription to the target language using M2M100
translation_pipeline = pipeline('translation', model='facebook/m2m100_418M')
translated_subtitles = translation_pipeline(
full_transcription,
forced_bos_token_id=translation_pipeline.tokenizer.get_lang_id(target_language)
)[0]["translation_text"]
        # Clean up the temporary audio file before returning
        os.remove(audio_path)

        # Return subtitles
        subtitles = f"Original: {full_transcription}\nTranslated: {translated_subtitles}"
        return subtitles
    except Exception as e:
        # Catch and log the error, and surface it in the UI
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"
# Gradio callback: thin wrapper so interface-level errors are logged separately
def subtitle_video(video_file, language_name):
    try:
        return generate_subtitles(video_file, language_name)
    except Exception as e:
        print(f"Error in processing video: {e}")
        return f"Error in processing video: {e}"
# Gradio app layout
interface = gr.Interface(
    fn=subtitle_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(  # Dropdown for language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # display language names
            value="Persian (fa)",  # default language
        ),
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator",
)
interface.launch()
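# When running locally, launch(share=True) would expose a temporary public URL;
# on Hugging Face Spaces the bare launch() above is all that is needed.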