import os

import gradio as gr
import moviepy.editor as mp
import librosa
from transformers import pipeline

# Load Whisper model for speech-to-text
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large")
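# Note: whisper-large is a multi-gigabyte checkpoint; a smaller one such as
# "openai/whisper-small" trades some accuracy for much faster loading.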

# M2M100 for multilingual translation (a single model covering 100 languages)
translator = pipeline("translation", model="facebook/m2m100_418M")

# Target languages offered in the UI, mapped to their M2M100 language codes
languages = {
    "Persian (fa)": "fa",
    "French (fr)": "fr",
    "Spanish (es)": "es",
    "German (de)": "de",
    "Chinese (zh)": "zh",
    "Arabic (ar)": "ar",
    "Hindi (hi)": "hi",
    "Russian (ru)": "ru"
}
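# M2M100 covers 100 languages, so this dict can be extended with any other
# supported code, e.g. "Japanese (ja)": "ja".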

def generate_subtitles(video_file, language_name):
    try:
        # Extract the target language code from the selected language name
        target_language = languages[language_name]
        
        # Check if video_file is a file object or a file path string
        if isinstance(video_file, str):
            video_path = video_file  # It's a file path
        else:
            video_path = video_file.name  # It's a file object
        
        print(f"Processing video from path: {video_path}")

        # Extract the audio track with moviepy (which shells out to ffmpeg,
        # so ffmpeg must be installed) and save it as 16-bit PCM WAV
        video = mp.VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
        video.close()

        print("Starting speech-to-text transcription")

        # Load the audio as a 16 kHz waveform, the sample rate Whisper expects
        waveform, sr = librosa.load(audio_path, sr=16000)
        os.remove(audio_path)  # clean up the temporary WAV file

        # Pass the waveform (NumPy array) directly to the Whisper ASR pipeline;
        # chunk_length_s lets it transcribe audio longer than Whisper's 30 s window
        transcription = asr(waveform, chunk_length_s=30)["text"]

        print("Starting translation")

        # Translate with the M2M100 pipeline loaded at startup (re-creating it
        # per call would reload the model needlessly). The forced BOS token
        # selects the target language; the tokenizer assumes an English source
        # by default, so set translator.tokenizer.src_lang for other sources.
        translated_subtitles = translator(
            transcription,
            forced_bos_token_id=translator.tokenizer.get_lang_id(target_language)
        )[0]["translation_text"]

        # Return subtitles
        subtitles = f"Original: {transcription}\nTranslated: {translated_subtitles}"
        return subtitles
    
    except Exception as e:
        # Catch and log the error
        print(f"Error occurred: {e}")
        return f"Error occurred: {e}"

# Gradio app layout: generate_subtitles already catches and returns its own
# errors, so it can be wired to the interface directly
interface = gr.Interface(
    fn=generate_subtitles,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(  # Dropdown for language selection
            label="Choose Target Language",
            choices=list(languages.keys()),  # Display language names in the dropdown
            value="Persian (fa)"  # Default language
        )
    ],
    outputs="text",
    title="Automatic Video Subtitler & Translator"
)
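
# Launching starts a local Gradio server (default http://127.0.0.1:7860);
# pass share=True to launch() for a temporary public link.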

if __name__ == "__main__":
    interface.launch()