Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import subprocess
|
3 |
import os
|
@@ -5,56 +15,139 @@ import shutil
|
|
5 |
import uuid
|
6 |
from transformers import pipeline
|
7 |
from gtts import gTTS
|
|
|
8 |
|
|
|
|
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
try:
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
if not os.path.exists(audio_path):
|
16 |
-
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
23 |
)
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
# 4.
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
translated_audio_path = os.path.join(temp_dir, "translated_audio.mp3")
|
35 |
tts.save(translated_audio_path)
|
36 |
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
|
|
|
|
|
|
|
|
39 |
except Exception as e:
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
# Create the Gradio interface
|
44 |
iface = gr.Interface(
|
45 |
fn=translate_video,
|
46 |
-
inputs=gr.Video(label="Upload
|
47 |
outputs=[
|
48 |
-
gr.
|
49 |
-
gr.
|
|
|
|
|
50 |
gr.Video(label="Original Video"),
|
51 |
],
|
52 |
-
title="
|
53 |
-
description="
|
54 |
allow_flagging="never",
|
|
|
|
|
|
|
|
|
|
|
55 |
)
|
56 |
|
57 |
if __name__ == "__main__":
|
58 |
-
if not os.path.exists("downloads"):
|
59 |
-
os.makedirs("downloads")
|
60 |
iface.launch()
|
|
|
1 |
+
#
|
2 |
+
# ----- Prerequisites -----
|
3 |
+
# 1. Install required Python libraries:
|
4 |
+
# pip install gradio transformers torch gtts langdetect
|
5 |
+
#
|
6 |
+
# 2. Install ffmpeg on your system.
|
7 |
+
# - (Mac) brew install ffmpeg
|
8 |
+
# - (Ubuntu) sudo apt install ffmpeg
|
9 |
+
# - (Windows) choco install ffmpeg
|
10 |
+
#
|
11 |
import os
import shutil
import subprocess
import tempfile
import uuid

import gradio as gr
from gtts import gTTS
from langdetect import DetectorFactory, detect
from transformers import pipeline
|
19 |
|
20 |
+
# langdetect is non-deterministic by default; pinning the seed makes repeated
# detections of the same text return the same language code.
DetectorFactory.seed = 0
|
22 |
|
23 |
+
# --- 1. Load the ASR model once at import time ---
# Keeping the pipeline at module scope avoids re-initialising the model on
# every request to the Gradio handler.
print("Loading Whisper model, this may take a moment...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",  # tiny for speed; swap in base/small/... for accuracy
        device="cpu",  # use "cuda:0" with a CUDA-enabled torch build
    )
except Exception as e:
    print(f"Error loading model: {e}")
    # Sentinel checked by translate_video so the app degrades gracefully.
    asr_pipeline = None
else:
    print("Whisper model loaded successfully.")
|
37 |
+
|
38 |
+
def translate_video(video_path):
    """Transcribe a video's audio, detect its language, and translate it to English.

    Args:
        video_path: Path to the uploaded video file (supplied by gr.Video).

    Returns:
        A 5-tuple matching the Gradio outputs:
        (summary_markdown, original_transcript, translated_text,
         translated_audio_path, original_video_path).
        On error the first element carries the message and the rest are None.
    """
    if not asr_pipeline:
        gr.Warning("The speech recognition model is not available. The application cannot proceed.")
        return "Model not loaded.", None, None, None, None

    # Unique working directory so concurrent requests never collide.
    temp_dir = f"temp_{uuid.uuid4()}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        gr.Info("Step 1/5: Extracting audio from video...")
        audio_path = os.path.join(temp_dir, "audio.wav")

        # -y overwrite, -vn drop video; pcm_s16le mono at 16 kHz is the
        # sample format Whisper expects.
        command = [
            "ffmpeg", "-i", video_path, "-y",
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path,
        ]
        subprocess.run(command, check=True, capture_output=True, text=True)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed. ffmpeg did not produce an audio file.")

        # --- 2. Transcribe the original audio to text ---
        gr.Info("Step 2/5: Transcribing original audio...")
        transcription_result = asr_pipeline(
            audio_path,
            return_timestamps=False,  # full-transcript text only
            generate_kwargs={"task": "transcribe"},
        )
        original_transcript = transcription_result["text"].strip()

        if not original_transcript:
            gr.Warning("No speech was detected in the video.")
            return "No speech detected.", "N/A", "N/A", None, video_path

        # --- 3. Detect the language of the original transcript ---
        gr.Info("Step 3/5: Detecting language...")
        try:
            detected_language_code = detect(original_transcript)
        except Exception:
            detected_language_code = "Unknown"

        # --- 4. Translate the audio into English ---
        # BUG FIX: in Whisper generate_kwargs, "language" names the *source*
        # language, so forcing "en" told the model the audio was already
        # English and defeated translation. The translate task always targets
        # English; let the model detect the source itself.
        gr.Info("Step 4/5: Translating audio to English...")
        translation_result = asr_pipeline(
            audio_path,
            return_timestamps=False,
            generate_kwargs={"task": "translate"},
        )
        translated_text = translation_result["text"].strip()

        # --- 5. Convert translated text to speech ---
        gr.Info("Step 5/5: Generating translated audio...")
        if translated_text:
            # BUG FIX: the finally block removes temp_dir before Gradio can
            # serve a file stored inside it, so write the output audio to a
            # persistent temp file outside the per-request directory.
            fd, translated_audio_path = tempfile.mkstemp(prefix="translated_", suffix=".mp3")
            os.close(fd)
            gTTS(translated_text, lang='en').save(translated_audio_path)
        else:
            # gTTS raises on empty input; return no audio instead of crashing.
            translated_audio_path = None

        # Detailed summary rendered by the Markdown output component.
        summary_markdown = f"""
## Translation Details
- **Detected Language**: `{detected_language_code}`

---

### Translated Text (English)
{translated_text}
"""

        return summary_markdown, original_transcript, translated_text, translated_audio_path, video_path

    except subprocess.CalledProcessError as e:
        error_message = f"ffmpeg error: {e.stderr}"
        gr.Warning(error_message)
        return error_message, None, None, None, None
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        gr.Warning(error_message)
        return error_message, None, None, None, None
    finally:
        # Clean up the per-request working directory (safe now that the
        # returned audio file lives outside it).
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
129 |
+
|
130 |
|
131 |
+
# --- Build the Gradio UI ---
# Output components, in the same order as translate_video's return tuple.
_output_components = [
    gr.Markdown(label="Summary"),
    gr.Textbox(label="Original Transcript", interactive=False, lines=5),
    gr.Textbox(label="Translated Text (English)", interactive=False, lines=5),
    gr.Audio(label="Translated Audio (English)"),
    gr.Video(label="Original Video"),
]

iface = gr.Interface(
    fn=translate_video,
    inputs=gr.Video(label="Upload Your Video", sources=["upload"]),
    outputs=_output_components,
    title="Enhanced Video Translator",
    description="Upload a video to transcribe its audio, detect the language, and translate it to English. Provides original transcript, translated text, and translated audio.",
    allow_flagging="never",
    # Drop video files into an 'examples' folder next to this script and list
    # their paths here, e.g.
    # [os.path.join(os.path.dirname(__file__), "examples/example_video_1.mp4")],
    examples=[],
)
|
151 |
|
152 |
if __name__ == "__main__":
    # Start the Gradio server (blocks until interrupted).
    iface.launch()
|