Spaces:
Runtime error
Runtime error
Delete app.py
Browse files
app.py
DELETED
@@ -1,226 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import uuid
|
3 |
-
import asyncio
|
4 |
-
import subprocess
|
5 |
-
import json
|
6 |
-
from zipfile import ZipFile
|
7 |
-
import stat
|
8 |
-
import gradio as gr
|
9 |
-
import ffmpeg
|
10 |
-
import cv2
|
11 |
-
import edge_tts
|
12 |
-
from googletrans import Translator
|
13 |
-
from huggingface_hub import HfApi
|
14 |
-
import moviepy.editor as mp
|
15 |
-
import spaces
|
16 |
-
|
17 |
-
# Constants and initialization
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token; None when the env var is unset
REPO_ID = "artificialguybr/video-dubbing"
MAX_VIDEO_DURATION = 180  # seconds; enforced in process_video before dubbing

api = HfApi(token=HF_TOKEN)

# Extract and set permissions for ffmpeg
# The Space ships a bundled ffmpeg binary inside ffmpeg.zip; unzip it into the
# working directory and add the execute bit (zip archives do not preserve it).
ZipFile("ffmpeg.zip").extractall()
st = os.stat('ffmpeg')
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
|
28 |
-
|
29 |
-
# Maps the UI language label to a pair of
# (googletrans destination-language code, edge-tts voice id).
language_mapping = {
    'English': ('en', 'en-US-EricNeural'),
    'Spanish': ('es', 'es-ES-AlvaroNeural'),
    'French': ('fr', 'fr-FR-HenriNeural'),
    'German': ('de', 'de-DE-ConradNeural'),
    'Italian': ('it', 'it-IT-DiegoNeural'),
    'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
    'Polish': ('pl', 'pl-PL-MarekNeural'),
    'Turkish': ('tr', 'tr-TR-AhmetNeural'),
    'Russian': ('ru', 'ru-RU-DmitryNeural'),
    'Dutch': ('nl', 'nl-NL-MaartenNeural'),
    'Czech': ('cs', 'cs-CZ-AntoninNeural'),
    'Arabic': ('ar', 'ar-SA-HamedNeural'),
    'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
    'Japanese': ('ja', 'ja-JP-KeitaNeural'),
    'Korean': ('ko', 'ko-KR-InJoonNeural'),
    'Hindi': ('hi', 'hi-IN-MadhurNeural'),
    'Swedish': ('sv', 'sv-SE-MattiasNeural'),
    'Danish': ('da', 'da-DK-JeppeNeural'),
    'Finnish': ('fi', 'fi-FI-HarriNeural'),
    'Greek': ('el', 'el-GR-NestorasNeural')
}

print("Starting the program...")
|
53 |
-
|
54 |
-
def generate_unique_filename(extension):
    """Return a collision-resistant filename: a random UUID4 stem plus *extension*."""
    unique_stem = uuid.uuid4()
    return f"{unique_stem}{extension}"
|
56 |
-
|
57 |
-
def cleanup_files(*files):
    """Delete every file in *files* that exists; None/empty and missing paths are skipped."""
    for path in files:
        if not path:
            continue
        if os.path.exists(path):
            os.remove(path)
            print(f"Removed file: {path}")
|
62 |
-
|
63 |
-
@spaces.GPU(duration=90)
def transcribe_audio(file_path):
    """Transcribe *file_path* with the insanely-fast-whisper CLI (whisper-large-v3).

    Video inputs (.mp4/.avi/.mov/.flv) first have their audio track extracted
    to a temporary WAV file. Temporary files are always removed, even when the
    transcription subprocess or JSON parsing fails.

    Returns the transcribed text as a single string.
    Raises on audio-extraction, subprocess, or JSON-decoding failure.
    """
    print(f"Starting transcription of file: {file_path}")
    temp_audio = None

    if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
        print("Video file detected. Extracting audio...")
        try:
            video = mp.VideoFileClip(file_path)
            try:
                temp_audio = generate_unique_filename(".wav")
                video.audio.write_audiofile(temp_audio)
            finally:
                # Fix: release moviepy's underlying ffmpeg reader processes;
                # the clip was previously never closed and leaked per call.
                video.close()
            file_path = temp_audio
        except Exception as e:
            print(f"Error extracting audio from video: {e}")
            raise

    output_file = generate_unique_filename(".json")
    command = [
        "insanely-fast-whisper",
        "--file-name", file_path,
        "--device-id", "0",
        "--model-name", "openai/whisper-large-v3",
        "--task", "transcribe",
        "--timestamp", "chunk",
        "--transcript-path", output_file
    ]

    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Transcription output: {result.stdout}")

        with open(output_file, "r") as f:
            transcription = json.load(f)
    except subprocess.CalledProcessError as e:
        print(f"Error running insanely-fast-whisper: {e}")
        raise
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        raise
    finally:
        # Fix: previously the transcript JSON and temp WAV were left on disk
        # whenever any step above raised; clean up on every exit path.
        cleanup_files(output_file, temp_audio)

    # Prefer the top-level "text" field; otherwise stitch the chunk texts together.
    return transcription.get("text", " ".join(chunk["text"] for chunk in transcription.get("chunks", [])))
|
109 |
-
|
110 |
-
async def text_to_speech(text, voice, output_file):
    """Synthesize *text* with the given edge-tts *voice* and write the audio to *output_file*."""
    tts_session = edge_tts.Communicate(text, voice)
    await tts_session.save(output_file)
|
113 |
-
|
114 |
-
@spaces.GPU
def process_video(video, target_language, use_wav2lip):
    """Dub *video* into *target_language*, optionally lip-syncing with Wav2Lip.

    Pipeline: resize to 720p -> extract + band-pass audio -> transcribe ->
    translate -> synthesize speech -> mux (or Wav2Lip) the new audio back in.

    Returns (output_video_path, "") on success, or (None, error_message) on
    any failure (all exceptions are caught and reported via the second value).
    """
    try:
        if target_language is None:
            raise ValueError("Please select a Target Language for Dubbing.")

        run_uuid = uuid.uuid4().hex[:6]
        output_filename = f"{run_uuid}_resized_video.mp4"
        # Downscale to 720p height; -2 keeps the width even, as H.264 requires.
        ffmpeg.input(video).output(output_filename, vf='scale=-2:720').run()

        video_path = output_filename

        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Error: {video_path} does not exist.")

        video_info = ffmpeg.probe(video_path)
        # Fix: read the container-level duration. streams[0] is not guaranteed
        # to be the video stream, nor to carry a 'duration' field at all.
        video_duration = float(video_info['format']['duration'])

        if video_duration > MAX_VIDEO_DURATION:
            cleanup_files(video_path)
            raise ValueError(f"Video duration exceeds {MAX_VIDEO_DURATION} seconds. Please upload a shorter video.")

        # Extract the audio track as 24-bit/48 kHz PCM.
        ffmpeg.input(video_path).output(f"{run_uuid}_output_audio.wav", acodec='pcm_s24le', ar=48000, map='a').run()

        # Band-pass filter to tame rumble and hiss before transcription.
        # Fix: list-form argv instead of shell=True string interpolation.
        subprocess.run([
            "ffmpeg", "-y", "-i", f"{run_uuid}_output_audio.wav",
            "-af", "lowpass=3000,highpass=100",
            f"{run_uuid}_output_audio_final.wav",
        ], check=True)

        whisper_text = transcribe_audio(f"{run_uuid}_output_audio_final.wav")
        print(f"Transcription successful: {whisper_text}")

        target_language_code, voice = language_mapping[target_language]
        translator = Translator()
        translated_text = translator.translate(whisper_text, dest=target_language_code).text
        print(f"Translated text: {translated_text}")

        asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))

        # Shared fallback: copy the video stream, replace the audio with the synth track.
        mux_command = [
            "ffmpeg", "-i", video_path, "-i", f"{run_uuid}_output_synth.wav",
            "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
            "-map", "0:v:0", "-map", "1:a:0", f"{run_uuid}_output_video.mp4",
        ]

        if use_wav2lip:
            try:
                subprocess.run([
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", video_path,
                    "--audio", f"{run_uuid}_output_synth.wav",
                    "--pads", "0", "15", "0", "0",
                    "--resize_factor", "1",
                    "--nosmooth",
                    "--outfile", f"{run_uuid}_output_video.mp4",
                ], check=True)
            except subprocess.CalledProcessError as e:
                print(f"Wav2Lip error: {str(e)}")
                gr.Warning("Wav2lip encountered an error. Falling back to simple audio replacement.")
                subprocess.run(mux_command, check=True)
        else:
            subprocess.run(mux_command, check=True)

        output_video_path = f"{run_uuid}_output_video.mp4"
        if not os.path.exists(output_video_path):
            raise FileNotFoundError(f"Error: {output_video_path} was not generated.")

        cleanup_files(
            f"{run_uuid}_resized_video.mp4",
            f"{run_uuid}_output_audio.wav",
            f"{run_uuid}_output_audio_final.wav",
            f"{run_uuid}_output_synth.wav"
        )

        return output_video_path, ""

    except Exception as e:
        print(f"Error in process_video: {str(e)}")
        return None, f"Error: {str(e)}"
|
176 |
-
|
177 |
-
# Gradio interface setup
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# AI Video Dubbing")
    gr.Markdown("This tool uses AI to dub videos into different languages. Upload a video, choose a target language, and get a dubbed version!")

    with gr.Row():
        with gr.Column(scale=2):
            video_input = gr.Video(label="Upload Video")
            target_language = gr.Dropdown(
                choices=list(language_mapping.keys()),
                label="Target Language for Dubbing",
                value="Spanish"
            )
            use_wav2lip = gr.Checkbox(
                label="Use Wav2Lip for lip sync",
                value=False,
                info="Enable this if the video has close-up faces. May not work for all videos."
            )
            submit_button = gr.Button("Process Video", variant="primary")

        with gr.Column(scale=2):
            output_video = gr.Video(label="Processed Video")
            error_message = gr.Textbox(label="Status/Error Message")

    submit_button.click(
        process_video,
        inputs=[video_input, target_language, use_wav2lip],
        outputs=[output_video, error_message]
    )

    # Fix: the notes previously claimed a 1-minute limit, but the enforced
    # limit is MAX_VIDEO_DURATION = 180 seconds. Keep this text in sync.
    gr.Markdown("""
    ## Notes:
    - Video limit is 3 minutes. The tool will dub all speakers using a single voice.
    - Processing may take up to 5 minutes.
    - This is an alpha version using open-source models.
    - Quality vs. speed trade-off was made for scalability and hardware limitations.
    - For videos longer than 3 minutes, please duplicate this Space and adjust the limit in the code.
    """)

    gr.Markdown("""
    ---
    Developed by [@artificialguybr](https://twitter.com/artificialguybr) using open-source tools.
    Special thanks to Hugging Face for GPU support and [@yeswondwer](https://twitter.com/@yeswondwerr) for the original code.

    Try our [Video Transcription and Translation](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) tool!
    """)

print("Launching Gradio interface...")
demo.queue()
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|