cubuvl commited on
Commit
d375c2b
·
verified ·
1 Parent(s): 58fa719

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +385 -0
app.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from PIL import Image
4
+ from moviepy.editor import VideoFileClip, AudioFileClip
5
+
6
+ import os
7
+ from openai import OpenAI
8
+ import subprocess
9
+ from pathlib import Path
10
+ import uuid
11
+ import tempfile
12
+ import shlex
13
+ import shutil
14
+
15
# Registry of supported chat-completion backends, keyed by model id.
# Each entry carries the OpenAI-compatible endpoint plus the name of the
# environment variable holding its API key. Insertion order matters: the
# first entry is used to bootstrap the shared client below.
MODELS = {
    model_id: {"base_url": endpoint, "env_key": key_var}
    for model_id, endpoint, key_var in [
        (
            "deepseek-ai/DeepSeek-V3",
            "https://api.deepseek.com/v1",
            "DEEPSEEK_API_KEY",
        ),
        (
            "Qwen/Qwen2.5-Coder-32B-Instruct",
            "https://api-inference.huggingface.co/v1/",
            "HF_TOKEN",
        ),
    ]
}
26
+
27
# Bootstrap the shared OpenAI-compatible client against the first configured
# backend; get_completion() repoints base_url/api_key per request.
# NOTE(review): os.environ[...] raises KeyError at import time if the key is
# unset — presumably intentional fail-fast; confirm for local runs.
_first_backend = next(iter(MODELS.values()))
client = OpenAI(
    base_url=_first_backend["base_url"],
    api_key=os.environ[_first_backend["env_key"]],
)
32
+
33
# File extensions the uploader accepts, grouped by media class.
# Concatenation order (images, audio, video) reproduces the original flat list.
_IMAGE_EXTS = [".png", ".jpg", ".webp", ".jpeg", ".tiff", ".bmp", ".gif", ".svg"]
_AUDIO_EXTS = [".mp3", ".wav", ".ogg"]
_VIDEO_EXTS = [
    ".mp4",
    ".avi",
    ".mov",
    ".mkv",
    ".flv",
    ".wmv",
    ".webm",
    ".mpg",
    ".mpeg",
    ".m4v",
    ".3gp",
    ".3g2",
    ".3gpp",
]
allowed_medias = _IMAGE_EXTS + _AUDIO_EXTS + _VIDEO_EXTS
59
+
60
+
61
def get_files_infos(files):
    """Collect metadata for each uploaded media file.

    Args:
        files: iterable of uploaded file objects exposing a ``.name`` path
            (Gradio file objects).

    Returns:
        list[dict]: one dict per file with keys ``size``, ``name`` (spaces
        replaced by underscores), ``type`` and, depending on the media kind,
        ``duration``, ``dimensions`` and ``audio_channels``.
    """
    results = []
    for file in files:
        file_path = Path(file.name)
        info = {}
        info["size"] = os.path.getsize(file_path)
        # Sanitize filename by replacing spaces with underscores so the
        # generated ffmpeg command does not need shell quoting.
        info["name"] = file_path.name.replace(" ", "_")
        # BUGFIX: compare extensions case-insensitively (e.g. ".MP4").
        file_extension = file_path.suffix.lower()

        if file_extension in (".mp4", ".avi", ".mkv", ".mov"):
            info["type"] = "video"
            video = VideoFileClip(file.name)
            info["duration"] = video.duration
            info["dimensions"] = "{}x{}".format(video.size[0], video.size[1])
            if video.audio:
                info["type"] = "video/audio"
                info["audio_channels"] = video.audio.nchannels
            video.close()
        elif file_extension in (".mp3", ".wav"):
            info["type"] = "audio"
            audio = AudioFileClip(file.name)
            info["duration"] = audio.duration
            info["audio_channels"] = audio.nchannels
            audio.close()
        elif file_extension in (
            ".png",
            ".jpg",
            ".jpeg",
            ".webp",  # BUGFIX: UI accepts .webp but it was missing here
            ".tiff",
            ".bmp",
            ".gif",
            ".svg",  # NOTE(review): PIL cannot open SVG — confirm handling
        ):
            info["type"] = "image"
            img = Image.open(file.name)
            info["dimensions"] = "{}x{}".format(img.size[0], img.size[1])
        else:
            # BUGFIX: files with unrecognized extensions previously had no
            # "type" key, which made update() crash with a KeyError.
            info["type"] = "file"
        results.append(info)
    return results
100
+
101
+
102
def get_completion(prompt, files_info, top_p, temperature, model_choice):
    """Ask the selected LLM to produce a single ffmpeg shell command.

    Args:
        prompt: natural-language description of the desired output video.
        files_info: metadata dicts as produced by get_files_infos().
        top_p: nucleus-sampling value forwarded to the API.
        temperature: sampling temperature forwarded to the API.
        model_choice: key into MODELS selecting the backend.

    Returns:
        str: the generated ffmpeg command with the literal "output.mp4"
        token stripped — the caller appends its own output path.

    Raises:
        Exception: if the model is unknown or the API call fails (the
        original cause is chained for debugging).
    """
    # Render the asset list as a markdown table for the prompt.
    files_info_string = "| Type | Name | Dimensions | Duration | Audio Channels |\n"
    files_info_string += "|------|------|------------|-----------|--------|\n"

    # Add each file as a table row
    for file_info in files_info:
        dimensions = file_info.get("dimensions", "-")
        duration = (
            f"{file_info.get('duration', '-')}s" if "duration" in file_info else "-"
        )
        audio = (
            f"{file_info.get('audio_channels', '-')} channels"
            if "audio_channels" in file_info
            else "-"
        )

        files_info_string += f"| {file_info['type']} | {file_info['name']} | {dimensions} | {duration} | {audio} |\n"

    messages = [
        {
            "role": "system",
            "content": """
You are a very experienced media engineer, controlling a UNIX terminal.
You are an FFMPEG expert with years of experience and multiple contributions to the FFMPEG project.
You are given:
(1) a set of video, audio and/or image assets. Including their name, duration, dimensions and file size
(2) the description of a new video you need to create from the list of assets
Your objective is to generate the SIMPLEST POSSIBLE single ffmpeg command to create the requested video.
Key requirements:
- Use the absolute minimum number of ffmpeg options needed
- Avoid complex filter chains or filter_complex if possible
- Prefer simple concatenation, scaling, and basic filters
- Output exactly ONE command that will be directly pasted into the terminal
- Never output multiple commands chained together
- Output the command in a single line (no line breaks or multiple lines)
- If the user asks for waveform visualization make sure to set the mode to `line` with and the use the full width of the video. Also concatenate the audio into a single channel.
- For image sequences: Use -framerate and pattern matching (like 'img%d.jpg') when possible, falling back to individual image processing with -loop 1 and appropriate filters only when necessary.
- When showing file operations or commands, always use explicit paths and filenames without wildcards - avoid using asterisk (*) or glob patterns. Instead, use specific numbered sequences (like %d), explicit file lists, or show the full filename.
Remember: Simpler is better. Only use advanced ffmpeg features if absolutely necessary for the requested output.
""",
        },
        {
            "role": "user",
            "content": f"""Always output the media as video/mp4 and output file with "output.mp4". Provide only the shell command without any explanations.
The current assets and objective follow. Reply with the FFMPEG command:
AVAILABLE ASSETS LIST:
{files_info_string}
OBJECTIVE: {prompt} and output at "output.mp4"
YOUR FFMPEG COMMAND:
""",
        },
    ]
    try:
        # Log the full prompt for debugging.
        print("\n=== COMPLETE PROMPT ===")
        for msg in messages:
            print(f"\n[{msg['role'].upper()}]:")
            print(msg["content"])
        print("=====================\n")

        if model_choice not in MODELS:
            raise ValueError(f"Model {model_choice} is not supported")

        model_config = MODELS[model_choice]
        # NOTE(review): mutating the shared client is not thread-safe;
        # acceptable for a queued Gradio app, but confirm concurrency limits.
        client.base_url = model_config["base_url"]
        client.api_key = os.environ[model_config["env_key"]]
        # DeepSeek's API expects its own model alias rather than the HF id.
        model = "deepseek-chat" if "deepseek" in model_choice.lower() else model_choice

        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=2048,
        )
        content = completion.choices[0].message.content
        # Extract the command from a fenced code block if one is present.
        if "```" in content:
            import re

            match = re.search(r"```(?:sh|bash)?\n(.*?)\n```", content, re.DOTALL)
            if match:
                command = match.group(1).strip()
            else:
                command = content.replace("\n", "")
        else:
            command = content.replace("\n", "")

        # Strip the placeholder output name; update() appends the real path.
        command = command.replace("output.mp4", "")

        return command
    except Exception as e:
        # BUGFIX: chain and surface the underlying error instead of the
        # information-free bare Exception("API Error").
        raise Exception(f"API Error: {e}") from e
198
+
199
+
200
def update(
    files,
    prompt,
    top_p=1,
    temperature=1,
    model_choice="Qwen/Qwen2.5-Coder-32B-Instruct",
):
    """Validate inputs, generate an ffmpeg command via the LLM, dry-run it,
    then execute it for real.

    Args:
        files: uploaded Gradio file objects.
        prompt: user instructions for the video to compose.
        top_p: nucleus-sampling value forwarded to the model.
        temperature: sampling temperature forwarded to the model.
        model_choice: key into MODELS selecting the backend.

    Returns:
        tuple: (path to the generated mp4, gr.update with a markdown block
        showing the executed command).

    Raises:
        gr.Error: on validation failure or after two failed attempts.
    """
    if prompt == "":
        raise gr.Error("Please enter a prompt.")

    files_info = get_files_infos(files)
    # disable this if you're running the app locally or on your own server
    for file_info in files_info:
        if file_info["type"] == "video":
            if file_info["duration"] > 120:
                raise gr.Error(
                    "Please make sure all videos are less than 2 minute long."
                )
        if file_info["size"] > 100000000:
            raise gr.Error("Please make sure all files are less than 100MB in size.")

    attempts = 0
    while attempts < 2:
        print("ATTEMPT", attempts)
        try:
            command_string = get_completion(
                prompt, files_info, top_p, temperature, model_choice
            )
            # BUGFIX: "PROMTP" typo in the log message.
            print(
                f"""///PROMPT {prompt} \n\n/// START OF COMMAND ///:\n\n{command_string}\n\n/// END OF COMMAND ///\n\n"""
            )

            # Parse into argv form; never hand LLM output to a shell.
            args = shlex.split(command_string)
            if args[0] != "ffmpeg":
                raise Exception("Command does not start with ffmpeg")
            # NOTE(review): temp_dir is never removed — it must outlive this
            # call because the output file lives there; consider a periodic
            # cleanup job.
            temp_dir = tempfile.mkdtemp()
            # Copy assets under their sanitized names so the space-free
            # filenames referenced by the command resolve.
            for file in files:
                file_path = Path(file.name)
                sanitized_name = file_path.name.replace(" ", "_")
                shutil.copy(file_path, Path(temp_dir) / sanitized_name)

            # Validate the command cheaply with a null muxer dry run.
            ffmpg_dry_run = subprocess.run(
                args + ["-f", "null", "-"],
                stderr=subprocess.PIPE,
                text=True,
                cwd=temp_dir,
            )
            if ffmpg_dry_run.returncode == 0:
                print("Command is valid.")
            else:
                print("Command is not valid. Error output:")
                print(ffmpg_dry_run.stderr)
                raise Exception(
                    "FFMPEG generated command is not valid. Please try something else."
                )

            output_file_name = f"output_{uuid.uuid4()}.mp4"
            output_file_path = str((Path(temp_dir) / output_file_name).resolve())
            final_command = args + ["-y", output_file_path]
            print(
                f"\n=== EXECUTING FFMPEG COMMAND ===\nffmpeg {' '.join(final_command[1:])}\n"
            )
            # BUGFIX: check the real run's exit code instead of silently
            # returning a path to a file that may never have been written.
            result = subprocess.run(final_command, cwd=temp_dir)
            if result.returncode != 0:
                raise Exception("FFMPEG command failed during execution.")
            generated_command = f"### Generated Command\n```bash\nffmpeg {' '.join(args[1:])} -y output.mp4\n```"
            return output_file_path, gr.update(value=generated_command)
        except Exception as e:
            attempts += 1
            if attempts >= 2:
                print("FROM UPDATE", e)
                # BUGFIX: gr.Error expects a message string, not an
                # exception object.
                raise gr.Error(str(e))
273
+
274
+
275
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🏞 AI Video Composer
        Compose new videos from your assets using natural language. Add video, image and audio assets and let [Qwen2.5-Coder](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) or [DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base) generate a new video for you (using FFMPEG).
        """,
        elem_id="header",
    )
    with gr.Row():
        # Left column: inputs and generation parameters.
        with gr.Column():
            user_files = gr.File(
                file_count="multiple",
                label="Media files",
                file_types=allowed_medias,
            )
            user_prompt = gr.Textbox(
                placeholder="eg: Remove the 3 first seconds of the video",
                label="Instructions",
            )
            btn = gr.Button("Run")
            with gr.Accordion("Parameters", open=False):
                model_names = list(MODELS.keys())
                model_choice = gr.Radio(
                    choices=model_names,
                    value=model_names[0],
                    label="Model",
                )
                top_p = gr.Slider(
                    minimum=0,
                    maximum=1.0,
                    value=0.7,
                    step=0.05,
                    interactive=True,
                    label="Top-p (nucleus sampling)",
                )
                temperature = gr.Slider(
                    minimum=0,
                    maximum=5.0,
                    value=0.1,
                    step=0.1,
                    interactive=True,
                    label="Temperature",
                )
        # Right column: generated output.
        with gr.Column():
            generated_video = gr.Video(
                interactive=False, label="Generated Video", include_audio=True
            )
            generated_command = gr.Markdown()

        btn.click(
            fn=update,
            inputs=[user_files, user_prompt, top_p, temperature, model_choice],
            outputs=[generated_video, generated_command],
        )
    with gr.Row():
        # Prefer the second configured model for examples when available.
        example_model_names = list(MODELS.keys())
        secondary_model = (
            example_model_names[1]
            if len(example_model_names) > 1
            else example_model_names[0]
        )
        gr.Examples(
            examples=[
                [
                    ["./examples/ai_talk.wav", "./examples/bg-image.png"],
                    "Use the image as the background with a waveform visualization for the audio positioned in center of the video.",
                    0.7,
                    0.1,
                    secondary_model,
                ],
                [
                    ["./examples/ai_talk.wav", "./examples/bg-image.png"],
                    "Use the image as the background with a waveform visualization for the audio positioned in center of the video. Make sure the waveform has a max height of 250 pixels.",
                    0.7,
                    0.1,
                    example_model_names[0],
                ],
                [
                    [
                        "./examples/cat1.jpeg",
                        "./examples/cat2.jpeg",
                        "./examples/cat3.jpeg",
                        "./examples/cat4.jpeg",
                        "./examples/cat5.jpeg",
                        "./examples/cat6.jpeg",
                        "./examples/heat-wave.mp3",
                    ],
                    "Create a 3x2 grid of the cat images with the audio as background music. Make the video duration match the audio duration.",
                    0.7,
                    0.1,
                    secondary_model,
                ],
            ],
            inputs=[user_files, user_prompt, top_p, temperature, model_choice],
            outputs=[generated_video, generated_command],
            fn=update,
            run_on_click=True,
            cache_examples=False,
        )

    with gr.Row():
        gr.Markdown(
            """
            If you have idea to improve this please open a PR:
            [![Open a Pull Request](https://huggingface.co/datasets/huggingface/badges/raw/main/open-a-pr-lg-light.svg)](https://huggingface.co/spaces/huggingface-projects/video-composer-gpt4/discussions)
            """,
        )

demo.queue(default_concurrency_limit=200)
demo.launch(show_api=False, ssr_mode=False)