VEO3-RealTime

Running on Zero

App Files Community

seawolf2357 committed on Jun 19

Commit

a34249d

verified ·

1 Parent(s): 374f68b

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -28

app.py CHANGED Viewed

@@ -68,7 +68,7 @@ T2V_CINEMATIC_PROMPT = \
     '''1. For overly concise user inputs, reasonably infer and add details to make the video more complete and appealing without altering the original intent;\n''' \
     '''2. Enhance the main features in user descriptions (e.g., appearance, expression, quantity, race, posture, etc.), visual style, spatial relationships, and shot scales;\n''' \
     '''3. Output the entire prompt in English, retaining original text in quotes and titles, and preserving key input information;\n''' \
-    '''4. Prompts should match the user’s intent and accurately reflect the specified style. If the user does not specify a style, choose the most appropriate style for the video;\n''' \
     '''5. Emphasize motion information and different camera movements present in the input description;\n''' \
     '''6. Your output should have natural motion attributes. For the target category described, add natural actions of the target using simple and direct verbs;\n''' \
     '''7. The revised prompt should be around 80-100 words long.\n''' \
@@ -273,19 +273,23 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
         conditional_dict[key] = value.to(dtype=torch.float16)
     rnd = torch.Generator(gpu).manual_seed(int(seed))
     pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
     pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
-    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
     vae_cache, latents_cache = None, None
     if not APP_STATE["current_use_taehv"] and not args.trt:
         vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
-    num_blocks = 7
     current_start_frame = 0
     all_num_frames = [pipeline.num_frame_per_block] * num_blocks
     total_frames_yielded = 0
     # Ensure temp directory exists
     os.makedirs("gradio_tmp", exist_ok=True)
@@ -352,6 +356,7 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
             frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
             all_frames_from_block.append(frame_np)
             total_frames_yielded += 1
             # Yield status update for each frame (cute tracking!)
@@ -375,7 +380,7 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
             )
             # Yield None for video but update status (frame-by-frame tracking)
-            yield None, frame_status_html
         # Encode entire block as one chunk immediately
         if all_frames_from_block:
@@ -392,7 +397,7 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
                 total_progress = (idx + 1) / num_blocks * 100
                 # Yield the actual video chunk
-                yield ts_path, gr.update()
             except Exception as e:
                 print(f"⚠️ Error encoding block {idx}: {e}")
@@ -400,31 +405,82 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
                 traceback.print_exc()
         current_start_frame += current_num_frames
     # Final completion status
-    final_status_html = (
-        f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
-        f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
-        f"    <span style='font-size: 24px; margin-right: 12px;'>🎉</span>"
-        f"    <h4 style='margin: 0; color: #0f5132; font-size: 18px;'>Stream Complete!</h4>"
-        f"  </div>"
-        f"  <div style='background: rgba(255,255,255,0.7); padding: 8px; border-radius: 4px;'>"
-        f"    <p style='margin: 0; color: #0f5132; font-weight: 500;'>"
-        f"      📊 Generated {total_frames_yielded} frames across {num_blocks} blocks"
-        f"    </p>"
-        f"    <p style='margin: 4px 0 0 0; color: #0f5132; font-size: 14px;'>"
-        f"      🎬 Playback: {fps} FPS • 📁 Format: MPEG-TS/H.264"
-        f"    </p>"
-        f"  </div>"
-        f"</div>"
-    )
-    yield None, final_status_html
-    print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
-    gr.Markdown("# 🚀 Self-Forcing Video Generation")
-    gr.Markdown("Real-time video generation with distilled Wan2-1 1.3B [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
     with gr.Row():
         with gr.Column(scale=2):
@@ -471,12 +527,13 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
             gr.Markdown("### 📺 Video Stream")
             streaming_video = gr.Video(
-                label="Live Stream",
                 streaming=True,
                 loop=True,
                 height=400,
                 autoplay=True,
-                show_label=False
             )
             status_display = gr.HTML(
@@ -488,12 +545,18 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
                 ),
                 label="Generation Status"
             )
     # Connect the generator to the streaming video
     start_btn.click(
         fn=video_generation_handler_streaming,
         inputs=[prompt, seed, fps],
-        outputs=[streaming_video, status_display]
     )
     enhance_button.click(

     '''1. For overly concise user inputs, reasonably infer and add details to make the video more complete and appealing without altering the original intent;\n''' \
     '''2. Enhance the main features in user descriptions (e.g., appearance, expression, quantity, race, posture, etc.), visual style, spatial relationships, and shot scales;\n''' \
     '''3. Output the entire prompt in English, retaining original text in quotes and titles, and preserving key input information;\n''' \
+    '''4. Prompts should match the user's intent and accurately reflect the specified style. If the user does not specify a style, choose the most appropriate style for the video;\n''' \
     '''5. Emphasize motion information and different camera movements present in the input description;\n''' \
     '''6. Your output should have natural motion attributes. For the target category described, add natural actions of the target using simple and direct verbs;\n''' \
     '''7. The revised prompt should be around 80-100 words long.\n''' \
         conditional_dict[key] = value.to(dtype=torch.float16)
     rnd = torch.Generator(gpu).manual_seed(int(seed))
+    # KV 캐시 초기화
     pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
     pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
+    # 5.5초 영상을 위해 노이즈 텐서 크기 증가 (21 -> 24)
+    noise = torch.randn([1, 24, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
     vae_cache, latents_cache = None, None
     if not APP_STATE["current_use_taehv"] and not args.trt:
         vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
+    num_blocks = 8  # 7 -> 8로 증가하여 약 5.5초 영상 생성
     current_start_frame = 0
     all_num_frames = [pipeline.num_frame_per_block] * num_blocks
     total_frames_yielded = 0
+    all_frames_for_download = []  # 다운로드용 전체 프레임 저장
     # Ensure temp directory exists
     os.makedirs("gradio_tmp", exist_ok=True)
             frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
             all_frames_from_block.append(frame_np)
+            all_frames_for_download.append(frame_np)  # 다운로드용 프레임 저장
             total_frames_yielded += 1
             # Yield status update for each frame (cute tracking!)
             )
             # Yield None for video but update status (frame-by-frame tracking)
+            yield None, frame_status_html, gr.update()
         # Encode entire block as one chunk immediately
         if all_frames_from_block:
                 total_progress = (idx + 1) / num_blocks * 100
                 # Yield the actual video chunk
+                yield ts_path, gr.update(), gr.update()
             except Exception as e:
                 print(f"⚠️ Error encoding block {idx}: {e}")
                 traceback.print_exc()
         current_start_frame += current_num_frames
+        # 메모리 효율성을 위한 GPU 캐시 정리
+        if idx < num_blocks - 1 and idx % 2 == 1:  # 2블록마다 캐시 정리
+            torch.cuda.empty_cache()
     # Final completion status
+    video_duration = total_frames_yielded / fps
+    # 전체 비디오를 MP4로 저장
+    if all_frames_for_download:
+        output_filename = f"generated_video_{int(time.time())}_{seed}.mp4"
+        output_path = os.path.join("gradio_tmp", output_filename)
+        print(f"💾 Saving complete video to {output_path}")
+        # MP4 컨테이너로 저장
+        container = av.open(output_path, mode='w')
+        stream = container.add_stream('h264', rate=fps)
+        stream.width = all_frames_for_download[0].shape[1]
+        stream.height = all_frames_for_download[0].shape[0]
+        stream.pix_fmt = 'yuv420p'
+        stream.options = {
+            'crf': '23',
+            'preset': 'medium'
+        }
+        for frame_np in all_frames_for_download:
+            frame = av.VideoFrame.from_ndarray(frame_np, format='rgb24')
+            frame = frame.reformat(format=stream.pix_fmt)
+            for packet in stream.encode(frame):
+                container.mux(packet)
+        for packet in stream.encode():
+            container.mux(packet)
+        container.close()
+        # 파일 크기 계산
+        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
+        final_status_html = (
+            f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
+            f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
+            f"    <span style='font-size: 24px; margin-right: 12px;'>🎉</span>"
+            f"    <h4 style='margin: 0; color: #0f5132; font-size: 18px;'>Video Generation Complete!</h4>"
+            f"  </div>"
+            f"  <div style='background: rgba(255,255,255,0.7); padding: 12px; border-radius: 4px;'>"
+            f"    <p style='margin: 0 0 8px 0; color: #0f5132; font-weight: 500;'>"
+            f"      📊 Generated {total_frames_yielded} frames across {num_blocks} blocks ({video_duration:.1f} seconds)"
+            f"    </p>"
+            f"    <p style='margin: 0; color: #0f5132; font-size: 14px;'>"
+            f"      🎬 Resolution: {all_frames_for_download[0].shape[1]}x{all_frames_for_download[0].shape[0]} • FPS: {fps} • Size: {file_size_mb:.1f} MB"
+            f"    </p>"
+            f"    <p style='margin: 8px 0 0 0; color: #0f5132; font-size: 13px; font-style: italic;'>"
+            f"      💾 Click the download button below to save your video!"
+            f"    </p>"
+            f"  </div>"
+            f"</div>"
+        )
+        # 최종 비디오 파일 경로도 함께 반환
+        yield output_path, final_status_html, gr.update(value=output_path, visible=True)
+    else:
+        final_status_html = (
+            f"<div style='padding: 16px; border: 1px solid #dc3545; background: #f8d7da; border-radius: 8px;'>"
+            f"  <h4 style='margin: 0; color: #721c24;'>⚠️ No frames were generated</h4>"
+            f"</div>"
+        )
+        yield None, final_status_html, gr.update()
+    print(f"✅ Video generation complete! {total_frames_yielded} frames ({video_duration:.1f} seconds)")
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
+    gr.Markdown("# 🚀 Self-Forcing Video Generation (6-second)")
+    gr.Markdown("Real-time 6-second video generation with distilled Wan2-1 1.3B [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("### 📺 Video Stream")
             streaming_video = gr.Video(
+                label="Live Stream & Download",
                 streaming=True,
                 loop=True,
                 height=400,
                 autoplay=True,
+                show_label=True,
+                show_download_button=True  # 다운로드 버튼 활성화
             )
             status_display = gr.HTML(
                 ),
                 label="Generation Status"
             )
+            # 다운로드용 파일 출력
+            download_file = gr.File(
+                label="📥 Download Video",
+                visible=False
+            )
     # Connect the generator to the streaming video
     start_btn.click(
         fn=video_generation_handler_streaming,
         inputs=[prompt, seed, fps],
+        outputs=[streaming_video, status_display, download_file]
     )
     enhance_button.click(