Spaces:

SpyC0der77
/

AI-Video-Stabilization

Running

App Files Community

SpyC0der77 committed on Mar 15

Commit

89bc003

verified ·

1 Parent(s): 7561365

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -60

app.py CHANGED Viewed

@@ -7,24 +7,32 @@ import tempfile
 import os
 import gradio as gr
-# Load the RAFT model from torch.hub (uses the 'raft_small' variant)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
-model = torch.hub.load("princeton-vl/RAFT", "raft_small", pretrained=True)
-model = model.to(device)
-model.eval()
 def generate_motion_csv(video_file, output_csv=None):
     """
-    Uses the RAFT model to compute optical flow between consecutive frames,
-    then writes a CSV file (with columns: frame, mag, ang, zoom) where:
-      - mag: median magnitude of the flow,
-      - ang: median angle (in degrees), and
-      - zoom: fraction of pixels moving away from the image center.
     Args:
         video_file (str): Path to the input video.
-        output_csv (str): Optional path for output CSV file. If None, a temporary file is used.
     Returns:
         output_csv (str): Path to the generated CSV file.
@@ -38,40 +46,46 @@ def generate_motion_csv(video_file, output_csv=None):
     if not cap.isOpened():
         raise ValueError("Could not open video file for CSV generation.")
-    # Prepare CSV file for writing
     with open(output_csv, 'w', newline='') as csvfile:
         fieldnames = ['frame', 'mag', 'ang', 'zoom']
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
         writer.writeheader()
-        ret, prev_frame = cap.read()
         if not ret:
             raise ValueError("Cannot read first frame from video.")
-        # Convert the first frame to tensor
-        prev_frame_rgb = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2RGB)
-        prev_tensor = torch.from_numpy(prev_frame_rgb).permute(2,0,1).float().unsqueeze(0) / 255.0
-        prev_tensor = prev_tensor.to(device)
         frame_idx = 1
         while True:
             ret, frame = cap.read()
             if not ret:
                 break
-            curr_frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            curr_tensor = torch.from_numpy(curr_frame_rgb).permute(2,0,1).float().unsqueeze(0) / 255.0
-            curr_tensor = curr_tensor.to(device)
-            # Use RAFT to compute optical flow between previous and current frame.
-            with torch.no_grad():
-                # The RAFT model returns a low-resolution flow and an upsampled (high-res) flow.
-                flow_low, flow_up = model(prev_tensor, curr_tensor, iters=20, test_mode=True)
-            # Convert flow to numpy array (shape: H x W x 2)
-            flow = flow_up[0].permute(1,2,0).cpu().numpy()
-            # Compute median magnitude and angle for the optical flow
-            mag, ang = cv2.cartToPolar(flow[...,0], flow[...,1], angleInDegrees=True)
             median_mag = np.median(mag)
             median_ang = np.median(ang)
@@ -81,11 +95,9 @@ def generate_motion_csv(video_file, output_csv=None):
             x_coords, y_coords = np.meshgrid(np.arange(w), np.arange(h))
             x_offset = x_coords - center_x
             y_offset = y_coords - center_y
-            # Dot product between flow vectors and pixel offsets:
-            dot = flow[...,0] * x_offset + flow[...,1] * y_offset
             zoom_factor = np.count_nonzero(dot > 0) / (w * h)
-            # Write the computed metrics to the CSV file.
             writer.writerow({
                 'frame': frame_idx,
                 'mag': median_mag,
@@ -93,21 +105,19 @@ def generate_motion_csv(video_file, output_csv=None):
                 'zoom': zoom_factor
             })
-            # Update for next iteration
-            prev_tensor = curr_tensor.clone()
             frame_idx += 1
     cap.release()
     print(f"Motion CSV generated: {output_csv}")
     return output_csv
 def read_motion_csv(csv_filename):
     """
-    Reads the CSV file (columns: frame, mag, ang, zoom) and computes a cumulative
-    offset per frame to be used for stabilization.
     Returns:
-        A dictionary mapping frame numbers to (dx, dy) offsets (the negative cumulative displacement).
     """
     motion_data = {}
     cumulative_dx = 0.0
@@ -118,13 +128,11 @@ def read_motion_csv(csv_filename):
             frame_num = int(row['frame'])
             mag = float(row['mag'])
             ang = float(row['ang'])
-            # Convert angle (in degrees) to radians.
             rad = math.radians(ang)
             dx = mag * math.cos(rad)
             dy = mag * math.sin(rad)
             cumulative_dx += dx
             cumulative_dy += dy
-            # Negative cumulative offset counteracts the detected motion.
             motion_data[frame_num] = (-cumulative_dx, -cumulative_dy)
     return motion_data
@@ -135,13 +143,12 @@ def stabilize_video_using_csv(video_file, csv_file, zoom=1.0, output_file=None):
     Args:
         video_file (str): Path to the input video.
         csv_file (str): Path to the motion CSV file.
-        zoom (float): Zoom factor to apply before stabilization (default: 1.0, no zoom).
         output_file (str): Path for the output stabilized video. If None, a temporary file is created.
     Returns:
         output_file (str): Path to the stabilized video file.
     """
-    # Read motion data from CSV
     motion_data = read_motion_csv(csv_file)
     cap = cv2.VideoCapture(video_file)
@@ -160,13 +167,12 @@ def stabilize_video_using_csv(video_file, csv_file, zoom=1.0, output_file=None):
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
-    frame_num = 1
     while True:
         ret, frame = cap.read()
         if not ret:
             break
-        # Optionally apply zoom (resize and center-crop)
         if zoom != 1.0:
             zoomed_frame = cv2.resize(frame, None, fx=zoom, fy=zoom, interpolation=cv2.INTER_LINEAR)
             zoomed_h, zoomed_w = zoomed_frame.shape[:2]
@@ -174,16 +180,13 @@ def stabilize_video_using_csv(video_file, csv_file, zoom=1.0, output_file=None):
             start_y = max((zoomed_h - height) // 2, 0)
             frame = zoomed_frame[start_y:start_y+height, start_x:start_x+width]
-        # Get the stabilization offset for the current frame (default to (0,0) if not available)
-        dx, dy = motion_data.get(frame_num, (0, 0))
-        # Apply an affine transformation to counteract the motion.
         transform = np.array([[1, 0, dx],
                               [0, 1, dy]], dtype=np.float32)
         stabilized_frame = cv2.warpAffine(frame, transform, (width, height))
         out.write(stabilized_frame)
-        frame_num += 1
     cap.release()
     out.release()
@@ -192,29 +195,28 @@ def stabilize_video_using_csv(video_file, csv_file, zoom=1.0, output_file=None):
 def process_video_ai(video_file, zoom):
     """
-    Gradio interface function: Given an input video and a zoom factor,
-    it uses a deep learning model (RAFT) to generate motion data (video.flow.csv)
-    and then stabilizes the video based on that data.
     Returns:
-        A tuple containing the original video file path and the stabilized video file path.
     """
-    # Ensure the input is a file path (if provided as a dict, extract the "name")
     if isinstance(video_file, dict):
         video_file = video_file.get("name", None)
     if video_file is None:
         raise ValueError("Please upload a video file.")
-    # Generate motion CSV using AI-based optical flow (RAFT)
     csv_file = generate_motion_csv(video_file)
-    # Stabilize the video using the generated CSV data
     stabilized_path = stabilize_video_using_csv(video_file, csv_file, zoom=zoom)
     return video_file, stabilized_path
-# Build the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# AI-Powered Video Stabilization")
-    gr.Markdown("Upload a video and select a zoom factor. The system will automatically use a deep learning model (RAFT) to generate motion data and then stabilize the video.")
     with gr.Row():
         with gr.Column():

 import os
 import gradio as gr
+# Set up device for torch
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
+# Try to load the RAFT model from torch.hub.
+# If it fails (e.g. due to repository structure changes), we will fall back to OpenCV optical flow.
+try:
+    # Without trust_repo=True, torch.hub may prompt for confirmation before loading an untrusted repo.
+    raft_model = torch.hub.load("princeton-vl/RAFT", "raft_small", pretrained=True, trust_repo=True)
+    raft_model = raft_model.to(device)
+    raft_model.eval()
+    print("RAFT model loaded successfully.")
+except Exception as e:
+    print("Error loading RAFT model:", e)
+    print("Falling back to OpenCV optical flow for motion CSV generation.")
+    raft_model = None
 def generate_motion_csv(video_file, output_csv=None):
     """
+    Generates a CSV file with motion data (columns: frame, mag, ang, zoom) from an input video.
+    If the RAFT model is available, it uses it to compute optical flow; otherwise, it falls back to
+    OpenCV's Farneback optical flow.
     Args:
         video_file (str): Path to the input video.
+        output_csv (str): Optional output CSV file path. If None, a temporary file is created.
     Returns:
         output_csv (str): Path to the generated CSV file.
     if not cap.isOpened():
         raise ValueError("Could not open video file for CSV generation.")
     with open(output_csv, 'w', newline='') as csvfile:
         fieldnames = ['frame', 'mag', 'ang', 'zoom']
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
         writer.writeheader()
+        ret, first_frame = cap.read()
         if not ret:
             raise ValueError("Cannot read first frame from video.")
+        if raft_model is not None:
+            # Convert the first frame to RGB and then to a torch tensor.
+            first_frame_rgb = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
+            prev_tensor = torch.from_numpy(first_frame_rgb).permute(2, 0, 1).float().unsqueeze(0) / 255.0
+            prev_tensor = prev_tensor.to(device)
+        else:
+            prev_gray = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)
         frame_idx = 1
         while True:
             ret, frame = cap.read()
             if not ret:
                 break
+            if raft_model is not None:
+                curr_frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                curr_tensor = torch.from_numpy(curr_frame_rgb).permute(2, 0, 1).float().unsqueeze(0) / 255.0
+                curr_tensor = curr_tensor.to(device)
+                with torch.no_grad():
+                    flow_low, flow_up = raft_model(prev_tensor, curr_tensor, iters=20, test_mode=True)
+                flow = flow_up[0].permute(1, 2, 0).cpu().numpy()
+                prev_tensor = curr_tensor.clone()
+            else:
+                curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray, None,
+                                                    pyr_scale=0.5, levels=3, winsize=15,
+                                                    iterations=3, poly_n=5, poly_sigma=1.2, flags=0)
+                prev_gray = curr_gray
+            # Compute median magnitude and angle of the optical flow.
+            mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1], angleInDegrees=True)
             median_mag = np.median(mag)
             median_ang = np.median(ang)
             x_coords, y_coords = np.meshgrid(np.arange(w), np.arange(h))
             x_offset = x_coords - center_x
             y_offset = y_coords - center_y
+            dot = flow[..., 0] * x_offset + flow[..., 1] * y_offset
             zoom_factor = np.count_nonzero(dot > 0) / (w * h)
             writer.writerow({
                 'frame': frame_idx,
                 'mag': median_mag,
                 'zoom': zoom_factor
             })
             frame_idx += 1
     cap.release()
     print(f"Motion CSV generated: {output_csv}")
     return output_csv
 def read_motion_csv(csv_filename):
     """
+    Reads a motion CSV file (with columns: frame, mag, ang, zoom) and computes a cumulative
+    offset per frame (the negative cumulative displacement) for stabilization.
     Returns:
+        A dictionary mapping frame numbers to (dx, dy) offsets.
     """
     motion_data = {}
     cumulative_dx = 0.0
             frame_num = int(row['frame'])
             mag = float(row['mag'])
             ang = float(row['ang'])
             rad = math.radians(ang)
             dx = mag * math.cos(rad)
             dy = mag * math.sin(rad)
             cumulative_dx += dx
             cumulative_dy += dy
             motion_data[frame_num] = (-cumulative_dx, -cumulative_dy)
     return motion_data
     Args:
         video_file (str): Path to the input video.
         csv_file (str): Path to the motion CSV file.
+        zoom (float): Zoom factor to apply before stabilization (default: 1.0).
         output_file (str): Path for the output stabilized video. If None, a temporary file is created.
     Returns:
         output_file (str): Path to the stabilized video file.
     """
     motion_data = read_motion_csv(csv_file)
     cap = cv2.VideoCapture(video_file)
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
+    frame_idx = 1
     while True:
         ret, frame = cap.read()
         if not ret:
             break
         if zoom != 1.0:
             zoomed_frame = cv2.resize(frame, None, fx=zoom, fy=zoom, interpolation=cv2.INTER_LINEAR)
             zoomed_h, zoomed_w = zoomed_frame.shape[:2]
             start_y = max((zoomed_h - height) // 2, 0)
             frame = zoomed_frame[start_y:start_y+height, start_x:start_x+width]
+        dx, dy = motion_data.get(frame_idx, (0, 0))
         transform = np.array([[1, 0, dx],
                               [0, 1, dy]], dtype=np.float32)
         stabilized_frame = cv2.warpAffine(frame, transform, (width, height))
         out.write(stabilized_frame)
+        frame_idx += 1
     cap.release()
     out.release()
 def process_video_ai(video_file, zoom):
     """
+    Gradio interface function:
+      - Generates motion data (CSV) from the input video using an AI model (RAFT, if available).
+      - Stabilizes the video based on the generated motion data.
     Returns:
+        Tuple containing the original video file path and the stabilized video file path.
     """
     if isinstance(video_file, dict):
         video_file = video_file.get("name", None)
     if video_file is None:
         raise ValueError("Please upload a video file.")
+    # Generate motion CSV using the AI model (or fallback) for optical flow.
     csv_file = generate_motion_csv(video_file)
+    # Stabilize the video using the generated CSV.
     stabilized_path = stabilize_video_using_csv(video_file, csv_file, zoom=zoom)
     return video_file, stabilized_path
+# Build the Gradio UI.
 with gr.Blocks() as demo:
     gr.Markdown("# AI-Powered Video Stabilization")
+    gr.Markdown("Upload a video and select a zoom factor. The system will automatically generate motion data (video.flow.csv) using an AI model (RAFT, if available) and then stabilize the video.")
     with gr.Row():
         with gr.Column():