# Source: Hugging Face Space "SpyC0der77/AI-Video-Stabilization" (status: Running).
# File size: 9,181 bytes.

import cv2
import numpy as np
import csv
import math
import torch
import tempfile
import os
import gradio as gr

# Load the RAFT model from torch.hub (uses the 'raft_small' variant)
# NOTE: everything below runs at import time, so importing this module
# triggers the model load (torch.hub fetches/caches the repo as needed).
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# NOTE(review): this assumes the "princeton-vl/RAFT" repo exposes a hub entry
# point named "raft_small" that accepts `pretrained=True` — confirm.
model = torch.hub.load("princeton-vl/RAFT", "raft_small", pretrained=True)
# Move the weights to the selected device; input tensors are moved to the
# same `device` by the functions below before the model is called.
model = model.to(device)
# Put the module in evaluation mode (inference only; no training here).
model.eval()

def _frame_to_tensor(frame):
    """Convert a BGR frame from OpenCV into a 1x3xHxW float tensor on ``device``.

    Pixel values are scaled by 1/255, matching what this file has always fed
    to the model.
    NOTE(review): some RAFT implementations expect inputs in [0, 255] —
    confirm the expected input range for the model loaded above.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    tensor = torch.from_numpy(rgb).permute(2, 0, 1).float().unsqueeze(0) / 255.0
    return tensor.to(device)


def generate_motion_csv(video_file, output_csv=None):
    """
    Uses the RAFT model to compute optical flow between consecutive frames,
    then writes a CSV file (with columns: frame, mag, ang, zoom) where:
      - frame: 1-based number of the *current* frame of each (previous, current)
        pair, so the first data row is frame 2. This matches the 1-based frame
        counter used when the CSV is applied during stabilization; the first
        frame is the reference and has no row.
      - mag: magnitude of the median flow vector (per-axis median of dx, dy),
      - ang: direction of that median flow vector, in degrees within [0, 360), and
      - zoom: fraction of pixels moving away from the image center.

    The median is taken on the flow components rather than on per-pixel angles:
    angles wrap around at 0/360 degrees, so a median of raw angles can point in
    the opposite direction for motion along the positive x axis.

    Args:
        video_file (str): Path to the input video.
        output_csv (str): Optional path for output CSV file. If None, a temporary file is used.

    Returns:
        output_csv (str): Path to the generated CSV file.

    Raises:
        ValueError: If the video cannot be opened or its first frame cannot be read.
    """
    if output_csv is None:
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
        output_csv = temp_file.name
        temp_file.close()

    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        raise ValueError("Could not open video file for CSV generation.")

    # try/finally guarantees the capture is released even when reading a frame
    # or running the model raises.
    try:
        with open(output_csv, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['frame', 'mag', 'ang', 'zoom'])
            writer.writeheader()

            ret, prev_frame = cap.read()
            if not ret:
                raise ValueError("Cannot read first frame from video.")
            prev_tensor = _frame_to_tensor(prev_frame)

            # Pixel offsets from the image center depend only on the frame
            # size, so they are built once and reused for every frame.
            x_offset = y_offset = None

            # The first frame is frame 1 (the reference); the first flow row
            # therefore describes the motion arriving at frame 2.
            frame_idx = 2
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                curr_tensor = _frame_to_tensor(frame)

                # Use RAFT to compute optical flow between previous and current frame.
                with torch.no_grad():
                    # The RAFT model returns a low-resolution flow and an
                    # upsampled (high-res) flow; only the latter is used.
                    _, flow_up = model(prev_tensor, curr_tensor, iters=20, test_mode=True)
                # Convert flow to numpy array (shape: H x W x 2)
                flow = flow_up[0].permute(1, 2, 0).cpu().numpy()
                flow_x, flow_y = flow[..., 0], flow[..., 1]

                # Robust global motion estimate: per-axis median, then polar form.
                median_dx = float(np.median(flow_x))
                median_dy = float(np.median(flow_y))
                median_mag = math.hypot(median_dx, median_dy)
                median_ang = math.degrees(math.atan2(median_dy, median_dx)) % 360.0

                # Compute a "zoom factor": fraction of pixels moving away from the center.
                h, w = flow.shape[:2]
                if x_offset is None or x_offset.shape != (h, w):
                    x_coords, y_coords = np.meshgrid(np.arange(w), np.arange(h))
                    x_offset = x_coords - w / 2
                    y_offset = y_coords - h / 2
                # A positive dot product between a flow vector and the pixel's
                # offset from the center means that pixel moves outward.
                dot = flow_x * x_offset + flow_y * y_offset
                zoom_factor = np.count_nonzero(dot > 0) / (w * h)

                # Write the computed metrics to the CSV file.
                writer.writerow({
                    'frame': frame_idx,
                    'mag': median_mag,
                    'ang': median_ang,
                    'zoom': zoom_factor
                })

                # curr_tensor is freshly allocated every iteration, so it can
                # be reused as the next "previous" frame without copying.
                prev_tensor = curr_tensor
                frame_idx += 1
    finally:
        cap.release()

    print(f"Motion CSV generated: {output_csv}")
    return output_csv

def read_motion_csv(csv_filename):
    """
    Load per-frame motion rows from a CSV file (columns: frame, mag, ang, zoom)
    and turn them into running stabilization offsets.

    Each row's polar motion (mag, ang in degrees) is converted to Cartesian
    components and added to a running total; the value stored for a frame is
    the negated total up to and including that row. The zoom column is ignored.

    Args:
        csv_filename (str): Path to the motion CSV file.

    Returns:
        A dictionary mapping frame numbers to (dx, dy) offsets (the negative cumulative displacement).
    """
    offsets = {}
    total_x = total_y = 0.0
    with open(csv_filename, 'r') as handle:
        for record in csv.DictReader(handle):
            index = int(record['frame'])
            magnitude = float(record['mag'])
            # The CSV stores the angle in degrees; trig functions need radians.
            theta = math.radians(float(record['ang']))
            total_x += magnitude * math.cos(theta)
            total_y += magnitude * math.sin(theta)
            # Negate so that applying the offset cancels the accumulated motion.
            offsets[index] = (-total_x, -total_y)
    return offsets

def stabilize_video_using_csv(video_file, csv_file, zoom=1.0, output_file=None):
    """
    Stabilizes the input video using motion data from the CSV file.

    Frames are counted from 1 in read order; each frame is translated by the
    (dx, dy) offset stored for its number in the CSV, or left unshifted when
    the CSV has no row for it.

    Args:
        video_file (str): Path to the input video.
        csv_file (str): Path to the motion CSV file.
        zoom (float): Zoom factor to apply before stabilization (default: 1.0, no zoom).
        output_file (str): Path for the output stabilized video. If None, a temporary file is created.

    Returns:
        output_file (str): Path to the stabilized video file.

    Raises:
        ValueError: If the input video cannot be opened or the output video
            cannot be opened for writing.
    """
    # Read motion data from CSV
    motion_data = read_motion_csv(csv_file)

    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        raise ValueError("Could not open video file for stabilization.")

    out = None
    # try/finally guarantees both the reader and the writer are released (and
    # the output file finalized) even if processing a frame raises.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report 0 (or NaN) fps; `not fps > 0` catches both.
        # Fall back to a conventional rate so the writer gets a usable value.
        if not fps > 0:
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        if output_file is None:
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
            output_file = temp_file.name
            temp_file.close()

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
        # Without this check a writer that failed to open silently drops every
        # frame and the caller receives a path to an empty file.
        if not out.isOpened():
            raise ValueError("Could not open output video file for writing.")

        frame_num = 1
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Optionally apply zoom (resize and center-crop)
            if zoom != 1.0:
                zoomed_frame = cv2.resize(frame, None, fx=zoom, fy=zoom, interpolation=cv2.INTER_LINEAR)
                zoomed_h, zoomed_w = zoomed_frame.shape[:2]
                # max(..., 0) keeps the crop origin valid when the zoomed
                # frame is smaller than the output size (zoom < 1).
                start_x = max((zoomed_w - width) // 2, 0)
                start_y = max((zoomed_h - height) // 2, 0)
                frame = zoomed_frame[start_y:start_y+height, start_x:start_x+width]

            # Get the stabilization offset for the current frame (default to (0,0) if not available)
            dx, dy = motion_data.get(frame_num, (0, 0))

            # Apply an affine transformation to counteract the motion:
            # a pure translation by (dx, dy), rendered at the original size.
            transform = np.array([[1, 0, dx],
                                  [0, 1, dy]], dtype=np.float32)
            stabilized_frame = cv2.warpAffine(frame, transform, (width, height))

            out.write(stabilized_frame)
            frame_num += 1
    finally:
        cap.release()
        if out is not None:
            out.release()

    print(f"Stabilized video saved to: {output_file}")
    return output_file

def process_video_ai(video_file, zoom):
    """
    Gradio interface function: Given an input video and a zoom factor,
    it uses a deep learning model (RAFT) to generate motion data (video.flow.csv)
    and then stabilizes the video based on that data.

    The motion CSV is an intermediate temporary file; it is deleted once
    stabilization finishes (or fails) so repeated requests do not accumulate
    files on disk.

    Args:
        video_file (str | dict | None): Path to the uploaded video, or a dict
            carrying the path under the "name" key.
        zoom (float): Zoom factor forwarded to the stabilization step.

    Returns:
        A tuple containing the original video file path and the stabilized video file path.

    Raises:
        ValueError: If no video file was provided.
    """
    # Ensure the input is a file path (if provided as a dict, extract the "name")
    if isinstance(video_file, dict):
        video_file = video_file.get("name")
    if video_file is None:
        raise ValueError("Please upload a video file.")

    # Generate motion CSV using AI-based optical flow (RAFT)
    csv_file = generate_motion_csv(video_file)
    try:
        # Stabilize the video using the generated CSV data
        stabilized_path = stabilize_video_using_csv(video_file, csv_file, zoom=zoom)
    finally:
        # Best-effort cleanup: a CSV that is already gone or cannot be removed
        # must not mask the real result (or the real error) of stabilization.
        try:
            os.remove(csv_file)
        except OSError:
            pass
    return video_file, stabilized_path

# Build the Gradio interface
# Components created inside the `with` blocks are attached to the enclosing
# container, so the nesting below defines the page layout: a title and a
# description, followed by one row split into two columns.
with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered Video Stabilization")
    gr.Markdown("Upload a video and select a zoom factor. The system will automatically use a deep learning model (RAFT) to generate motion data and then stabilize the video.")
    
    with gr.Row():
        # First column: user inputs and the trigger button.
        with gr.Column():
            video_input = gr.Video(label="Input Video")
            # Zoom is limited to 1.0-2.0 in 0.1 steps; 1.0 means no zoom.
            zoom_slider = gr.Slider(minimum=1.0, maximum=2.0, step=0.1, value=1.0, label="Zoom Factor")
            process_button = gr.Button("Process Video")
        # Second column: the two videos returned by process_video_ai.
        with gr.Column():
            original_video = gr.Video(label="Original Video")
            stabilized_video = gr.Video(label="Stabilized Video")
    
    # Wire the button: process_video_ai receives (video, zoom) and its
    # 2-tuple return value fills the two output players in order.
    process_button.click(
        fn=process_video_ai,
        inputs=[video_input, zoom_slider],
        outputs=[original_video, stabilized_video]
    )

# Start the Gradio app. This runs at module level, so importing this file
# also launches the interface.
demo.launch()