# File size: 3,589 Bytes
# a772610
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Hugging Face Space: 2D to 3D Stereo Pair Generator using Depth + LaMa Inpainting

import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import requests
import tempfile
import subprocess
import os

# === DEVICE ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === DEPTH MODEL ===
def load_depth_model():
    """Load the ``Intel/dpt-hybrid-midas`` depth network and its preprocessor.

    Both objects are created with ``from_pretrained`` from the same
    checkpoint name; the network is moved to the module-level ``device``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    checkpoint = "Intel/dpt-hybrid-midas"
    depth_net = DPTForDepthEstimation.from_pretrained(checkpoint)
    depth_net = depth_net.to(device)
    preprocessor = DPTFeatureExtractor.from_pretrained(checkpoint)
    return depth_net, preprocessor

@torch.no_grad()
def estimate_depth(image: Image.Image, model, processor):
    """Estimate a normalized depth map at the input image's own resolution.

    The image is shrunk to 384x384 before being handed to the processor, and
    the prediction is then resampled back to the *original* width/height so
    the result can be indexed pixel-for-pixel against the source image.

    Args:
        image: PIL image to analyse.
        model: Depth network; called as ``model(**inputs)`` and expected to
            expose ``predicted_depth`` on its output.
        processor: Preprocessor called with ``images=`` and
            ``return_tensors="pt"``; its result is moved to ``device``.

    Returns:
        A 2-D ``numpy`` array of shape ``(height, width)`` rescaled so its
        minimum is 0 and its maximum is 1. A constant prediction yields an
        all-zero map instead of dividing by zero.
    """
    # PIL reports (width, height); interpolate() expects (height, width).
    orig_width, orig_height = image.size
    network_input = image.resize((384, 384))
    inputs = processor(images=network_input, return_tensors="pt").to(device)
    depth = model(**inputs).predicted_depth
    depth = torch.nn.functional.interpolate(
        depth.unsqueeze(1),
        size=(orig_height, orig_width),
        mode="bicubic",
        align_corners=False,
    )
    # Index the batch/channel axes explicitly: squeeze() would also drop a
    # genuine height or width of 1.
    depth = depth[0, 0].detach().cpu().numpy()
    depth_min, depth_max = depth.min(), depth.max()
    depth_range = depth_max - depth_min
    if depth_range <= 0:
        return np.zeros_like(depth)
    return (depth - depth_min) / depth_range

def depth_to_disparity(depth, max_disp=32):
    """Map a normalized depth value (or array) linearly onto a disparity.

    The mapping is inverted: ``depth == 0`` gives ``max_disp`` and
    ``depth == 1`` gives ``0``.

    NOTE(review): MiDaS-style models are commonly described as predicting
    inverse depth (larger = nearer). If that holds for the model used here,
    this inversion gives the nearest pixels the smallest shift — confirm the
    intended convention.

    Args:
        depth: Scalar or array, expected to lie in ``[0, 1]``.
        max_disp: Disparity assigned to ``depth == 0``.

    Returns:
        ``(1.0 - depth) * max_disp`` with the same shape as ``depth``.
    """
    inverted = 1.0 - depth
    return inverted * max_disp

def generate_right_and_mask(image, disparity):
    """Warp ``image`` into a right-eye view and report the uncovered pixels.

    Every source pixel ``(y, x)`` is copied to ``(y, x - d)`` where ``d`` is
    its disparity rounded to the nearest integer. Targets that fall outside
    the row are dropped. When several source pixels land on the same target,
    the one with the largest ``x`` wins, exactly as in a left-to-right scan.

    Args:
        image: Array whose first two axes are ``(height, width)``.
        disparity: 2-D array of finite per-pixel shifts covering at least
            ``(height, width)``; only the top-left ``height x width`` region
            is used.

    Returns:
        ``(right, mask)``: ``right`` has the shape and dtype of ``image`` with
        unwritten pixels left at zero; ``mask`` is a ``uint8`` array of shape
        ``(height, width)`` holding 1 where no source pixel landed and 0
        elsewhere.

    Raises:
        ValueError: If ``disparity`` is not 2-D or is smaller than the image.
    """
    h, w = image.shape[:2]
    disparity = np.asarray(disparity)
    if disparity.ndim != 2 or disparity.shape[0] < h or disparity.shape[1] < w:
        raise ValueError(
            f"disparity of shape {disparity.shape} does not cover image of size ({h}, {w})"
        )
    disparity = disparity[:h, :w]

    right = np.zeros_like(image)
    mask = np.ones((h, w), dtype=np.uint8)

    # np.rint rounds half to even, matching round() on the individual values.
    shifts = np.rint(disparity).astype(np.int64)
    targets = np.arange(w, dtype=np.int64)[np.newaxis, :] - shifts
    in_bounds = (targets >= 0) & (targets < w)

    # Walk source columns left to right so later columns overwrite earlier
    # ones. Within one column every row is distinct, so each vectorized
    # assignment has no duplicate targets and its result is well defined.
    for x in range(w):
        rows = np.flatnonzero(in_bounds[:, x])
        if rows.size == 0:
            continue
        cols = targets[rows, x]
        right[rows, cols] = image[rows, x]
        mask[rows, cols] = 0
    return right, mask

# === LAMA INPAINTING ===
# Base URL for the inpainting request; run_lama_inpainting appends "/run/predict".
# NOTE(review): this looks like the Space's web-page URL rather than a
# dedicated API host — confirm the endpoint actually accepts this POST.
LAMA_API = "https://huggingface.co/spaces/saic-mdal/lama-inpainting"

def run_lama_inpainting(image_bgr, mask, timeout=60):
    """Send an image and its hole mask to the remote LaMa endpoint.

    The image and mask are written as PNGs to a scratch directory, uploaded
    as multipart files to ``{LAMA_API}/run/predict``, and the image named in
    the JSON reply (``data[0].name``) is then downloaded.

    Args:
        image_bgr: Image array in BGR channel order; converted to RGB for
            upload.
        mask: Array that is multiplied by 255 and uploaded as an RGB image
            (presumably 1 marks the pixels to fill — confirm against the
            service).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The downloaded result as a BGR ``numpy`` array.

    Raises:
        RuntimeError: If either HTTP request does not return status 200.
        requests.RequestException: On connection errors or timeouts.
    """
    img = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    mask_img = Image.fromarray(mask * 255).convert("RGB")

    # TemporaryDirectory removes the scratch files even when the upload fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        img_path = os.path.join(tmp_dir, "input.png")
        mask_path = os.path.join(tmp_dir, "mask.png")
        img.save(img_path)
        mask_img.save(mask_path)

        # Close both handles deterministically once the upload is done.
        with open(img_path, "rb") as img_file, open(mask_path, "rb") as mask_file:
            files = {"image": img_file, "mask": mask_file}
            response = requests.post(
                f"{LAMA_API}/run/predict", files=files, timeout=timeout
            )

    if response.status_code != 200:
        raise RuntimeError(
            f"LAMA inpainting failed (HTTP {response.status_code})"
        )

    result_url = response.json()["data"][0]["name"]
    with requests.get(result_url, stream=True, timeout=timeout) as result_response:
        if result_response.status_code != 200:
            raise RuntimeError(
                f"LAMA inpainting failed (HTTP {result_response.status_code})"
            )
        # Let urllib3 undo any transfer compression before PIL reads the bytes.
        result_response.raw.decode_content = True
        # convert() forces a full read before the connection closes and
        # guarantees the three channels that the RGB->BGR conversion expects.
        result = Image.open(result_response.raw).convert("RGB")
    return cv2.cvtColor(np.array(result), cv2.COLOR_RGB2BGR)

# === APP LOGIC ===
# Load the depth network and its preprocessor once at import time;
# stereo_pipeline reuses these module-level objects on every call.
depth_model, depth_processor = load_depth_model()

def stereo_pipeline(image_pil):
    """Turn one 2D image into a (left, right) stereo pair.

    Steps: estimate depth, convert it to disparity, warp the image into a
    right-eye view, then inpaint the pixels the warp left uncovered.

    Args:
        image_pil: Input PIL image, or ``None`` when nothing was supplied.

    Returns:
        ``(left, right)`` where ``left`` is the input image unchanged and
        ``right`` is the generated RGB PIL image. ``(None, None)`` is
        returned when ``image_pil`` is ``None``.
    """
    if image_pil is None:
        return None, None

    image = image_pil.convert("RGB")
    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    height, width = image_cv.shape[:2]

    depth = estimate_depth(image, depth_model, depth_processor)
    if depth.shape[:2] != (height, width):
        # The warp indexes disparity per image pixel, so both must share one
        # resolution. cv2.resize takes its size as (width, height).
        depth = cv2.resize(depth, (width, height), interpolation=cv2.INTER_CUBIC)
        # Bicubic resampling can overshoot; keep the map inside [0, 1].
        depth = np.clip(depth, 0.0, 1.0)

    disparity = depth_to_disparity(depth)
    right_img, mask = generate_right_and_mask(image_cv, disparity)
    right_filled = run_lama_inpainting(right_img, mask)

    left = image_pil
    right = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))
    return left, right

# === GRADIO UI ===
# One uploaded image in, two images (left eye, right eye) out.
demo = gr.Interface(
    fn=stereo_pipeline,
    inputs=gr.Image(type="pil", label="Upload 2D Image"),
    outputs=[
        gr.Image(label="Left Eye (Original)"),
        gr.Image(label="Right Eye (AI Generated)"),
    ],
    title="2D to 3D Stereo Generator with LaMa Inpainting",
    description=(
        "Generates a stereo pair from a 2D image using depth estimation "
        "and LaMa AI inpainting to handle occluded pixels in the "
        "right-eye view."
    ),
)

# Launched unconditionally: there is no __main__ guard, so this also runs on import.
demo.launch()