File size: 3,258 Bytes
3e9df0c
6ec98d8
 
 
 
 
 
3e9df0c
6ec98d8
 
c55986d
6ec98d8
 
 
 
 
 
c55986d
6ec98d8
b9c32e6
6ec98d8
 
 
 
 
 
 
 
 
 
 
 
b9c32e6
6ec98d8
 
b9c32e6
 
 
 
6ec98d8
 
 
 
 
 
b9c32e6
6ec98d8
 
 
 
b9c32e6
6ec98d8
 
b9c32e6
 
 
 
6ec98d8
 
 
b9c32e6
 
6ec98d8
 
 
b9c32e6
6ec98d8
 
b9c32e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe3c2bc
858ea72
6ec98d8
 
f52a14b
5847e9e
 
c62d4d9
b9c32e6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
import torch
import numpy as np
from PIL import Image
import open3d as o3d
from pathlib import Path

# Load the DPT-Large depth-estimation model and its preprocessor once at
# import time so every request reuses the same weights (downloads on first run).
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

def process_image(image_path):
    """Predict a depth map for the image at *image_path* and build a 3D mesh.

    Returns a 3-element list matching the Gradio outputs:
    [depth map as a PIL image, glTF path for the 3D viewer, glTF path for download].
    """
    image_path = Path(image_path)
    # Force 3-channel RGB: palette/RGBA/grayscale uploads would otherwise break
    # the DPT feature extractor and the RGB/depth pairing in create_3d_obj.
    image_raw = Image.open(image_path).convert("RGB")
    # Downscale to 800 px wide, preserving aspect ratio, to bound inference cost.
    image = image_raw.resize(
        (800, int(800 * image_raw.size[1] / image_raw.size[0])),
        Image.Resampling.LANCZOS)

    encoding = feature_extractor(image, return_tensors="pt")

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**encoding)
        predicted_depth = outputs.predicted_depth

    # Upsample the model's low-resolution depth back to the resized image.
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=image.size[::-1],  # PIL size is (w, h); interpolate wants (h, w)
        mode="bicubic",
        align_corners=False,
    ).squeeze()
    output = prediction.cpu().numpy()
    # Normalize to 8-bit; guard the degenerate all-zero map against a
    # division by zero (would otherwise yield NaNs before the uint8 cast).
    depth_max = np.max(output)
    if depth_max <= 0:
        depth_max = 1.0
    depth_image = (output * 255 / depth_max).astype('uint8')

    try:
        gltf_path = create_3d_obj(np.array(image), depth_image, image_path)
    except Exception:
        # Poisson reconstruction at the default octree depth can fail on
        # sparse point clouds; retry once at a coarser depth before giving up.
        gltf_path = create_3d_obj(np.array(image), depth_image, image_path, depth=8)

    return [Image.fromarray(depth_image), gltf_path, gltf_path]

def create_3d_obj(rgb_image, depth_image, image_path, depth=10):
    """Reconstruct a glTF mesh from an RGB array and its 8-bit depth map.

    *depth* is the Poisson octree depth (smaller = coarser but more robust).
    Writes `<stem>.gltf` into the working directory and returns that path.
    """
    h, w = depth_image.shape[0], depth_image.shape[1]

    # Pair color and depth into an RGBD image, keeping color as RGB.
    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
        o3d.geometry.Image(rgb_image),
        o3d.geometry.Image(depth_image),
        convert_rgb_to_intensity=False)

    # Simple pinhole camera: fixed 500px focal length, principal point centered.
    intrinsic = o3d.camera.PinholeCameraIntrinsic()
    intrinsic.set_intrinsics(w, h, 500, 500, w/2, h/2)

    # Back-project to a point cloud and orient normals toward the camera so
    # Poisson reconstruction sees a consistent surface direction.
    cloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic)
    cloud.estimate_normals(
        search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.01, max_nn=30))
    cloud.orient_normals_towards_camera_location(
        camera_location=np.array([0., 0., 1000.]))

    # Poisson surface reconstruction, with debug verbosity for the logs.
    with o3d.utility.VerbosityContextManager(o3d.utility.VerbosityLevel.Debug):
        poisson_mesh, _ = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
            cloud, depth=depth, width=0, scale=1.1, linear_fit=True)

    # Decimate via vertex clustering (grid of 1/256 of the mesh extent),
    # then crop away Poisson's overshoot beyond the original point cloud.
    cluster_size = max(poisson_mesh.get_max_bound() - poisson_mesh.get_min_bound()) / 256
    simplified = poisson_mesh.simplify_vertex_clustering(voxel_size=cluster_size)
    cropped = simplified.crop(cloud.get_axis_aligned_bounding_box())

    gltf_path = f'./{image_path.stem}.gltf'
    o3d.io.write_triangle_mesh(gltf_path, cropped, write_triangle_uvs=True)
    return gltf_path

title = "Zero-shot Depth Estimation with DPT + 3D Point Cloud"
description = "DPT model predicts depth from an image, followed by 3D Point Cloud reconstruction."

# Gradio wiring: one filepath image in; three outputs matching the list
# returned by process_image (depth preview, interactive 3D viewer, raw glTF).
iface = gr.Interface(
    fn=process_image,
    inputs=[gr.Image(type="filepath", label="Input Image")],
    outputs=[
        gr.Image(label="Predicted Depth", type="pil"),
        gr.Model3D(label="3D Mesh Reconstruction", clear_color=[1.0, 1.0, 1.0, 1.0]),
        gr.File(label="3D gLTF")
    ],
    title=title,
    description=description,
    allow_flagging="never"
)

# Launch the web UI only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()