Spaces:

LTT
/

DiMeR

Running on Zero

File size: 12,046 Bytes

import os
import sys
import logging
import time

__workdir__ = '/'.join(os.path.abspath(__file__).split('/')[:-2])
sys.path.insert(0, __workdir__)

print(__workdir__)

import numpy as np
import torch
from torchvision.transforms import v2
from PIL import Image
import rembg
import torch.nn.functional as F

from models.DiMeR.online_render.render_single import load_mipmap
from models.DiMeR.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
from models.DiMeR.utils.render_utils import rotate_x, rotate_y, rotate_z
from models.DiMeR.utils.mesh_util import save_obj, save_obj_with_mtl
from models.DiMeR.utils.infer_util import remove_background, resize_foreground

from models.ISOMER.reconstruction_func import reconstruction
from models.ISOMER.projection_func import projection

from utils.tool import NormalTransfer, get_render_cameras_frames, get_background, get_render_cameras_video, render_frames, mask_fix

logging.basicConfig(
    level = logging.INFO
)
logger = logging.getLogger('kiss3d_wrapper')

OUT_DIR = './outputs'
TMP_DIR = './outputs/tmp'

os.makedirs(TMP_DIR, exist_ok=True)

@torch.no_grad()
def lrm_reconstruct(model, infer_config, images, 
                    name='', export_texmap=False,
                    input_camera_type='zero123',
                    render_3d_bundle_image=True,
                    render_azimuths=[270, 0, 90, 180], 
                    render_elevations=[5, 5, 5, 5], 
                    render_radius=4.15):
    """
    image: Tensor, shape (1, c, h, w)
    """
    
    mesh_path_idx = os.path.join(TMP_DIR, f'{name}_recon_from_{input_camera_type}.obj')

    device = images.device
    if input_camera_type == 'zero123':
        input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
    elif input_camera_type == 'kiss3d':
        input_cameras = get_flux_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
    else:
        raise NotImplementedError(f'Unexpected input camera type: {input_camera_type}')
    
    images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)

    logger.info(f"==> Runing LRM reconstruction ...")
    planes = model.forward_planes(images, input_cameras)
    mesh_out = model.extract_mesh(
            planes,
            use_texture_map=export_texmap,
            **infer_config,
        )
    if export_texmap:
        vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
        save_obj_with_mtl(
            vertices.data.cpu().numpy(),
            uvs.data.cpu().numpy(),
            faces.data.cpu().numpy(),
            mesh_tex_idx.data.cpu().numpy(),
            tex_map.permute(1, 2, 0).data.cpu().numpy(),
            mesh_path_idx,
        )
    else:
        vertices, faces, vertex_colors = mesh_out
        save_obj(vertices, faces, vertex_colors, mesh_path_idx)
    logger.info(f"Mesh saved to {mesh_path_idx}")

    if render_3d_bundle_image:
        assert render_azimuths is not None and render_elevations is not None and render_radius is not None
        render_azimuths = torch.Tensor(render_azimuths).to(device)
        render_elevations = torch.Tensor(render_elevations).to(device)
        
        render_size = infer_config.render_resolution
        ENV = load_mipmap("models/DiMeR/env_mipmap/6")
        materials = (0.0,0.9)
        all_mv, all_mvp, all_campos, identity_mv = get_render_cameras_frames(
            batch_size=1, 
            radius=render_radius, 
            azimuths=render_azimuths, 
            elevations=render_elevations,
            fov=30
        )
        frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
            model, 
            planes, 
            render_cameras=all_mvp,
            camera_pos=all_campos,
            env=ENV,
            materials=materials,
            render_size=render_size, 
            render_mv = all_mv,
            local_normal=True,
            identity_mv=identity_mv,
        )
    else:
        normals = None
        frames = None
        albedos = None


    vertices = torch.from_numpy(vertices).to(device)
    faces = torch.from_numpy(faces).to(device)
    vertices = vertices @ rotate_x(np.pi / 2, device=device)[:3, :3]
    vertices = vertices @ rotate_y(np.pi / 2, device=device)[:3, :3]

    return vertices.cpu(), faces.cpu(), normals, frames, albedos

normal_transfer = NormalTransfer()

def local_normal_global_transform(local_normal_images,azimuths_deg,elevations_deg):
    if local_normal_images.min() >= 0:
        local_normal = local_normal_images.float() * 2 - 1
    else:
        local_normal = local_normal_images.float()
    global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
    global_normal[...,0] *= -1
    global_normal = (global_normal + 1) / 2
    global_normal = global_normal.permute(0, 3, 1, 2)
    return global_normal


def isomer_reconstruct(
        rgb_multi_view,
        normal_multi_view,
        multi_view_mask,
        vertices,
        faces,
        save_path=None,
        azimuths=[0, 90, 180, 270],
        elevations=[5, 5, 5, 5],
        geo_weights=[1, 0.9, 1, 0.9],
        color_weights=[1, 0.5, 1, 0.5],
        reconstruction_stage1_steps=10,
        reconstruction_stage2_steps=50,
        radius=4.5):

    device = rgb_multi_view.device
    to_tensor_ = lambda x: torch.Tensor(x).float().to(device)

    # local normal to global normal
    global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1).cpu(), to_tensor_(azimuths), to_tensor_(elevations)).to(device)
    global_normal = global_normal * multi_view_mask + (1-multi_view_mask)

    global_normal = global_normal.permute(0,2,3,1)
    multi_view_mask = multi_view_mask.squeeze(1)
    rgb_multi_view = rgb_multi_view.permute(0,2,3,1)

    logger.info(f"==> Runing ISOMER reconstruction ...")
    meshes = reconstruction(
        normal_pils=global_normal, 
        masks=multi_view_mask, 
        weights=to_tensor_(geo_weights), 
        fov=30, 
        radius=radius, 
        camera_angles_azi=to_tensor_(azimuths), 
        camera_angles_ele=to_tensor_(elevations), 
        expansion_weight_stage1=0.1,
        init_type="file",
        init_verts=vertices,
        init_faces=faces,
        stage1_steps=reconstruction_stage1_steps,
        stage2_steps=reconstruction_stage2_steps,
        start_edge_len_stage1=0.1,
        end_edge_len_stage1=0.02,
        start_edge_len_stage2=0.02,
        end_edge_len_stage2=0.005,
    )

    multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-10, blur=5)


    logger.info(f"==> Runing ISOMER projection ...")
    save_glb_addr = projection(
        meshes,
        masks=multi_view_mask_proj.to(device),
        images=rgb_multi_view.to(device),
        azimuths=to_tensor_(azimuths), 
        elevations=to_tensor_(elevations), 
        weights=to_tensor_(color_weights),
        fov=30,
        radius=radius,
        save_dir=TMP_DIR,
        save_glb_addr=save_path
    )

    logger.info(f"==> Save mesh to {save_glb_addr} ...")
    return save_glb_addr


def to_rgb_image(maybe_rgba):
    assert isinstance(maybe_rgba, Image.Image)
    if maybe_rgba.mode == 'RGB':
        return maybe_rgba, None
    elif maybe_rgba.mode == 'RGBA':
        rgba = maybe_rgba
        img = np.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=np.uint8)
        img = Image.fromarray(img, 'RGB')
        img.paste(rgba, mask=rgba.getchannel('A'))
        return img, rgba.getchannel('A')
    else:
        raise ValueError("Unsupported image type.", maybe_rgba.mode)
    
rembg_session = rembg.new_session("u2net")
def preprocess_input_image(input_image):
    """
    input_image: PIL.Image
    output_image: PIL.Image, (3, 512, 512), mode = RGB, background = white
    """
    image = remove_background(to_rgb_image(input_image)[0], rembg_session, bgcolor=(255, 255, 255, 255))
    image = resize_foreground(image, ratio=0.85, pad_value=255)
    return to_rgb_image(image)[0]



@torch.no_grad()
def DiMeR_reconstruct(model, infer_config, texture_model, texture_model_config, images, normals, multi_view_mask,
                    name='', export_texmap=False,
                    input_camera_type='zero123',
                    render_3d_bundle_image=True,
                    render_azimuths=[270, 0, 90, 180], 
                    render_elevations=[5, 5, 5, 5], 
                    render_radius=4.15,
                    camera_radius=3.5):
    """
    images: Tensor, shape (4, c, h, w)
    normals: Tensor, shape (4, c, h, w)
    """
    
    mesh_path_idx = os.path.join(TMP_DIR, f'{name}_recon_from_{input_camera_type}.obj')

    device = normals.device
    if input_camera_type == 'zero123':
        input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=camera_radius, fov=30).to(device)
    elif input_camera_type == 'kiss3d':
        input_cameras = get_flux_input_cameras(batch_size=1, radius=camera_radius, fov=30).to(device)
    else:
        raise NotImplementedError(f'Unexpected input camera type: {input_camera_type}')
    
    # # use rembg to get foreground mask
    # fg_mask = []
    # for i in range(4):
    #     image = images[i].permute(1, 2, 0).cpu().numpy()
    #     image = (image * 255).astype(np.uint8)
    #     image = rembg.remove(image, session=rembg_session)
    #     image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.
    #     image = image[3:4]
    #     fg_mask.append(image)
    # fg_mask = torch.stack(fg_mask)
    # bg_mask = 1 - fg_mask
    
    # TODO: Device Check
    global_normals = normal_transfer.trans_local_2_global(normals.cpu().permute(0,2,3,1), torch.tensor([0, 90, 180, 270]),
                                                         torch.tensor([5, 5, 5, 5]), radius=4.5,
                                                         for_lotus=True).to(device)
    global_normals = global_normals.permute(0, 3, 1, 2)
    global_normals = global_normals * multi_view_mask.to(device) + (1 - multi_view_mask.to(device))
    # global_normals = global_normals * fg_mask + bg_mask
    global_normals = F.pad(global_normals, (50, 50, 50, 50), value=1.)
    global_normals = F.interpolate(global_normals, (512, 512), mode='bilinear', align_corners=False)
    global_normals = global_normals.unsqueeze(0).clamp(0.0, 1.0).to(device)

    print(f"{time.time()} ==> local normal to global normal done")

    # images = images.cpu() * fg_mask + bg_mask
    images = F.pad(images, (50, 50, 50, 50), value=1.)
    images = F.interpolate(images, (512, 512), mode='bilinear', align_corners=False)
    images = images.unsqueeze(0).clamp(0.0, 1.0).to(device)

    print(f"{time.time()} ==> Runing DiMeR geometry reconstruction ...")
    planes = model.forward_planes(global_normals, input_cameras)
    vertices, faces, _  = model.extract_mesh(
            planes,
            use_texture_map=export_texmap,
            **infer_config,
        )
    
    print(f"{time.time()} ==> Runing DiMeR texture reconstruction ...")
    # extract_mesh函数进行了旋转，进行还原，对齐训练时的方向
    vertices = torch.tensor(vertices, device=device)
    faces = torch.tensor(faces, device=device)
    texture_planes = texture_model.forward_planes(images, input_cameras)
    vertex_colors, _, _ = texture_model.synthesizer.get_texture_prediction(
            texture_planes, vertices.unsqueeze(0))
    vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
    vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]

    vertices = vertices.cpu().numpy()
    faces = faces.cpu().numpy()
    vertex_colors = vertex_colors.clamp(0, 1).squeeze(0).cpu().numpy()
    vertex_colors = vertex_colors * 255.0
    vertex_colors = vertex_colors.astype(np.uint8)
    save_obj(vertices, faces, vertex_colors, mesh_path_idx)
    logger.info(f"Mesh saved to {mesh_path_idx}")
    return mesh_path_idx