Spaces:

hololens
/

stable-diffusion-webui-depthmap-script

Runtime error

File size: 34,243 Bytes

e04dce3

from pathlib import Path
from PIL import Image

try:
    from tqdm import trange
except:
    from builtins import range as trange

import torch, gc
import cv2
import os.path
import numpy as np
import copy
import platform
import math

# Our code
from src.misc import *
from src.common_constants import GenerationOptions as go
from src.common_constants import *
from src.stereoimage_generation import create_stereoimages
from src.normalmap_generation import create_normalmap
from src.depthmap_generation import ModelHolder
from src import backbone

try:
    # 3d-photo-inpainting imports
    from inpaint.mesh import write_mesh, read_mesh, output_3d_photo
    from inpaint.networks import Inpaint_Color_Net, Inpaint_Depth_Net, Inpaint_Edge_Net
    from inpaint.utils import path_planning
    from inpaint.bilateral_filtering import sparse_bilateral_filtering
except Exception as e:
    print('Impaint import failed. Impaint will not work.')
    import traceback
    traceback.print_exc()

# Module-level cache of the most recently loaded mesh and the file it came from.
# run_3dphoto_videos() consults it so that rendering the same mesh again skips the reload.
video_mesh_data = video_mesh_fn = None

# Single shared holder through which this module loads, offloads and unloads models.
model_holder = ModelHolder()


def convert_to_i16(arr):
    """Quantize an array of values in [0; 1] into a single-channel uint16 array.

    Some precision is lost in the process. Inputs outside of [0; 1] are clipped
    to the darkest/brightest representable value.
    """
    levels = 1 << 16  # count of distinct uint16 values
    # astype() rounds down, so the scaled values have to stay inside [0; 2**16).
    # The small positive nudge guards against values landing just below an integer,
    # and the upper bound is kept slightly under 2**16 so the cast cannot overflow.
    scaled = arr * levels + 0.0001
    bounded = np.clip(scaled, 0, levels - 0.1)
    return bounded.astype("uint16")

def convert_i16_to_rgb(image, like):
    """Spread a 16-bit single-channel image over the first three channels of a new array.

    The result has the shape and dtype of `like` (three channel, 8 bits per channel
    in the intended use); every 16-bit value is divided by 256 to fit into 8 bits.
    """
    rgb = np.zeros_like(like)
    scaled = image / 256.0
    for channel in range(3):
        rgb[:, :, channel] = scaled
    return rgb


class CoreGenerationFunnelInp:
    """This class takes a dictionary and creates a core_generation_funnel inp.

    Keys may be GenerationOptions members or plain strings; they are matched by
    lower-cased name. Every option that is absent from the input receives its default
    (`setting.df`). Another CoreGenerationFunnelInp may be passed as well, in which case
    its values are copied.

    Values can be read as `inp[GenerationOptions.X]`, `inp['x']` or `inp.x`.

    Non-applicable parameters are silently discarded (no error)"""
    def __init__(self, values):
        if isinstance(values, CoreGenerationFunnelInp):
            values = values.values
        values = {(k.name if isinstance(k, GenerationOptions) else k).lower(): v for k, v in values.items()}

        self.values = {}
        for setting in GenerationOptions:
            name = setting.name.lower()
            self.values[name] = values[name] if name in values else setting.df

    def __getitem__(self, item):
        """Return the value of an option given as a GenerationOptions member or a lower-case name.

        Raises KeyError for unknown names."""
        if isinstance(item, GenerationOptions):
            return self.values[item.name.lower()]
        return self.values[item]

    def __getattr__(self, item):
        """Expose options as attributes (only called when normal attribute lookup fails).

        Raises AttributeError for unknown names, as the attribute protocol requires, so that
        hasattr(), getattr() with a default and the copy/pickle machinery keep working."""
        # Go through __dict__ directly: touching self.values here would re-enter
        # __getattr__ forever on an instance whose __init__ has not run (yet).
        values = self.__dict__.get('values')
        if values is None or item not in values:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {item!r}")
        return values[item]


def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp, ops=None):
    """Generator that produces depthmaps and all derived outputs for a batch of images.

    Args:
        outpath: directory in which mesh files are created.
        inputimages: list of PIL images. Entries may be replaced in place (background removal
            before depth estimation, conversion of mode 'I' images to RGB).
        inputdepthmaps: optional list, parallel to `inputimages`, of custom depthmaps
            (PIL image or array-like, near=bright). Entries that are None are predicted by the model.
        inputnames: optional list of input file names, used to name inpainted meshes.
        inp: dictionary of generation options or a CoreGenerationFunnelInp.
        ops: settings passed to the model holder; gathered from the backbone if None.

    Yields:
        Tuples `(index, kind, payload)`, where `kind` is one of 'depth_prediction',
        'background_removed', 'foreground_mask', 'concat_depth', 'depth', a stereo mode name,
        'normalmap', 'heatmap', 'simple_mesh' or 'inpainted_mesh'. For the mesh kinds the
        payload is a file path.

    Raises:
        Exception: any failure during generation is re-raised. Out-of-memory failures are
            re-raised as an Exception carrying a list of suggestions (chained to the original).
    """
    if len(inputimages) == 0 or inputimages[0] is None:
        return
    if inputdepthmaps is None or len(inputdepthmaps) == 0:
        inputdepthmaps = [None] * len(inputimages)
    inputdepthmaps_complete = all(x is not None for x in inputdepthmaps)

    inp = CoreGenerationFunnelInp(inp)

    if ops is None:
        ops = backbone.gather_ops()
    model_holder.update_settings(**ops)

    # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure
    print(SCRIPT_FULL_NAME)
    print(f'Backbone: {backbone.USED_BACKBONE.name}')

    backbone.unload_sd_model()

    # TODO: this still should not be here
    background_removed_images = []
    # remove on base image before depth calculation
    if inp[go.GEN_REMBG]:
        if inp[go.PRE_DEPTH_BACKGROUND_REMOVAL]:
            inputimages = batched_background_removal(inputimages, inp[go.REMBG_MODEL])
            background_removed_images = inputimages
        else:
            background_removed_images = batched_background_removal(inputimages, inp[go.REMBG_MODEL])

    # init torch device
    if inp[go.COMPUTE_DEVICE] == 'GPU':
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            print('WARNING: Cuda device was not found, cpu will be used')
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")
    print("device: %s" % device)

    # TODO: This should not be here
    inpaint_imgs = []
    inpaint_depths = []

    try:
        if not inputdepthmaps_complete:
            print("Loading model(s) ..")
            model_holder.ensure_models(inp[go.MODEL_TYPE], device, inp[go.BOOST], inp[go.TILING_MODE])
        print("Computing output(s) ..")
        # iterate over input images
        for count in trange(0, len(inputimages)):
            # Convert single channel input (PIL) images to rgb
            if inputimages[count].mode == 'I':
                # point() returns a new image, so its result has to be kept: the scaling brings
                # 16-bit values into the 8-bit range before convert('RGB') clamps them.
                inputimages[count] = inputimages[count].point(lambda p: p * 0.0039063096, mode='RGB')
                inputimages[count] = inputimages[count].convert('RGB')

            # Raw prediction, as returned by a model. None if input depthmap is used.
            raw_prediction = None
            # True if near=dark on raw_prediction
            raw_prediction_invert = False
            out = None

            if inputdepthmaps is not None and inputdepthmaps[count] is not None:
                # use custom depthmap
                dp = inputdepthmaps[count]
                if isinstance(dp, Image.Image):
                    if dp.width != inputimages[count].width or dp.height != inputimages[count].height:
                        try:  # LANCZOS may fail on some formats
                            dp = dp.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS)
                        except:
                            dp = dp.resize((inputimages[count].width, inputimages[count].height))
                    # Trying desperately to rescale image to [0;1) without actually normalizing it
                    # Normalizing is avoided, because we want to preserve the scale of the original depthmaps
                    # (batch mode, video mode).
                    if len(dp.getbands()) == 1:
                        out = np.asarray(dp, dtype="float")
                        out_max = out.max()
                        if out_max < 256:
                            bit_depth = 8
                        elif out_max < 65536:
                            bit_depth = 16
                        else:
                            bit_depth = 32
                        out /= 2.0 ** bit_depth
                    else:
                        out = np.asarray(dp, dtype="float")[:, :, 0]
                        out /= 256.0
                else:
                    # Should be in interval [0; 1], values outside of this range will be clipped.
                    out = np.asarray(dp, dtype="float")
                    assert inputimages[count].height == out.shape[0], "Custom depthmap height mismatch"
                    assert inputimages[count].width == out.shape[1], "Custom depthmap width mismatch"
            else:
                # override net size (size may be different for different images)
                if inp[go.NET_SIZE_MATCH]:
                    # Round up to a multiple of 32 to avoid potential issues
                    # TODO: buggs for Depth Anything
                    net_width = (inputimages[count].width + 31) // 32 * 32
                    net_height = (inputimages[count].height + 31) // 32 * 32
                else:
                    net_width = inp[go.NET_WIDTH]
                    net_height = inp[go.NET_HEIGHT]
                raw_prediction, raw_prediction_invert = \
                    model_holder.get_raw_prediction(inputimages[count], net_width, net_height)

                # output
                if abs(raw_prediction.max() - raw_prediction.min()) > np.finfo("float").eps:
                    out = np.copy(raw_prediction)
                    # TODO: some models may output negative values, maybe these should be clamped to zero.
                    if raw_prediction_invert:
                        out *= -1
                    if inp[go.DO_OUTPUT_DEPTH_PREDICTION]:
                        yield count, 'depth_prediction', np.copy(out)
                    if inp[go.CLIPDEPTH]:
                        if inp[go.CLIPDEPTH_MODE] == 'Range':
                            out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
                            out = np.clip(out, inp[go.CLIPDEPTH_FAR], inp[go.CLIPDEPTH_NEAR])
                        elif inp[go.CLIPDEPTH_MODE] == 'Outliers':
                            fb, nb = np.percentile(out, [inp[go.CLIPDEPTH_FAR] * 100.0, inp[go.CLIPDEPTH_NEAR] * 100.0])
                            out = np.clip(out, fb, nb)
                    out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
                else:
                    # Regretfully, the depthmap is broken and will be replaced with a black image
                    out = np.zeros(raw_prediction.shape)

            # Maybe we should not use img_output for everything, since we get better accuracy from
            # the raw_prediction. However, it is not always supported. We maybe would like to achieve
            # reproducibility, so depthmap of the image should be the same as generating the depthmap one more time.
            # Depthmap (near=bright), as uint16
            img_output = convert_to_i16(out)

            # if 3dinpainting, store maps for processing in second pass
            if inp[go.GEN_INPAINTED_MESH]:
                inpaint_imgs.append(inputimages[count])
                inpaint_depths.append(img_output)

            # applying background masks after depth
            if inp[go.GEN_REMBG]:
                print('applying background masks')
                background_removed_image = background_removed_images[count]
                # maybe a threshold cut would be better on the line below.
                background_removed_array = np.array(background_removed_image)
                bg_mask = (background_removed_array[:, :, 0] == 0) & (background_removed_array[:, :, 1] == 0) & (
                        background_removed_array[:, :, 2] == 0) & (background_removed_array[:, :, 3] <= 0.2)
                img_output[bg_mask] = 0  # far value

                yield count, 'background_removed', background_removed_image

                if inp[go.SAVE_BACKGROUND_REMOVAL_MASKS]:
                    bg_array = (1 - bg_mask.astype('int8')) * 255
                    mask_array = np.stack((bg_array, bg_array, bg_array, bg_array), axis=2)
                    mask_image = Image.fromarray(mask_array.astype(np.uint8))

                    yield count, 'foreground_mask', mask_image

            # A weird quirk: if user tries to save depthmap, whereas custom depthmap is used,
            # custom depthmap will be outputed
            if inp[go.DO_OUTPUT_DEPTH]:
                img_depth = cv2.bitwise_not(img_output) if inp[go.OUTPUT_DEPTH_INVERT] else img_output
                if inp[go.OUTPUT_DEPTH_COMBINE]:
                    axis = 1 if inp[go.OUTPUT_DEPTH_COMBINE_AXIS] == 'Horizontal' else 0
                    img_concat = Image.fromarray(np.concatenate(
                        (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])),
                        axis=axis))
                    yield count, 'concat_depth', img_concat
                else:
                    yield count, 'depth', Image.fromarray(img_depth)

            if inp[go.GEN_STEREO]:
                # print("Generating stereoscopic image(s)..")
                stereoimages = create_stereoimages(
                    inputimages[count], img_output,
                    inp[go.STEREO_DIVERGENCE], inp[go.STEREO_SEPARATION],
                    inp[go.STEREO_MODES],
                    inp[go.STEREO_BALANCE], inp[go.STEREO_OFFSET_EXPONENT], inp[go.STEREO_FILL_ALGO])
                for c in range(0, len(stereoimages)):
                    yield count, inp[go.STEREO_MODES][c], stereoimages[c]

            if inp[go.GEN_NORMALMAP]:
                normalmap = create_normalmap(
                    img_output,
                    inp[go.NORMALMAP_PRE_BLUR_KERNEL] if inp[go.NORMALMAP_PRE_BLUR] else None,
                    inp[go.NORMALMAP_SOBEL_KERNEL] if inp[go.NORMALMAP_SOBEL] else None,
                    inp[go.NORMALMAP_POST_BLUR_KERNEL] if inp[go.NORMALMAP_POST_BLUR] else None,
                    inp[go.NORMALMAP_INVERT]
                )
                yield count, 'normalmap', normalmap

            if inp[go.GEN_HEATMAP]:
                from dzoedepth.utils.misc import colorize
                heatmap = Image.fromarray(colorize(img_output, cmap='inferno'))
                yield count, 'heatmap', heatmap

            # gen mesh
            if inp[go.GEN_SIMPLE_MESH]:
                print(f"\nGenerating (occluded) mesh ..")
                basename = 'depthmap'
                meshsimple_fi = get_uniquefn(outpath, basename, 'obj', 'simple')

                depthi = raw_prediction if raw_prediction is not None else out
                depthi_min, depthi_max = depthi.min(), depthi.max()
                # try to map output to sensible values for non zoedepth models, boost, or custom maps
                if inp[go.MODEL_TYPE] not in [7, 8, 9] or inp[go.BOOST] or inputdepthmaps[count] is not None:
                    # invert if midas
                    if inp[go.MODEL_TYPE] > 0 or inputdepthmaps[count] is not None:  # TODO: Weird
                        depthi = depthi_max - depthi + depthi_min
                    # make positive
                    if depthi_min < 0:
                        depthi = depthi - depthi_min
                    # scale down
                    # NOTE(review): depthi_min/depthi_max were taken before the shift above, so for
                    # inputs with negative values the result is not exactly [0; 4] - confirm intent.
                    if depthi.max() > 10.0:
                        depthi = 4.0 * (depthi - depthi_min) / (depthi_max - depthi_min)
                    # offset
                    depthi = depthi + 1.0

                mesh = create_mesh(inputimages[count], depthi, keep_edges=not inp[go.SIMPLE_MESH_OCCLUDE],
                                   spherical=(inp[go.SIMPLE_MESH_SPHERICAL]))
                mesh.export(meshsimple_fi)
                yield count, 'simple_mesh', meshsimple_fi

        print("Computing output(s) done.")
    except Exception as e:
        if 'out of memory' in str(e).lower():
            print(str(e))
            suggestion = "out of GPU memory, could not generate depthmap! " \
                         "Here are some suggestions to work around this issue:\n"
            if inp[go.BOOST]:
                suggestion += " * Disable BOOST (generation will be faster, but the depthmap will be less detailed)\n"
            if backbone.USED_BACKBONE != backbone.BackboneType.STANDALONE:
                suggestion += " * Run DepthMap in the standalone mode - without launching the SD WebUI\n"
            if device != torch.device("cpu"):
                suggestion += " * Select CPU as the processing device (this will be slower)\n"
            if inp[go.MODEL_TYPE] != 6:
                suggestion +=\
                    " * Use a different model (generally, more memory-consuming models produce better depthmaps)\n"
            if not inp[go.BOOST]:
                suggestion += " * Reduce net size (this could reduce quality)\n"
            print('Fail.\n')
            # Chain the original error so its traceback is not lost
            raise Exception(suggestion) from e
        else:
            print('Fail.\n')
            raise
    finally:
        if backbone.get_opt('depthmap_script_keepmodels', True):
            model_holder.offload()  # Swap to CPU memory
        else:
            model_holder.unload_models()
        gc.collect()
        backbone.torch_gc()

    # TODO: This should not be here
    if inp[go.GEN_INPAINTED_MESH]:
        try:
            mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath,
                                  inp[go.GEN_INPAINTED_MESH_DEMOS],
                                  1, "mp4")
            yield 0, 'inpainted_mesh', mesh_fi
        except Exception as e:
            print(f'{str(e)}, some issue with generating inpainted mesh')

    backbone.reload_sd_model()
    print("All done.\n")


def get_uniquefn(outpath, basename, ext, suffix=''):
    """Return a path inside `outpath` of the form `<basename>-<NNNN>[-<suffix>].<ext>` that does not exist yet.

    Numbering starts one below the next sequence number reported by the backbone and
    up to 500 consecutive numbers are tried. The file itself is not created.
    """
    basecount = backbone.get_next_sequence_number(outpath, basename)
    if basecount > 0:
        basecount -= 1
    if suffix != '':
        suffix = f'-{suffix}'  # Dash is important for selecting unique filenames (see get_next_sequence_number)
    for i in range(500):
        fullfn = os.path.join(outpath, f"{basename}-{basecount + i:04}{suffix}.{ext}")
        if not os.path.exists(fullfn):
            return fullfn
    # Failback, should never be executed. Kept inside outpath like every other result.
    return os.path.join(outpath, f"{basename}-99999{suffix}.{ext}")


def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, gen_inpainted_mesh_demos, vid_ssaa, vid_format):
    """Generate an inpainted 3D mesh (.obj) for every image/depthmap pair.

    Args:
        device: torch device on which the inpainting networks run.
        img_rgb: list of PIL images.
        img_depth: list of depthmaps (arrays), parallel to `img_rgb`.
        inputnames: optional list of input file names; the stem is used as the mesh base name,
            otherwise 'depthmap' is used.
        outpath: output directory for meshes and demo videos.
        gen_inpainted_mesh_demos: if true, demo videos are rendered for each generated mesh.
        vid_ssaa: supersampling setting forwarded to the video renderer.
        vid_format: video file extension forwarded to the video renderer.

    Returns:
        Path of the last mesh that was processed ('' when `img_rgb` is empty).
    """
    mesh_fi = ''
    # Bound up front, so that the cleanup in `finally` is valid even when a failure
    # (e.g. a model download error) happens before a network has been created.
    rgb_model = None
    depth_edge_model = None
    depth_feat_model = None
    try:
        print("Running 3D Photo Inpainting .. ")
        edgemodel_path = './models/3dphoto/edge_model.pth'
        depthmodel_path = './models/3dphoto/depth_model.pth'
        colormodel_path = './models/3dphoto/color_model.pth'
        # create paths to model if not present
        os.makedirs('./models/3dphoto/', exist_ok=True)

        ensure_file_downloaded(
            edgemodel_path,
            ["https://huggingface.co/spaces/Epoching/3D_Photo_Inpainting/resolve/e389e564fd2a55cf/checkpoints/edge-model.pth",
             "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth"],
            "b1d768bd008ad5fe9f540004f870b8c3d355e4939b2009aa4db493fd313217c9")
        ensure_file_downloaded(
            depthmodel_path,
            ["https://huggingface.co/spaces/Epoching/3D_Photo_Inpainting/resolve/e389e564fd2a55cf/checkpoints/depth-model.pth",
             "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth"],
            "2d0e63e89a22762ddfa8bc8c9f8c992e5532b140123274ffc6e4171baa1b76f8")
        ensure_file_downloaded(
            colormodel_path,
            ["https://huggingface.co/spaces/Epoching/3D_Photo_Inpainting/resolve/e389e564fd2a55cf/checkpoints/color-model.pth",
             "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth"],
            "383c9b1db70097907a6f9c8abb0303e7056f50d5456a36f34ab784592b8b2c20"
        )

        print("Loading edge model ..")
        depth_edge_model = Inpaint_Edge_Net(init_weights=True)
        depth_edge_weight = torch.load(edgemodel_path, map_location=torch.device(device))
        depth_edge_model.load_state_dict(depth_edge_weight)
        depth_edge_model = depth_edge_model.to(device)
        depth_edge_model.eval()
        print("Loading depth model ..")
        depth_feat_model = Inpaint_Depth_Net()
        depth_feat_weight = torch.load(depthmodel_path, map_location=torch.device(device))
        depth_feat_model.load_state_dict(depth_feat_weight, strict=True)
        depth_feat_model = depth_feat_model.to(device)
        depth_feat_model.eval()
        print("Loading rgb model ..")
        rgb_model = Inpaint_Color_Net()
        rgb_feat_weight = torch.load(colormodel_path, map_location=torch.device(device))
        rgb_model.load_state_dict(rgb_feat_weight)
        rgb_model.eval()
        rgb_model = rgb_model.to(device)

        config = {}
        config["gpu_ids"] = 0
        config['extrapolation_thickness'] = 60
        config['extrapolate_border'] = True
        config['depth_threshold'] = 0.04
        config['redundant_number'] = 12
        config['ext_edge_threshold'] = 0.002
        config['background_thickness'] = 70
        config['context_thickness'] = 140
        config['background_thickness_2'] = 70
        config['context_thickness_2'] = 70
        config['log_depth'] = True
        config['depth_edge_dilate'] = 10
        config['depth_edge_dilate_2'] = 5
        config['largest_size'] = 512
        config['repeat_inpaint_edge'] = True
        config['ply_fmt'] = "bin"

        config['save_ply'] = backbone.get_opt('depthmap_script_save_ply', False)
        config['save_obj'] = True

        if device == torch.device("cpu"):
            config["gpu_ids"] = -1

        for count in trange(0, len(img_rgb)):
            basename = 'depthmap'
            if inputnames is not None:
                if inputnames[count] is not None:
                    p = Path(inputnames[count])
                    basename = p.stem

            mesh_fi = get_uniquefn(outpath, basename, 'obj')

            print(f"\nGenerating inpainted mesh .. (go make some coffee) ..")

            # from inpaint.utils.get_MiDaS_samples
            W = img_rgb[count].width
            H = img_rgb[count].height
            int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32)
            if int_mtx.max() > 1:
                int_mtx[0, :] = int_mtx[0, :] / float(W)
                int_mtx[1, :] = int_mtx[1, :] / float(H)

            # how inpaint.utils.read_MiDaS_depth() imports depthmap
            disp = img_depth[count].astype(np.float32)
            disp = disp - disp.min()
            disp = cv2.blur(disp / disp.max(), ksize=(3, 3)) * disp.max()
            disp = (disp / disp.max()) * 3.0
            depth = 1. / np.maximum(disp, 0.05)

            # rgb input
            img = np.asarray(img_rgb[count])
            if len(img.shape) > 2 and img.shape[2] == 4:
                # convert the image from RGBA2RGB
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

            # run sparse bilateral filter
            config['sparse_iter'] = 5
            config['filter_size'] = [7, 7, 5, 5, 5]
            config['sigma_s'] = 4.0
            config['sigma_r'] = 0.5
            vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), img.copy(), config,
                                                                num_iter=config['sparse_iter'], spdb=False)
            depth = vis_depths[-1]

            # bilat_fn = os.path.join(outpath, basename +'_bilatdepth.png')
            # cv2.imwrite(bilat_fn, depth)

            # The edge model is passed twice, once for each of the two edge-model parameters of write_mesh
            rt_info = write_mesh(img,
                                 depth,
                                 int_mtx,
                                 mesh_fi,
                                 config,
                                 rgb_model,
                                 depth_edge_model,
                                 depth_edge_model,
                                 depth_feat_model)

            if rt_info is not False and gen_inpainted_mesh_demos:
                run_3dphoto_videos(mesh_fi, basename, outpath, 300, 40,
                                   [0.03, 0.03, 0.05, 0.03],
                                   ['double-straight-line', 'double-straight-line', 'circle', 'circle'],
                                   [0.00, 0.00, -0.015, -0.015],
                                   [0.00, 0.00, -0.015, -0.00],
                                   [-0.05, -0.05, -0.05, -0.05],
                                   ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, vid_format, vid_ssaa)

            backbone.torch_gc()

    finally:
        # Drop the references to the networks before asking torch to free memory
        rgb_model = None
        depth_edge_model = None
        depth_feat_model = None
        backbone.torch_gc()

    return mesh_fi


def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range,
                       y_shift_range, z_shift_range, video_postfix, vid_dolly, vid_format, vid_ssaa):
    """Render one video per camera trajectory from an inpainted mesh file.

    The parsed mesh is cached in the module-level `video_mesh_data`/`video_mesh_fn`, so
    consecutive calls for the same `mesh_fi` do not read the file again.

    Args:
        mesh_fi: path of the mesh file to render.
        basename: base name for the produced videos.
        outpath: folder in which the videos are written.
        num_frames: number of frames per video.
        fps: frames per second.
        crop_border: crop border setting, stored in the renderer config.
        traj_types, x_shift_range, y_shift_range, z_shift_range, video_postfix: parallel
            lists with one entry per video; they must all have the same length.
        vid_dolly: forwarded to the renderer as `dolly`.
        vid_format: forwarded to the renderer as `fnExt`.
        vid_ssaa: supersampling setting, stored in the renderer config as 'ssaa'.

    Returns:
        The `fn_saved` value produced by `output_3d_photo`.

    Raises:
        AssertionError: if the per-video lists differ in length.
    """
    import vispy
    try:
        if platform.system() == 'Windows':
            vispy.use(app='PyQt5')
        elif platform.system() == 'Darwin':
            vispy.use('PyQt6')
        else:
            vispy.use(app='egl')
    except Exception:
        import traceback
        print(traceback.format_exc())
        print('Trying an alternative...')
        for u in ['PyQt5', 'PyQt6', 'egl']:
            try:
                vispy.use(app=u)
                break
            except Exception:
                print(f'On {u}')
                print(traceback.format_exc())
        # Honestly, I don't know if it actually helps at all

    # read ply
    global video_mesh_data, video_mesh_fn
    if video_mesh_fn is None or video_mesh_fn != mesh_fi:
        # Release the previous mesh before loading the next one, and invalidate the cache
        # key first: if read_mesh() fails, the next call simply retries the load instead of
        # trusting a cache entry that was never filled.
        video_mesh_data = None
        video_mesh_fn = None
        video_mesh_data = read_mesh(mesh_fi)
        video_mesh_fn = mesh_fi

    verts, colors, faces, Height, Width, hFov, vFov, mean_loc_depth = video_mesh_data

    original_w = output_w = W = Width
    original_h = output_h = H = Height
    int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32)
    if int_mtx.max() > 1:
        int_mtx[0, :] = int_mtx[0, :] / float(W)
        int_mtx[1, :] = int_mtx[1, :] / float(H)

    config = {}
    config['video_folder'] = outpath
    config['num_frames'] = num_frames
    config['fps'] = fps
    config['crop_border'] = crop_border
    config['traj_types'] = traj_types
    config['x_shift_range'] = x_shift_range
    config['y_shift_range'] = y_shift_range
    config['z_shift_range'] = z_shift_range
    config['video_postfix'] = video_postfix
    config['ssaa'] = vid_ssaa

    # from inpaint.utils.get_MiDaS_samples
    generic_pose = np.eye(4)
    assert len(config['traj_types']) == len(config['x_shift_range']) == \
           len(config['y_shift_range']) == len(config['z_shift_range']) == len(config['video_postfix']), \
        "The number of elements in 'traj_types', 'x_shift_range', 'y_shift_range', 'z_shift_range' and " \
        "'video_postfix' should be equal."
    # One list of 4x4 camera poses per trajectory: identity rotation, translation along the planned path
    tgts_poses = []
    for traj_idx in range(len(config['traj_types'])):
        tgt_poses = []
        sx, sy, sz = path_planning(config['num_frames'], config['x_shift_range'][traj_idx],
                                   config['y_shift_range'][traj_idx],
                                   config['z_shift_range'][traj_idx], path_type=config['traj_types'][traj_idx])
        for xx, yy, zz in zip(sx, sy, sz):
            tgt_poses.append(generic_pose * 1.)
            tgt_poses[-1][:3, -1] = np.array([xx, yy, zz])
        tgts_poses += [tgt_poses]
    tgt_pose = generic_pose * 1

    # seems we only need the depthmap to calc mean_loc_depth, which is only used when doing 'dolly'
    # width and height are already in the ply file in the comments ..
    # might try to add the mean_loc_depth to it too
    # did just that
    # mean_loc_depth = img_depth[img_depth.shape[0]//2, img_depth.shape[1]//2]

    print("Generating videos ..")

    normal_canvas, all_canvas = None, None
    videos_poses, video_basename = copy.deepcopy(tgts_poses), basename
    top = (original_h // 2 - int_mtx[1, 2] * output_h)
    left = (original_w // 2 - int_mtx[0, 2] * output_w)
    down, right = top + output_h, left + output_w
    border = [int(xx) for xx in [top, down, left, right]]
    normal_canvas, all_canvas, fn_saved = output_3d_photo(verts.copy(), colors.copy(), faces.copy(),
                                                          copy.deepcopy(Height), copy.deepcopy(Width),
                                                          copy.deepcopy(hFov), copy.deepcopy(vFov),
                                                          copy.deepcopy(tgt_pose), config['video_postfix'],
                                                          copy.deepcopy(generic_pose),
                                                          copy.deepcopy(config['video_folder']),
                                                          None, copy.deepcopy(int_mtx), config, None,
                                                          videos_poses, video_basename, original_h, original_w,
                                                          border=border, depth=None, normal_canvas=normal_canvas,
                                                          all_canvas=all_canvas,
                                                          mean_loc_depth=mean_loc_depth, dolly=vid_dolly,
                                                          fnExt=vid_format)
    return fn_saved

def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_border, dolly, vid_format, vid_ssaa,
                  outpath=None, basename=None):
    """Render a single video from an existing mesh file.

    `vid_traj` values 0, 1 and 2 select 'straight-line', 'double-straight-line' and 'circle';
    any other value is forwarded unchanged. `vid_shift` holds three and `vid_border` four
    comma-separated numbers. When `outpath` or `basename` are empty, the backbone output
    path and a unique name derived from the mesh file name are used.

    Returns a 3-tuple: the last saved file name twice, followed by an empty string.
    Raises Exception when the mesh cannot be opened or a list has the wrong length.
    """
    if len(fn_mesh) == 0 or not os.path.exists(fn_mesh):
        raise Exception("Could not open mesh.")

    vid_ssaa = int(vid_ssaa)

    # Map the trajectory index onto a one-element list with its name
    for traj_index, traj_name in enumerate(('straight-line', 'double-straight-line', 'circle')):
        if vid_traj == traj_index:
            vid_traj = [traj_name]
            break

    num_fps = int(vid_fps)
    num_frames = int(vid_numframes)

    shift_parts = vid_shift.split(',')
    if len(shift_parts) != 3:
        raise Exception("Translate requires 3 elements.")
    x_shift_range, y_shift_range, z_shift_range = ([float(part)] for part in shift_parts)

    border_parts = vid_border.split(',')
    if len(border_parts) != 4:
        raise Exception("Crop Border requires 4 elements.")
    crop_border = [float(part) for part in border_parts]

    if not outpath:
        outpath = backbone.get_outpath()

    if not basename:
        # Derive a name from the mesh file and look for the first free "<name>_.<ext>"
        basename = Path(fn_mesh).stem

        first_number = backbone.get_next_sequence_number(outpath, basename)
        if first_number > 0:
            first_number -= 1
        candidate = None
        for attempt in range(500):
            number = first_number + attempt
            stem = f"{number:05}" if basename == '' else f"{basename}-{number:04}"
            candidate = os.path.join(outpath, f"{stem}_." + vid_format)
            if not os.path.exists(candidate):
                break
        # Strip the trailing underscore of the candidate's stem
        basename = Path(candidate).stem[:-1]

    print("Loading mesh ..")

    fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range,
                                  y_shift_range, z_shift_range, [''], dolly, vid_format, vid_ssaa)

    last_saved = fn_saved[-1]
    return last_saved, last_saved, ''

def unload_models():
    """Module-level shortcut that calls `unload_models()` on the shared `model_holder`."""
    model_holder.unload_models()


# TODO: code borrowed from the internet to be marked as such and to reside in separate files

def batched_background_removal(inimages, model_name):
    """Run rembg background removal over every image of `inimages`.

    A single rembg session for `model_name` is shared by the whole batch.
    The directory `models/rem_bg` (under the resolved current directory) is created
    and exported through the U2NET_HOME environment variable before the session starts.

    Returns a new list of PIL images, in input order.
    """
    from rembg import new_session, remove
    print('creating background masks')

    # model path and name
    model_dir = Path().resolve() / "models/rem_bg"
    os.makedirs(model_dir, exist_ok=True)
    os.environ["U2NET_HOME"] = str(model_dir)

    # one session serves all images
    session = new_session(model_name)
    outimages = [Image.fromarray(np.array(remove(picture, session=session))) for picture in inimages]
    # The line below might be redundant
    del session
    return outimages


def pano_depth_to_world_points(depth):
    """Convert a 360 depth map to world points.

    The given 2D depth is an equirectangular projection of a spherical image,
    and every depth value is treated as a radius.

    Columns span the longitude from -pi to pi, rows span the latitude
    from -pi/2 to pi/2.

    Returns an array of shape (rows * cols, 3) with x, y, z per pixel, in row-major order.
    """
    rows, cols = depth.shape[0], depth.shape[1]

    # Per-pixel angles in row-major order: the latitude is constant along a row,
    # the longitude sequence restarts on every row.
    lat = np.repeat(np.linspace(-np.pi / 2, np.pi / 2, rows), cols)
    lon = np.tile(np.linspace(-np.pi, np.pi, cols), rows)

    # Depth acts as the radius of the spherical coordinates
    radius = depth.flatten()

    # Spherical -> cartesian
    cos_lat = np.cos(lat)
    x = radius * cos_lat * np.cos(lon)
    y = radius * cos_lat * np.sin(lon)
    z = radius * np.sin(lat)

    return np.stack([x, y, z], axis=1)


def depth_edges_mask(depth):
    """Returns a mask of edges in the depth map.

    Args:
        depth: 2D numpy array of shape (H, W) with dtype float32.

    Returns:
        mask: 2D numpy array of shape (H, W) with dtype bool. True marks pixels
        where the gradient magnitude of the depth exceeds 0.05.
    """
    # Gradients along both image axes
    grad_rows, grad_cols = np.gradient(depth)
    # Length of the gradient vector at every pixel
    magnitude = np.sqrt(np.square(grad_rows) + np.square(grad_cols))
    # Steep depth changes are considered edges
    return magnitude > 0.05


def create_mesh(image, depth, keep_edges=False, spherical=False):
    """Build a coloured trimesh from an image and its depthmap.

    Args:
        image: PIL image that provides the vertex colours. It is not modified.
        depth: 2D array of depth values, one per pixel.
        keep_edges: if False, triangles lying on depth edges (see depth_edges_mask) are left out.
        spherical: if True, the depth is treated as an equirectangular panorama and the
            resulting mesh is rotated by 90 degrees around the X axis.

    Returns:
        A `trimesh.Trimesh` with one vertex per pixel.
    """
    import trimesh
    from dzoedepth.utils.geometry import depth_to_points, create_triangles
    maxsize = backbone.get_opt('depthmap_script_mesh_maxsize', 2048)

    # limit the size of the input image
    # thumbnail() resizes in place, so work on a copy to leave the caller's image intact
    image = image.copy()
    image.thumbnail((maxsize, maxsize))
    # The colours are reshaped to (-1, 3) below, which requires exactly three channels
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image = np.array(image)

    # Vertices come from the depth, faces and colours from the image: both must have the
    # same resolution, also after the image was shrunk above.
    depth = np.asarray(depth)
    if depth.shape[:2] != image.shape[:2]:
        depth = cv2.resize(depth, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_AREA)

    if not spherical:
        pts3d = depth_to_points(depth[None])
    else:
        pts3d = pano_depth_to_world_points(depth)

    verts = pts3d.reshape(-1, 3)
    if keep_edges:
        triangles = create_triangles(image.shape[0], image.shape[1])
    else:
        triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth))
    colors = image.reshape(-1, 3)

    mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors)

    # rotate 90deg over X when spherical
    if spherical:
        angle = math.pi / 2
        direction = [1, 0, 0]
        center = [0, 0, 0]
        rot_matrix = trimesh.transformations.rotation_matrix(angle, direction, center)
        mesh.apply_transform(rot_matrix)

    return mesh