Chaerin5 committed on
Commit 6aa317f · 1 Parent(s): fe8403c

instruction renovation; allow manual keypoints at edit hands

Files changed (2)
  1. app_regular_gpu.py +2003 -0
  2. no_hands.png +3 -0
app_regular_gpu.py ADDED
@@ -0,0 +1,2003 @@
1
+ import os
2
+ import torch
3
+ from dataclasses import dataclass
4
+ import gradio as gr
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import cv2
8
+ import mediapipe as mp
9
+ from torchvision.transforms import Compose, Resize, ToTensor, Normalize
10
+ import vqvae
11
+ import vit
12
+ from typing import Literal
13
+ from diffusion import create_diffusion
14
+ from utils import scale_keypoint, keypoint_heatmap, check_keypoints_validity
15
+ from segment_hoi import init_sam
16
+ from io import BytesIO
17
+ from PIL import Image
18
+ import random
19
+ from copy import deepcopy
20
+ from typing import Optional
21
+ import requests
22
+ from huggingface_hub import hf_hub_download
23
+ # import spaces
24
+
25
+ MAX_N = 6
26
+ FIX_MAX_N = 6
27
+
28
+ placeholder = cv2.cvtColor(cv2.imread("placeholder.png"), cv2.COLOR_BGR2RGB)
29
+ NEW_MODEL = True
30
+ MODEL_EPOCH = 6
31
+ REF_POSE_MASK = True
32
+
33
+ def set_seed(seed):
34
+ seed = int(seed)
35
+ torch.manual_seed(seed)
36
+ np.random.seed(seed)
37
+ torch.cuda.manual_seed_all(seed)
38
+ random.seed(seed)
39
+
40
+ # if torch.cuda.is_available():
41
+ device = "cuda"
42
+ # else:
43
+ # device = "cpu"
44
+
45
+ def remove_prefix(text, prefix):
46
+ if text.startswith(prefix):
47
+ return text[len(prefix) :]
48
+ return text
49
+
50
+
51
+ def unnormalize(x):
52
+ return (((x + 1) / 2) * 255).astype(np.uint8)
53
+
54
+
55
+ def visualize_hand(all_joints, img, side=["right", "left"], n_avail_joints=21):
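+ # Draw the 21-keypoint hand skeleton(s) over img with matplotlib and return the rendering as a numpy image.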
56
+ # Define the connections between joints for drawing lines and their corresponding colors
57
+ connections = [
58
+ ((0, 1), "red"),
59
+ ((1, 2), "green"),
60
+ ((2, 3), "blue"),
61
+ ((3, 4), "purple"),
62
+ ((0, 5), "orange"),
63
+ ((5, 6), "pink"),
64
+ ((6, 7), "brown"),
65
+ ((7, 8), "cyan"),
66
+ ((0, 9), "yellow"),
67
+ ((9, 10), "magenta"),
68
+ ((10, 11), "lime"),
69
+ ((11, 12), "indigo"),
70
+ ((0, 13), "olive"),
71
+ ((13, 14), "teal"),
72
+ ((14, 15), "navy"),
73
+ ((15, 16), "gray"),
74
+ ((0, 17), "lavender"),
75
+ ((17, 18), "silver"),
76
+ ((18, 19), "maroon"),
77
+ ((19, 20), "fuchsia"),
78
+ ]
79
+ H, W, C = img.shape
80
+
81
+ # Create a figure and axis
82
+ plt.figure()
83
+ ax = plt.gca()
84
+ # Plot joints as points
85
+ ax.imshow(img)
86
+ start_is = []
87
+ if "right" in side:
88
+ start_is.append(0)
89
+ if "left" in side:
90
+ start_is.append(21)
91
+ for start_i in start_is:
92
+ joints = all_joints[start_i : start_i + n_avail_joints]
93
+ if len(joints) == 1:
94
+ ax.scatter(joints[0][0], joints[0][1], color="red", s=10)
95
+ else:
96
+ for connection, color in connections[: len(joints) - 1]:
97
+ joint1 = joints[connection[0]]
98
+ joint2 = joints[connection[1]]
99
+ ax.plot([joint1[0], joint2[0]], [joint1[1], joint2[1]], color=color)
100
+
101
+ ax.set_xlim([0, W])
102
+ ax.set_ylim([0, H])
103
+ ax.grid(False)
104
+ ax.set_axis_off()
105
+ ax.invert_yaxis()
106
+ # plt.subplots_adjust(wspace=0.01)
107
+ # plt.show()
108
+ buf = BytesIO()
109
+ plt.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
110
+ plt.close()
111
+
112
+ # Convert BytesIO object to numpy array
113
+ buf.seek(0)
114
+ img_pil = Image.open(buf)
115
+ img_pil = img_pil.resize((W, H))  # PIL resize takes (width, height)
116
+ numpy_img = np.array(img_pil)
117
+
118
+ return numpy_img
119
+
120
+
121
+ def mask_image(image, mask, color=[0, 0, 0], alpha=0.6, transparent=True):
122
+ """Overlay mask on image for visualization purpose.
123
+ Args:
124
+ image (H, W, 3) or (H, W): input image
125
+ mask (H, W): mask to be overlaid
126
+ color: the color of overlaid mask
127
+ alpha: the transparency of the mask
128
+ """
129
+ out = deepcopy(image)
130
+ img = deepcopy(image)
131
+ img[mask == 1] = color
132
+ if transparent:
133
+ out = cv2.addWeighted(img, alpha, out, 1 - alpha, 0, out)
134
+ else:
135
+ out = img
136
+ return out
137
+
138
+
139
+ def scale_keypoint(keypoint, original_size, target_size):
140
+ """Scale a keypoint based on the resizing of the image."""
141
+ keypoint_copy = keypoint.copy()
142
+ keypoint_copy[:, 0] *= target_size[0] / original_size[0]
143
+ keypoint_copy[:, 1] *= target_size[1] / original_size[1]
144
+ return keypoint_copy
145
+
146
+
147
+ print("Configure...")
148
+
149
+
150
+ @dataclass
151
+ class HandDiffOpts:
152
+ run_name: str = "ViT_256_handmask_heatmap_nvs_b25_lr1e-5"
153
+ sd_path: str = "/users/kchen157/scratch/weights/SD/sd-v1-4.ckpt"
154
+ log_dir: str = "/users/kchen157/scratch/log"
155
+ data_root: str = "/users/kchen157/data/users/kchen157/dataset/handdiff"
156
+ image_size: tuple = (256, 256)
157
+ latent_size: tuple = (32, 32)
158
+ latent_dim: int = 4
159
+ mask_bg: bool = False
160
+ kpts_form: str = "heatmap"
161
+ n_keypoints: int = 42
162
+ n_mask: int = 1
163
+ noise_steps: int = 1000
164
+ test_sampling_steps: int = 250
165
+ ddim_steps: int = 100
166
+ ddim_discretize: str = "uniform"
167
+ ddim_eta: float = 0.0
168
+ beta_start: float = 8.5e-4
169
+ beta_end: float = 0.012
170
+ latent_scaling_factor: float = 0.18215
171
+ cfg_pose: float = 5.0
172
+ cfg_appearance: float = 3.5
173
+ batch_size: int = 25
174
+ lr: float = 1e-5
175
+ max_epochs: int = 500
176
+ log_every_n_steps: int = 100
177
+ limit_val_batches: int = 1
178
+ n_gpu: int = 8
179
+ num_nodes: int = 1
180
+ precision: str = "16-mixed"
181
+ profiler: str = "simple"
182
+ swa_epoch_start: int = 10
183
+ swa_lrs: float = 1e-3
184
+ num_workers: int = 10
185
+ n_val_samples: int = 4
186
+
187
+ # load models
188
+ token = os.getenv("HF_TOKEN")
189
+ if NEW_MODEL:
190
+ opts = HandDiffOpts()
191
+ if MODEL_EPOCH == 7:
192
+ model_path = './DINO_EMA_11M_b50_lr1e-5_epoch7_step380k.ckpt'
193
+ elif MODEL_EPOCH == 6:
194
+ # model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt"
195
+ model_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt", token=token)
196
+ elif MODEL_EPOCH == 4:
197
+ model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch4_step210k.ckpt"
198
+ elif MODEL_EPOCH == 10:
199
+ model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch10_step550k.ckpt"
200
+ else:
201
+ raise ValueError(f"MODEL_EPOCH should be one of 4, 6, 7, or 10, got {MODEL_EPOCH}")
202
+ # vae_path = './vae-ft-mse-840000-ema-pruned.ckpt'
203
+ vae_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="vae-ft-mse-840000-ema-pruned.ckpt", token=token)
204
+ # sd_path = './sd-v1-4.ckpt'
205
+ print('Load diffusion model...')
206
+ diffusion = create_diffusion(str(opts.test_sampling_steps))
207
+ model = vit.DiT_XL_2(
208
+ input_size=opts.latent_size[0],
209
+ latent_dim=opts.latent_dim,
210
+ in_channels=opts.latent_dim+opts.n_keypoints+opts.n_mask,
211
+ learn_sigma=True,
212
+ ).to(device)
213
+ # ckpt_state_dict = torch.load(model_path)['model_state_dict']
214
+ ckpt_state_dict = torch.load(model_path, map_location='cpu')['ema_state_dict']
215
+ missing_keys, extra_keys = model.load_state_dict(ckpt_state_dict, strict=False)
216
+ model = model.to(device)
217
+ model.eval()
218
+ print(missing_keys, extra_keys)
219
+ assert len(missing_keys) == 0
220
+ vae_state_dict = torch.load(vae_path, map_location='cpu')['state_dict']
221
+ print(f"vae_state_dict encoder dtype: {vae_state_dict['encoder.conv_in.weight'].dtype}")
222
+ autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False)
223
+ print(f"autoencoder encoder dtype: {next(autoencoder.encoder.parameters()).dtype}")
224
+ print(f"encoder before load_state_dict parameters min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
225
+ print(f"encoder before load_state_dict parameters max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
226
+ missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
227
+ print(f"encoder after load_state_dict parameters min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
228
+ print(f"encoder after load_state_dict parameters max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
229
+ autoencoder = autoencoder.to(device)
230
+ autoencoder.eval()
231
+ print(f"encoder after eval() min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
232
+ print(f"encoder after eval() max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
233
+ print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}")
234
+ assert len(missing_keys) == 0
235
+ # else:
236
+ # opts = HandDiffOpts()
237
+ # model_path = './finetune_epoch=5-step=130000.ckpt'
238
+ # sd_path = './sd-v1-4.ckpt'
239
+ # print('Load diffusion model...')
240
+ # diffusion = create_diffusion(str(opts.test_sampling_steps))
241
+ # model = vit.DiT_XL_2(
242
+ # input_size=opts.latent_size[0],
243
+ # latent_dim=opts.latent_dim,
244
+ # in_channels=opts.latent_dim+opts.n_keypoints+opts.n_mask,
245
+ # learn_sigma=True,
246
+ # ).to(device)
247
+ # ckpt_state_dict = torch.load(model_path)['state_dict']
248
+ # dit_state_dict = {remove_prefix(k, 'diffusion_backbone.'): v for k, v in ckpt_state_dict.items() if k.startswith('diffusion_backbone')}
249
+ # vae_state_dict = {remove_prefix(k, 'autoencoder.'): v for k, v in ckpt_state_dict.items() if k.startswith('autoencoder')}
250
+ # missing_keys, extra_keys = model.load_state_dict(dit_state_dict, strict=False)
251
+ # model.eval()
252
+ # assert len(missing_keys) == 0 and len(extra_keys) == 0
253
+ # autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False).to(device)
254
+ # missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
255
+ # autoencoder.eval()
256
+ # assert len(missing_keys) == 0 and len(extra_keys) == 0
257
+ sam_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="sam_vit_h_4b8939.pth", token=token)
258
+ sam_predictor = init_sam(ckpt_path=sam_path, device='cuda')
259
+
260
+
261
+ print("Mediapipe hand detector and SAM ready...")
262
+ mp_hands = mp.solutions.hands
263
+ hands = mp_hands.Hands(
264
+ static_image_mode=True, # Use False if image is part of a video stream
265
+ max_num_hands=2, # Maximum number of hands to detect
266
+ min_detection_confidence=0.1,
267
+ )
268
+
269
+ def prepare_ref_anno(ref):
270
+ if ref is None:
271
+ return None, None  # this callback feeds two outputs: ref_im_raw and ref_kp_raw
278
+ missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
279
+
280
+ img = ref["composite"][..., :3]
281
+ img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
282
+ keypts = np.zeros((42, 2))
283
+ # if REF_POSE_MASK:
284
+ mp_pose = hands.process(img)
285
+ # detected = np.array([0, 0])
286
+ # start_idx = 0
287
+ if mp_pose.multi_hand_landmarks:
288
+ # handedness labels are flipped because MediaPipe assumes a mirrored input image
289
+ for hand_landmarks, handedness in zip(
290
+ mp_pose.multi_hand_landmarks, mp_pose.multi_handedness
291
+ ):
292
+ # actually right hand
293
+ if handedness.classification[0].label == "Left":
294
+ start_idx = 0
295
+ # detected[0] = 1
296
+ # actually left hand
297
+ elif handedness.classification[0].label == "Right":
298
+ start_idx = 21
299
+ # detected[1] = 1
300
+ for i, landmark in enumerate(hand_landmarks.landmark):
301
+ keypts[start_idx + i] = [
302
+ landmark.x * opts.image_size[1],
303
+ landmark.y * opts.image_size[0],
304
+ ]
305
+
306
+ # sam_predictor.set_image(img)
307
+ # l = keypts[:21].shape[0]
308
+ # if keypts[0].sum() != 0 and keypts[21].sum() != 0:
309
+ # input_point = np.array([keypts[0], keypts[21]])
310
+ # input_label = np.array([1, 1])
311
+ # elif keypts[0].sum() != 0:
312
+ # input_point = np.array(keypts[:1])
313
+ # input_label = np.array([1])
314
+ # elif keypts[21].sum() != 0:
315
+ # input_point = np.array(keypts[21:22])
316
+ # input_label = np.array([1])
317
+ # masks, _, _ = sam_predictor.predict(
318
+ # point_coords=input_point,
319
+ # point_labels=input_label,
320
+ # multimask_output=False,
321
+ # )
322
+ # hand_mask = masks[0]
323
+ # masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
324
+ # ref_pose = visualize_hand(keypts, masked_img)
325
+ print(f"keypts.max(): {keypts.max()}, keypts.min(): {keypts.min()}")
326
+ return img, keypts
327
+ else:
328
+ return img, None
329
+ # raise gr.Error("No hands detected in the reference image.")
330
+ # else:
331
+ # hand_mask = np.zeros_like(img[:,:, 0])
332
+ # ref_pose = np.zeros_like(img)
333
+
334
+ def get_ref_anno(img, keypts):
335
+ if keypts is None:
336
+ no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH))
337
+ return None, no_hands, None
338
+ if isinstance(keypts, list):
339
+ if len(keypts[0]) == 0:
340
+ keypts[0] = np.zeros((21, 2))
341
+ elif len(keypts[0]) == 21:
342
+ keypts[0] = np.array(keypts[0], dtype=np.float32)
343
+ else:
344
+ gr.Info("Number of right hand keypoints should be either 0 or 21.")
345
+ return None, None, None  # img, ref_pose, ref_cond
346
+
347
+ if len(keypts[1]) == 0:
348
+ keypts[1] = np.zeros((21, 2))
349
+ elif len(keypts[1]) == 21:
350
+ keypts[1] = np.array(keypts[1], dtype=np.float32)
351
+ else:
352
+ gr.Info("Number of left hand keypoints should be either 0 or 21.")
353
+ return None, None, None  # img, ref_pose, ref_cond
354
+
355
+ keypts = np.concatenate(keypts, axis=0)
356
+ # keypts = scale_keypoint(keypts, (LENGTH, LENGTH), opts.image_size)
357
+ if REF_POSE_MASK:
358
+ sam_predictor.set_image(img)
359
+ # l = keypts[:21].shape[0]
360
+ if keypts[0].sum() != 0 and keypts[21].sum() != 0:
361
+ input_point = np.array([keypts[0], keypts[21]])
362
+ input_label = np.array([1, 1])
363
+ elif keypts[0].sum() != 0:
364
+ input_point = np.array(keypts[:1])
365
+ input_label = np.array([1])
366
+ elif keypts[21].sum() != 0:
367
+ input_point = np.array(keypts[21:22])
368
+ input_label = np.array([1])
369
+ masks, _, _ = sam_predictor.predict(
370
+ point_coords=input_point,
371
+ point_labels=input_label,
372
+ multimask_output=False,
373
+ )
374
+ hand_mask = masks[0]
375
+ masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
376
+ ref_pose = visualize_hand(keypts, masked_img)
377
+ else:
378
+ hand_mask = np.zeros_like(img[:,:, 0])
379
+ ref_pose = np.zeros_like(img)
380
+ def make_ref_cond(
381
+ img,
382
+ keypts,
383
+ hand_mask,
384
+ device="cuda",
385
+ target_size=(256, 256),
386
+ latent_size=(32, 32),
387
+ ):
388
+ image_transform = Compose(
389
+ [
390
+ ToTensor(),
391
+ Resize(target_size),
392
+ Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
393
+ ]
394
+ )
395
+ image = image_transform(img).to(device)
396
+ kpts_valid = check_keypoints_validity(keypts, target_size)
397
+ heatmaps = torch.tensor(
398
+ keypoint_heatmap(
399
+ scale_keypoint(keypts, target_size, latent_size), latent_size, var=1.0
400
+ )
401
+ * kpts_valid[:, None, None],
402
+ dtype=torch.float,
403
+ device=device
404
+ )[None, ...]
405
+ mask = torch.tensor(
406
+ cv2.resize(
407
+ hand_mask.astype(int),
408
+ dsize=latent_size,
409
+ interpolation=cv2.INTER_NEAREST,
410
+ ),
411
+ dtype=torch.float,
412
+ device=device,
413
+ ).unsqueeze(0)[None, ...]
414
+ return image[None, ...], heatmaps, mask
415
+
416
+ print(f"img.max(): {img.max()}, img.min(): {img.min()}")
417
+ image, heatmaps, mask = make_ref_cond(
418
+ img,
419
+ keypts,
420
+ hand_mask,
421
+ device="cuda",
422
+ target_size=opts.image_size,
423
+ latent_size=opts.latent_size,
424
+ )
425
+ print(f"image.max(): {image.max()}, image.min(): {image.min()}")
426
+ print(f"opts.latent_scaling_factor: {opts.latent_scaling_factor}")
427
+ print(f"autoencoder encoder before operating min: {min([p.min() for p in autoencoder.encoder.parameters()])}")
428
+ print(f"autoencoder encoder before operating max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
429
+ print(f"autoencoder encoder before operating dtype: {next(autoencoder.encoder.parameters()).dtype}")
430
+ latent = opts.latent_scaling_factor * autoencoder.encode(image).sample()
431
+ print(f"latent.max(): {latent.max()}, latent.min(): {latent.min()}")
432
+ if not REF_POSE_MASK:
433
+ heatmaps = torch.zeros_like(heatmaps)
434
+ mask = torch.zeros_like(mask)
435
+ print(f"heatmaps.max(): {heatmaps.max()}, heatmaps.min(): {heatmaps.min()}")
436
+ print(f"mask.max(): {mask.max()}, mask.min(): {mask.min()}")
437
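+ # Reference conditioning = VAE latent + 42 keypoint heatmaps + hand mask, matching the DiT's extra input channels.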
+ ref_cond = torch.cat([latent, heatmaps, mask], 1)
438
+ print(f"ref_cond.max(): {ref_cond.max()}, ref_cond.min(): {ref_cond.min()}")
439
+
440
+ return img, ref_pose, ref_cond
441
+
442
+ def get_target_anno(target):
443
+ if target is None:
444
+ return (
445
+ gr.State.update(value=None),
446
+ gr.Image.update(value=None),
447
+ gr.State.update(value=None),
448
+ gr.State.update(value=None),
449
+ )
450
+ pose_img = target["composite"][..., :3]
451
+ pose_img = cv2.resize(pose_img, opts.image_size, interpolation=cv2.INTER_AREA)
452
+ # detect keypoints
453
+ mp_pose = hands.process(pose_img)
454
+ target_keypts = np.zeros((42, 2))
455
+ detected = np.array([0, 0])
456
+ start_idx = 0
457
+ if mp_pose.multi_hand_landmarks:
458
+ # handedness labels are flipped because MediaPipe assumes a mirrored input image
459
+ for hand_landmarks, handedness in zip(
460
+ mp_pose.multi_hand_landmarks, mp_pose.multi_handedness
461
+ ):
462
+ # actually right hand
463
+ if handedness.classification[0].label == "Left":
464
+ start_idx = 0
465
+ detected[0] = 1
466
+ # actually left hand
467
+ elif handedness.classification[0].label == "Right":
468
+ start_idx = 21
469
+ detected[1] = 1
470
+ for i, landmark in enumerate(hand_landmarks.landmark):
471
+ target_keypts[start_idx + i] = [
472
+ landmark.x * opts.image_size[1],
473
+ landmark.y * opts.image_size[0],
474
+ ]
475
+
476
+ target_pose = visualize_hand(target_keypts, pose_img)
477
+ kpts_valid = check_keypoints_validity(target_keypts, opts.image_size)
478
+ target_heatmaps = torch.tensor(
479
+ keypoint_heatmap(
480
+ scale_keypoint(target_keypts, opts.image_size, opts.latent_size),
481
+ opts.latent_size,
482
+ var=1.0,
483
+ )
484
+ * kpts_valid[:, None, None],
485
+ dtype=torch.float,
486
+ # device=device,
487
+ )[None, ...]
488
+ target_cond = torch.cat(
489
+ [target_heatmaps, torch.zeros_like(target_heatmaps)[:, :1]], 1
490
+ )
491
+ else:
492
+ raise gr.Error("No hands detected in the target image.")
493
+
494
+ return pose_img, target_pose, target_cond, target_keypts
495
+
496
+
497
+ def get_mask_inpaint(ref):
498
+ inpaint_mask = np.array(ref["layers"][0])[..., -1]
499
+ inpaint_mask = cv2.resize(
500
+ inpaint_mask, opts.image_size, interpolation=cv2.INTER_AREA
501
+ )
502
+ inpaint_mask = (inpaint_mask >= 128).astype(np.uint8)
503
+ return inpaint_mask
504
+
505
+
506
+ def visualize_ref(crop, brush):
507
+ if crop is None or brush is None:
508
+ return None
509
+ inpainted = brush["layers"][0][..., -1]
510
+ img = crop["background"][..., :3]
511
+ img = cv2.resize(img, inpainted.shape[::-1], interpolation=cv2.INTER_AREA)
512
+ mask = inpainted < 128
513
+ # img = img.astype(np.int32)
514
+ # img[mask, :] = img[mask, :] - 50
515
+ # img[np.any(img<0, axis=-1)]=0
516
+ # img = img.astype(np.uint8)
517
+ img = mask_image(img, mask)
518
+ return img
519
+
520
+
521
+ def get_kps(img, keypoints, side: Literal["right", "left"], evt: gr.SelectData):
522
+ if keypoints is None:
523
+ keypoints = [[], []]
524
+ kps = np.zeros((42, 2))
525
+ if side == "right":
526
+ if len(keypoints[0]) == 21:
527
+ gr.Info("21 keypoints for right hand already selected. Try reset if something looks wrong.")
528
+ else:
529
+ keypoints[0].append(list(evt.index))
530
+ len_kps = len(keypoints[0])
531
+ kps[:len_kps] = np.array(keypoints[0])
532
+ elif side == "left":
533
+ if len(keypoints[1]) == 21:
534
+ gr.Info("21 keypoints for left hand already selected. Try reset if something looks wrong.")
535
+ else:
536
+ keypoints[1].append(list(evt.index))
537
+ len_kps = len(keypoints[1])
538
+ kps[21 : 21 + len_kps] = np.array(keypoints[1])
539
+ vis_hand = visualize_hand(kps, img, side, len_kps)
540
+ return vis_hand, keypoints
541
+
542
+
543
+ def undo_kps(img, keypoints, side: Literal["right", "left"]):
544
+ if keypoints is None:
545
+ return img, None
546
+ kps = np.zeros((42, 2))
547
+ if side == "right":
548
+ if len(keypoints[0]) == 0:
549
+ return img, keypoints
550
+ keypoints[0].pop()
551
+ len_kps = len(keypoints[0])
552
+ kps[:len_kps] = np.array(keypoints[0])
553
+ elif side == "left":
554
+ if len(keypoints[1]) == 0:
555
+ return img, keypoints
556
+ keypoints[1].pop()
557
+ len_kps = len(keypoints[1])
558
+ kps[21 : 21 + len_kps] = np.array(keypoints[1])
559
+ vis_hand = visualize_hand(kps, img, side, len_kps)
560
+ return vis_hand, keypoints
561
+
562
+
563
+ def reset_kps(img, keypoints, side: Literal["right", "left"]):
564
+ if keypoints is None:
565
+ return img, None
566
+ if side == "right":
567
+ keypoints[0] = []
568
+ elif side == "left":
569
+ keypoints[1] = []
570
+ return img, keypoints
571
+
572
+ # @spaces.GPU(duration=60)
573
+ def sample_diff(ref_cond, target_cond, target_keypts, num_gen, seed, cfg):
574
+ set_seed(seed)
575
+ z = torch.randn(
576
+ (num_gen, opts.latent_dim, opts.latent_size[0], opts.latent_size[1]),
577
+ device=device,
578
+ )
579
+ print(f"z.device: {z.device}")
580
+ target_cond = target_cond.repeat(num_gen, 1, 1, 1).to(z.device)
581
+ ref_cond = ref_cond.repeat(num_gen, 1, 1, 1).to(z.device)
582
+ print(f"target_cond.max(): {target_cond.max()}, target_cond.min(): {target_cond.min()}")
583
+ print(f"ref_cond.max(): {ref_cond.max()}, ref_cond.min(): {ref_cond.min()}")
584
+ # novel view synthesis mode = off
585
+ nvs = torch.zeros(num_gen, dtype=torch.int, device=device)
586
+ z = torch.cat([z, z], 0)
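+ # Classifier-free guidance: duplicate the noise and pair each conditional input with a zeroed (unconditional) one.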
587
+ model_kwargs = dict(
588
+ target_cond=torch.cat([target_cond, torch.zeros_like(target_cond)]),
589
+ ref_cond=torch.cat([ref_cond, torch.zeros_like(ref_cond)]),
590
+ nvs=torch.cat([nvs, 2 * torch.ones_like(nvs)]),
591
+ cfg_scale=cfg,
592
+ )
593
+
594
+ samples, _ = diffusion.p_sample_loop(
595
+ model.forward_with_cfg,
596
+ z.shape,
597
+ z,
598
+ clip_denoised=False,
599
+ model_kwargs=model_kwargs,
600
+ progress=True,
601
+ device=device,
602
+ ).chunk(2)
603
+ sampled_images = autoencoder.decode(samples / opts.latent_scaling_factor)
604
+ sampled_images = torch.clamp(sampled_images, min=-1.0, max=1.0)
605
+ sampled_images = unnormalize(sampled_images.permute(0, 2, 3, 1).cpu().numpy())
606
+
607
+ results = []
608
+ results_pose = []
609
+ for i in range(MAX_N):
610
+ if i < num_gen:
611
+ results.append(sampled_images[i])
612
+ results_pose.append(visualize_hand(target_keypts, sampled_images[i]))
613
+ else:
614
+ results.append(placeholder)
615
+ results_pose.append(placeholder)
616
+ print(f"results[0].max(): {results[0].max()}")
617
+ return results, results_pose
618
+
619
+ # @spaces.GPU(duration=120)
620
+ def ready_sample(img_ori, inpaint_mask, keypts):
621
+ img = cv2.resize(img_ori[..., :3], opts.image_size, interpolation=cv2.INTER_AREA)
622
+ sam_predictor.set_image(img)
623
+ if len(keypts[0]) == 0:
624
+ keypts[0] = np.zeros((21, 2))
625
+ elif len(keypts[0]) == 21:
626
+ keypts[0] = np.array(keypts[0], dtype=np.float32)
627
+ else:
628
+ gr.Info("Number of right hand keypoints should be either 0 or 21.")
629
+ return None, None, None, None, None, None, None  # match the seven values returned on success
630
+
631
+ if len(keypts[1]) == 0:
632
+ keypts[1] = np.zeros((21, 2))
633
+ elif len(keypts[1]) == 21:
634
+ keypts[1] = np.array(keypts[1], dtype=np.float32)
635
+ else:
636
+ gr.Info("Number of left hand keypoints should be either 0 or 21.")
637
+ return None, None, None, None, None, None, None  # match the seven values returned on success
638
+
639
+ keypts = np.concatenate(keypts, axis=0)
640
+ keypts = scale_keypoint(keypts, (LENGTH, LENGTH), opts.image_size)
641
+ # if keypts[0].sum() != 0 and keypts[21].sum() != 0:
642
+ # input_point = np.array([keypts[0], keypts[21]])
643
+ # # input_point = keypts
644
+ # input_label = np.array([1, 1])
645
+ # # input_label = np.ones_like(input_point[:, 0])
646
+ # elif keypts[0].sum() != 0:
647
+ # input_point = np.array(keypts[:1])
648
+ # # input_point = keypts[:21]
649
+ # input_label = np.array([1])
650
+ # # input_label = np.ones_like(input_point[:21, 0])
651
+ # elif keypts[21].sum() != 0:
652
+ # input_point = np.array(keypts[21:22])
653
+ # # input_point = keypts[21:]
654
+ # input_label = np.array([1])
655
+ # # input_label = np.ones_like(input_point[21:, 0])
656
+
657
+ box_shift_ratio = 0.5
658
+ box_size_factor = 1.2
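+ # SAM prompt: clicked keypoints as positive points plus their bounding box, enlarged by box_size_factor around its center.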
659
+
660
+ if keypts[0].sum() != 0 and keypts[21].sum() != 0:
661
+ input_point = np.array(keypts)
662
+ input_box = np.stack([keypts.min(axis=0), keypts.max(axis=0)])
663
+ elif keypts[0].sum() != 0:
664
+ input_point = np.array(keypts[:21])
665
+ input_box = np.stack([keypts[:21].min(axis=0), keypts[:21].max(axis=0)])
666
+ elif keypts[21].sum() != 0:
667
+ input_point = np.array(keypts[21:])
668
+ input_box = np.stack([keypts[21:].min(axis=0), keypts[21:].max(axis=0)])
669
+ else:
670
+ raise ValueError(
671
+ "Something went wrong: if no hands were detected, this point should not be reached."
672
+ )
673
+
674
+ input_label = np.ones_like(input_point[:, 0]).astype(np.int32)
675
+ box_trans = input_box[0] * box_shift_ratio + input_box[1] * (1 - box_shift_ratio)
676
+ input_box = ((input_box - box_trans) * box_size_factor + box_trans).reshape(-1)
677
+
678
+ masks, _, _ = sam_predictor.predict(
679
+ point_coords=input_point,
680
+ point_labels=input_label,
681
+ box=input_box[None, :],
682
+ multimask_output=False,
683
+ )
684
+ hand_mask = masks[0]
685
+
686
+ inpaint_latent_mask = torch.tensor(
687
+ cv2.resize(
688
+ inpaint_mask, dsize=opts.latent_size, interpolation=cv2.INTER_NEAREST
689
+ ),
690
+ dtype=torch.float,
691
+ # device=device,
692
+ ).unsqueeze(0)[None, ...]
693
+
694
+ def make_ref_cond(
695
+ img,
696
+ keypts,
697
+ hand_mask,
698
+ device=device,
699
+ target_size=(256, 256),
700
+ latent_size=(32, 32),
701
+ ):
702
+ image_transform = Compose(
703
+ [
704
+ ToTensor(),
705
+ Resize(target_size),
706
+ Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
707
+ ]
708
+ )
709
+ image = image_transform(img)
710
+ kpts_valid = check_keypoints_validity(keypts, target_size)
711
+ heatmaps = torch.tensor(
712
+ keypoint_heatmap(
713
+ scale_keypoint(keypts, target_size, latent_size), latent_size, var=1.0
714
+ )
715
+ * kpts_valid[:, None, None],
716
+ dtype=torch.float,
717
+ # device=device,
718
+ )[None, ...]
719
+ mask = torch.tensor(
720
+ cv2.resize(
721
+ hand_mask.astype(int),
722
+ dsize=latent_size,
723
+ interpolation=cv2.INTER_NEAREST,
724
+ ),
725
+ dtype=torch.float,
726
+ # device=device,
727
+ ).unsqueeze(0)[None, ...]
728
+ return image[None, ...], heatmaps, mask
729
+
730
+ image, heatmaps, mask = make_ref_cond(
731
+ img,
732
+ keypts,
733
+ hand_mask * (1 - inpaint_mask),
734
+ device=device,
735
+ target_size=opts.image_size,
736
+ latent_size=opts.latent_size,
737
+ )
738
+ latent = opts.latent_scaling_factor * autoencoder.encode(image).sample()
739
+ target_cond = torch.cat([heatmaps, torch.zeros_like(mask)], 1)
740
+ ref_cond = torch.cat([latent, heatmaps, mask], 1)
741
+ ref_cond = torch.zeros_like(ref_cond)
742
+
743
+ img32 = cv2.resize(img, opts.latent_size, interpolation=cv2.INTER_NEAREST)
744
+ assert mask.max() == 1
745
+ vis_mask32 = mask_image(
746
+ img32, inpaint_latent_mask[0,0].cpu().numpy(), (255,255,255), transparent=False
747
+ ).astype(np.uint8) # 1.0 - mask[0, 0].cpu().numpy()
748
+
749
+ assert np.unique(inpaint_mask).shape[0] <= 2
750
+ assert hand_mask.dtype == bool
751
+ mask256 = inpaint_mask # hand_mask * (1 - inpaint_mask)
752
+ vis_mask256 = mask_image(img, mask256, (255,255,255), transparent=False).astype(
753
+ np.uint8
754
+ ) # 1 - mask256
755
+
756
+ return (
757
+ ref_cond,
758
+ target_cond,
759
+ latent,
760
+ inpaint_latent_mask,
761
+ keypts,
762
+ vis_mask32,
763
+ vis_mask256,
764
+ )
765
+
766
+
767
+ def switch_mask_size(radio):
768
+ if radio == "256x256":
769
+ out = (gr.update(visible=False), gr.update(visible=True))
770
+ elif radio == "latent size (32x32)":
771
+ out = (gr.update(visible=True), gr.update(visible=False))
772
+ return out
773
+
774
+ # @spaces.GPU(duration=300)
775
+ def sample_inpaint(
776
+ ref_cond,
777
+ target_cond,
778
+ latent,
779
+ inpaint_latent_mask,
780
+ keypts,
781
+ num_gen,
782
+ seed,
783
+ cfg,
784
+ quality,
785
+ ):
786
+ set_seed(seed)
787
+ N = num_gen
788
+ jump_length = 10
789
+ jump_n_sample = quality
790
+ cfg_scale = cfg
791
+ z = torch.randn(
792
+ (N, opts.latent_dim, opts.latent_size[0], opts.latent_size[1]), device=device
793
+ )
794
+ target_cond_N = target_cond.repeat(N, 1, 1, 1).to(z.device)
795
+ ref_cond_N = ref_cond.repeat(N, 1, 1, 1).to(z.device)
796
+ # novel view synthesis mode = off
797
+ nvs = torch.zeros(N, dtype=torch.int, device=device)
798
+ z = torch.cat([z, z], 0)
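+ # Same classifier-free guidance batching as in sample_diff: conditional and unconditional halves share the same noise.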
799
+ model_kwargs = dict(
800
+ target_cond=torch.cat([target_cond_N, torch.zeros_like(target_cond_N)]),
801
+ ref_cond=torch.cat([ref_cond_N, torch.zeros_like(ref_cond_N)]),
802
+ nvs=torch.cat([nvs, 2 * torch.ones_like(nvs)]),
803
+ cfg_scale=cfg_scale,
804
+ )
805
+
806
+ samples, _ = diffusion.inpaint_p_sample_loop(
807
+ model.forward_with_cfg,
808
+ z.shape,
809
+ latent.to(z.device),
810
+ inpaint_latent_mask.to(z.device),
811
+ z,
812
+ clip_denoised=False,
813
+ model_kwargs=model_kwargs,
814
+ progress=True,
815
+ device=z.device,
816
+ jump_length=jump_length,
817
+ jump_n_sample=jump_n_sample,
818
+ ).chunk(2)
819
+ sampled_images = autoencoder.decode(samples / opts.latent_scaling_factor)
820
+ sampled_images = torch.clamp(sampled_images, min=-1.0, max=1.0)
821
+ sampled_images = unnormalize(sampled_images.permute(0, 2, 3, 1).cpu().numpy())
822
+
823
+ # visualize
824
+ results = []
825
+ results_pose = []
826
+ for i in range(FIX_MAX_N):
827
+ if i < num_gen:
828
+ results.append(sampled_images[i])
829
+ results_pose.append(visualize_hand(keypts, sampled_images[i]))
830
+ else:
831
+ results.append(placeholder)
832
+ results_pose.append(placeholder)
833
+ return results, results_pose
834
+
835
+
836
+ def flip_hand(
837
+ img, pose_img, cond: Optional[torch.Tensor], keypts: Optional[torch.Tensor] = None, pose_manual_img = None,
838
+ manual_kp_right=None, manual_kp_left=None
839
+ ):
840
+ if cond is None: # clear clicked
841
+ return None, None, None, None
842
+ img["composite"] = img["composite"][:, ::-1, :]
843
+ img["background"] = img["background"][:, ::-1, :]
844
+ img["layers"] = [layer[:, ::-1, :] for layer in img["layers"]]
845
+ pose_img = pose_img[:, ::-1, :]
846
+ cond = cond.flip(-1)
847
+ if keypts is not None: # cond is target_cond
848
+ if keypts[:21, :].sum() != 0:
849
+ keypts[:21, 0] = opts.image_size[1] - keypts[:21, 0]
850
+ # keypts[:21, 1] = opts.image_size[0] - keypts[:21, 1]
851
+ if keypts[21:, :].sum() != 0:
852
+ keypts[21:, 0] = opts.image_size[1] - keypts[21:, 0]
853
+ # keypts[21:, 1] = opts.image_size[0] - keypts[21:, 1]
854
+ if pose_manual_img is not None:
855
+ pose_manual_img = pose_manual_img[:, ::-1, :]
856
+ manual_kp_right = manual_kp_right[:, ::-1, :]
857
+ manual_kp_left = manual_kp_left[:, ::-1, :]
858
+ return img, pose_img, cond, keypts, pose_manual_img, manual_kp_right, manual_kp_left
859
+
860
+
861
+ def resize_to_full(img):
862
+ img["background"] = cv2.resize(img["background"], (LENGTH, LENGTH))
863
+ img["composite"] = cv2.resize(img["composite"], (LENGTH, LENGTH))
864
+ img["layers"] = [cv2.resize(layer, (LENGTH, LENGTH)) for layer in img["layers"]]
865
+ return img
866
+
867
+
868
+ def clear_all():
869
+ return (
870
+ None,
871
+ None,
872
+ None,
873
+ None,
874
+ None,
875
+ False,
876
+ None,
877
+ None,
878
+ False,
879
+ None,
880
+ None,
881
+ None,
882
+ None,
883
+ None,
884
+ None,
885
+ None,
886
+ 1,
887
+ 42,
888
+ 3.0,
889
+ gr.update(interactive=False),
890
+ []
891
+ )
892
+
893
+
894
+ def fix_clear_all():
895
+ return (
896
+ None,
897
+ None,
898
+ None,
899
+ None,
900
+ None,
901
+ None,
902
+ None,
903
+ None,
904
+ None,
905
+ None,
906
+ None,
907
+ None,
908
+ None,
909
+ None,
910
+ None,
911
+ None,
912
+ None,
913
+ 1,
914
+ # (0,0),
915
+ 42,
916
+ 3.0,
917
+ 10,
918
+ )
919
+
920
+
921
+ def enable_component(image1, image2):
922
+ if image1 is None or image2 is None:
923
+ return gr.update(interactive=False)
924
+ if "background" in image1 and "layers" in image1 and "composite" in image1:
925
+ if (
926
+ image1["background"].sum() == 0
927
+ and (sum([im.sum() for im in image1["layers"]]) == 0)
928
+ and image1["composite"].sum() == 0
929
+ ):
930
+ return gr.update(interactive=False)
931
+ if "background" in image2 and "layers" in image2 and "composite" in image2:
932
+ if (
933
+ image2["background"].sum() == 0
934
+ and (sum([im.sum() for im in image2["layers"]]) == 0)
935
+ and image2["composite"].sum() == 0
936
+ ):
937
+ return gr.update(interactive=False)
938
+ return gr.update(interactive=True)
939
+
940
+
941
+ def set_visible(checkbox, kpts, img_clean, img_pose_right, img_pose_left, done=None, done_info=None):
942
+ if kpts is None:
943
+ kpts = [[], []]
944
+ if "Right hand" not in checkbox:
945
+ kpts[0] = []
946
+ vis_right = img_clean
947
+ update_right = gr.update(visible=False)
948
+ update_r_info = gr.update(visible=False)
949
+ else:
950
+ vis_right = img_pose_right
951
+ update_right = gr.update(visible=True)
952
+ update_r_info = gr.update(visible=True)
953
+
954
+ if "Left hand" not in checkbox:
955
+ kpts[1] = []
956
+ vis_left = img_clean
957
+ update_left = gr.update(visible=False)
958
+ update_l_info = gr.update(visible=False)
959
+ else:
960
+ vis_left = img_pose_left
961
+ update_left = gr.update(visible=True)
962
+ update_l_info = gr.update(visible=True)
963
+
964
+ ret = [
965
+ kpts,
966
+ vis_right,
967
+ vis_left,
968
+ update_right,
969
+ update_right,
970
+ update_right,
971
+ update_left,
972
+ update_left,
973
+ update_left,
974
+ update_r_info,
975
+ update_l_info,
976
+ ]
977
+ if done is not None:
978
+ if not checkbox:
979
+ ret.append(gr.update(visible=False))
980
+ ret.append(gr.update(visible=False))
981
+ else:
982
+ ret.append(gr.update(visible=True))
983
+ ret.append(gr.update(visible=True))
984
+ return tuple(ret)
985
+
986
+ def set_unvisible():
987
+ return (
988
+ gr.update(visible=False),
989
+ gr.update(visible=False),
990
+ gr.update(visible=False),
991
+ gr.update(visible=False),
992
+ gr.update(visible=False),
993
+ gr.update(visible=False),
994
+ gr.update(visible=False),
995
+ gr.update(visible=False),
996
+ gr.update(visible=False),
997
+ gr.update(visible=False),
998
+ gr.update(visible=False),
999
+ gr.update(visible=False)
1000
+ )
1001
+
1002
+ def set_no_hands(decider, component):
1003
+ if decider is None:
1004
+ no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH))
1005
+ return no_hands
1006
+ else:
1007
+ return component
1008
+
1009
+ # def visible_component(decider, component):
1010
+ # if decider is not None:
1011
+ # update_component = gr.update(visible=True)
1012
+ # else:
1013
+ # update_component = gr.update(visible=False)
1014
+ # return update_component
1015
+
1016
+ def unvisible_component(decider, component):
1017
+ if decider is not None:
1018
+ update_component = gr.update(visible=False)
1019
+ else:
1020
+ update_component = gr.update(visible=True)
1021
+ return update_component
1022
+
1023
+ def make_change(decider, state):
1024
+ '''
1025
+ If decider is not None, toggle the state's value; the resulting True/False does not matter, only that the state changes.
1026
+ '''
1027
+ if decider is not None:
1028
+ if state:
1029
+ state = False
1030
+ else:
1031
+ state = True
1032
+ return state
1033
+ else:
1034
+ return state
1035
+
1036
+ LENGTH = 480
1037
+
1038
+ example_ref_imgs = [
1039
+ [
1040
+ "sample_images/sample1.jpg",
1041
+ ],
1042
+ [
1043
+ "sample_images/sample2.jpg",
1044
+ ],
1045
+ [
1046
+ "sample_images/sample3.jpg",
1047
+ ],
1048
+ [
1049
+ "sample_images/sample4.jpg",
1050
+ ],
1051
+ # [
1052
+ # "sample_images/sample5.jpg",
1053
+ # ],
1054
+ [
1055
+ "sample_images/sample6.jpg",
1056
+ ],
1057
+ # [
1058
+ # "sample_images/sample7.jpg",
1059
+ # ],
1060
+ # [
1061
+ # "sample_images/sample8.jpg",
1062
+ # ],
1063
+ # [
1064
+ # "sample_images/sample9.jpg",
1065
+ # ],
1066
+ # [
1067
+ # "sample_images/sample10.jpg",
1068
+ # ],
1069
+ # [
1070
+ # "sample_images/sample11.jpg",
1071
+ # ],
1072
+ # ["pose_images/pose1.jpg"],
1073
+ # ["pose_images/pose2.jpg"],
1074
+ # ["pose_images/pose3.jpg"],
1075
+ # ["pose_images/pose4.jpg"],
1076
+ # ["pose_images/pose5.jpg"],
1077
+ # ["pose_images/pose6.jpg"],
1078
+ # ["pose_images/pose7.jpg"],
1079
+ # ["pose_images/pose8.jpg"],
1080
+ ]
1081
+ example_target_imgs = [
1082
+ # [
1083
+ # "sample_images/sample1.jpg",
1084
+ # ],
1085
+ # [
1086
+ # "sample_images/sample2.jpg",
1087
+ # ],
1088
+ # [
1089
+ # "sample_images/sample3.jpg",
1090
+ # ],
1091
+ # [
1092
+ # "sample_images/sample4.jpg",
1093
+ # ],
1094
+ [
1095
+ "sample_images/sample5.jpg",
1096
+ ],
1097
+ # [
1098
+ # "sample_images/sample6.jpg",
1099
+ # ],
1100
+ # [
1101
+ # "sample_images/sample7.jpg",
1102
+ # ],
1103
+ # [
1104
+ # "sample_images/sample8.jpg",
1105
+ # ],
1106
+ [
1107
+ "sample_images/sample9.jpg",
1108
+ ],
1109
+ [
1110
+ "sample_images/sample10.jpg",
1111
+ ],
1112
+ [
1113
+ "sample_images/sample11.jpg",
1114
+ ],
1115
+ ["pose_images/pose1.jpg"],
1116
+ # ["pose_images/pose2.jpg"],
1117
+ # ["pose_images/pose3.jpg"],
1118
+ # ["pose_images/pose4.jpg"],
1119
+ # ["pose_images/pose5.jpg"],
1120
+ # ["pose_images/pose6.jpg"],
1121
+ # ["pose_images/pose7.jpg"],
1122
+ # ["pose_images/pose8.jpg"],
1123
+ ]
1124
+ fix_example_imgs = [
1125
+ ["bad_hands/1.jpg"], # "bad_hands/1_mask.jpg"],
1126
+ # ["bad_hands/2.jpg"], # "bad_hands/2_mask.jpg"],
1127
+ ["bad_hands/3.jpg"], # "bad_hands/3_mask.jpg"],
1128
+ # ["bad_hands/4.jpg"], # "bad_hands/4_mask.jpg"],
1129
+ ["bad_hands/5.jpg"], # "bad_hands/5_mask.jpg"],
1130
+ ["bad_hands/6.jpg"], # "bad_hands/6_mask.jpg"],
1131
+ ["bad_hands/7.jpg"], # "bad_hands/7_mask.jpg"],
1132
+ # ["bad_hands/8.jpg"], # "bad_hands/8_mask.jpg"],
1133
+ # ["bad_hands/9.jpg"], # "bad_hands/9_mask.jpg"],
1134
+ # ["bad_hands/10.jpg"], # "bad_hands/10_mask.jpg"],
1135
+ # ["bad_hands/11.jpg"], # "bad_hands/11_mask.jpg"],
1136
+ # ["bad_hands/12.jpg"], # "bad_hands/12_mask.jpg"],
1137
+ # ["bad_hands/13.jpg"], # "bad_hands/13_mask.jpg"],
1138
+ ["bad_hands/14.jpg"],
1139
+ ["bad_hands/15.jpg"],
1140
+ ]
1141
+ custom_css = """
1142
+ .gradio-container .examples img {
1143
+ width: 240px !important;
1144
+ height: 240px !important;
1145
+ }
1146
+ """
1147
+
1148
+ _HEADER_ = '''
1149
+ <div style="text-align: center;">
1150
+ <h1><b>FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation</b></h1>
1151
+ <h2 style="color: #777777;">CVPR 2025</h2>
1152
+ <style>
1153
+ .link-spacing {
1154
+ margin-right: 20px;
1155
+ }
1156
+ </style>
1157
+ <p style="font-size: 15px;">
1158
+ <span style="display: inline-block; margin-right: 30px;">Brown University</span>
1159
+ <span style="display: inline-block;">Meta Reality Labs</span>
1160
+ </p>
1161
+ <h3>
1162
+ <a href='https://arxiv.org/abs/2412.02690' target='_blank' class="link-spacing">Paper</a>
1163
+ <a href='https://ivl.cs.brown.edu/research/foundhand.html' target='_blank' class="link-spacing">Project Page</a>
1164
+ <a href='' target='_blank' class="link-spacing">Code</a>
1165
+ <a href='' target='_blank'>Model Weights</a>
1166
+ </h3>
1167
+ <p>Below are two key abilities of our model. First, we can <b>edit hand poses</b> given two hand images: one is the image to edit, and the other provides the target hand pose. Second, we can automatically <b>fix malformed hand images</b>, following a user-provided target hand pose and the area to fix.</p>
1168
+ </div>
1169
+ '''
1170
+
1171
+ _CITE_ = r"""
1172
+ ```
1173
+ @article{chen2024foundhand,
1174
+ title={FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation},
1175
+ author={Chen, Kefan and Min, Chaerin and Zhang, Linguang and Hampali, Shreyas and Keskin, Cem and Sridhar, Srinath},
1176
+ journal={arXiv preprint arXiv:2412.02690},
1177
+ year={2024}
1178
+ }
1179
+ ```
1180
+ """
1181
+
1182
+ with gr.Blocks(css=custom_css, theme="soft") as demo:
1183
+ gr.Markdown(_HEADER_)
1184
+ with gr.Tab("Edit Hand Poses"):
1185
+ ref_img = gr.State(value=None)
1186
+ ref_im_raw = gr.State(value=None)
1187
+ ref_kp_raw = gr.State(value=0)
1188
+ ref_kp_got = gr.State(value=None)
1189
+ dump = gr.State(value=None)
1190
+ ref_cond = gr.State(value=None)
1191
+ ref_manual_cond = gr.State(value=None)
1192
+ ref_auto_cond = gr.State(value=None)
1193
+ keypts = gr.State(value=None)
1194
+ target_img = gr.State(value=None)
1195
+ target_cond = gr.State(value=None)
1196
+ target_keypts = gr.State(value=None)
1197
+ dump = gr.State(value=None)
1198
+ with gr.Row():
1199
+ with gr.Column():
1200
+ gr.Markdown(
1201
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">1. Upload a hand image to edit 📥</p>"""
1202
+ )
1203
+ gr.Markdown(
1204
+ """<p style="text-align: center;">&#9312; Optionally crop the image</p>"""
1205
+ )
1206
+ # gr.Markdown("""<p style="text-align: center;"><br></p>""")
1207
+ ref = gr.ImageEditor(
1208
+ type="numpy",
1209
+ label="Reference",
1210
+ show_label=True,
1211
+ height=LENGTH,
1212
+ width=LENGTH,
1213
+ brush=False,
1214
+ layers=False,
1215
+ crop_size="1:1",
1216
+ )
1217
+ gr.Examples(example_ref_imgs, [ref], examples_per_page=20)
1218
+ gr.Markdown(
1219
+ """<p style="text-align: center;">&#9313; Hit the &quot;Finish Cropping&quot; button to get the hand pose</p>"""
1220
+ )
1221
+ ref_finish_crop = gr.Button(value="Finish Cropping", interactive=False)
1222
+ with gr.Tab("Automatic hand keypoints"):
1223
+ ref_pose = gr.Image(
1224
+ type="numpy",
1225
+ label="Reference Pose",
1226
+ show_label=True,
1227
+ height=LENGTH,
1228
+ width=LENGTH,
1229
+ interactive=False,
1230
+ )
1231
+ ref_use_auto = gr.Button(value="Click here to use automatic keypoints instead of manual ones", interactive=False, visible=True)
1232
+ with gr.Tab("Manual hand keypoints"):
1233
+ ref_manual_checkbox_info = gr.Markdown(
1234
+ """<p style="text-align: center;"><b>Step 1.</b> Tell us whether this image shows the right hand, the left hand, or both.</p>""",
1235
+ visible=True,
1236
+ )
1237
+ ref_manual_checkbox = gr.CheckboxGroup(
1238
+ ["Right hand", "Left hand"],
1239
+ # label="Hand side",
1240
+ # info="Hand pose failed to automatically detected. Now let's enable user-provided hand pose. First of all, please tell us if this is right, left, or both hands",
1241
+ show_label=False,
1242
+ visible=True,
1243
+ interactive=True,
1244
+ )
1245
+ ref_manual_kp_r_info = gr.Markdown(
1246
+ """<p style="text-align: center;"><b>Step 2.</b> Click on the image to provide hand keypoints for the <b>right</b> hand. See \"OpenPose Keypoints Convention\" below for guidance.</p>""",
1247
+ visible=False,
1248
+ )
1249
+ ref_manual_kp_right = gr.Image(
1250
+ type="numpy",
1251
+ label="Keypoint Selection (right hand)",
1252
+ show_label=True,
1253
+ height=LENGTH,
1254
+ width=LENGTH,
1255
+ interactive=False,
1256
+ visible=False,
1257
+ sources=[],
1258
+ )
1259
+ with gr.Row():
1260
+ ref_manual_undo_right = gr.Button(
1261
+ value="Undo", interactive=True, visible=False
1262
+ )
1263
+ ref_manual_reset_right = gr.Button(
1264
+ value="Reset", interactive=True, visible=False
1265
+ )
1266
+ ref_manual_kp_l_info = gr.Markdown(
1267
+ """<p style="text-align: center;"><b>Step 2.</b> Click on the image to provide hand keypoints for the <b>left</b> hand. See \"OpenPose Keypoints Convention\" below for guidance.</p>""",
1268
+ visible=False
1269
+ )
1270
+ ref_manual_kp_left = gr.Image(
1271
+ type="numpy",
1272
+ label="Keypoint Selection (left hand)",
1273
+ show_label=True,
1274
+ height=LENGTH,
1275
+ width=LENGTH,
1276
+ interactive=False,
1277
+ visible=False,
1278
+ sources=[],
1279
+ )
1280
+ with gr.Row():
1281
+ ref_manual_undo_left = gr.Button(
1282
+ value="Undo", interactive=True, visible=False
1283
+ )
1284
+ ref_manual_reset_left = gr.Button(
1285
+ value="Reset", interactive=True, visible=False
1286
+ )
1287
+ ref_manual_done_info = gr.Markdown(
1288
+ """<p style="text-align: center;"><b>Step 3.</b> Hit the \"Done\" button to confirm.</p>""",
1289
+ visible=False,
1290
+ )
1291
+ ref_manual_done = gr.Button(value="Done", interactive=True, visible=False)
1292
+ ref_manual_pose = gr.Image(
1293
+ type="numpy",
1294
+ label="Reference Pose",
1295
+ show_label=True,
1296
+ height=LENGTH,
1297
+ width=LENGTH,
1298
+ interactive=False,
1299
+ visible=False
1300
+ )
1301
+ ref_use_manual = gr.Button(value="Click here to use manual keypoints instead of automatic ones", interactive=True, visible=False)
1302
+ ref_manual_instruct = gr.Markdown(
1303
+ value="""<p style="text-align: left; font-weight: bold; ">OpenPose Keypoints Convention</p>""",
1304
+ visible=True
1305
+ )
1306
+ ref_manual_openpose = gr.Image(
1307
+ value="openpose.png",
1308
+ type="numpy",
1309
+ # label="OpenPose keypoints convention",
1310
+ show_label=False,
1311
+ height=LENGTH // 2,
1312
+ width=LENGTH // 2,
1313
+ interactive=False,
1314
+ visible=True
1315
+ )
1316
+ gr.Markdown(
1317
+ """<p style="text-align: center;">&#9314; Optionally flip the hand</p>"""
1318
+ )
1319
+ ref_flip = gr.Checkbox(
1320
+ value=False, label="Flip Handedness (Reference)", interactive=False
1321
+ )
1322
+ with gr.Column():
1323
+ gr.Markdown(
1324
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">2. Upload a hand image for target hand pose 📥</p>"""
1325
+ )
1326
+ gr.Markdown(
1327
+ """<p style="text-align: center;">&#9312; Optionally crop the image</p>"""
1328
+ )
1329
+ target = gr.ImageEditor(
1330
+ type="numpy",
1331
+ label="Target",
1332
+ show_label=True,
1333
+ height=LENGTH,
1334
+ width=LENGTH,
1335
+ brush=False,
1336
+ layers=False,
1337
+ crop_size="1:1",
1338
+ )
1339
+ gr.Examples(example_target_imgs, [target], examples_per_page=20)
1340
+ gr.Markdown(
1341
+ """<p style="text-align: center;">&#9313; Hit the &quot;Finish Cropping&quot; button to get the hand pose</p>"""
1342
+ )
1343
+ target_finish_crop = gr.Button(
1344
+ value="Finish Cropping", interactive=False
1345
+ )
1346
+ target_pose = gr.Image(
1347
+ type="numpy",
1348
+ label="Target Pose",
1349
+ show_label=True,
1350
+ height=LENGTH,
1351
+ width=LENGTH,
1352
+ interactive=False,
1353
+ )
1354
+ gr.Markdown(
1355
+ """<p style="text-align: center;">&#9314; Optionally flip the hand</p>"""
1356
+ )
1357
+ target_flip = gr.Checkbox(
1358
+ value=False, label="Flip Handedness (Target)", interactive=False
1359
+ )
1360
+ with gr.Column():
1361
+ gr.Markdown(
1362
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">3. Press &quot;Run&quot; to get the edited results 🎯</p>"""
1363
+ )
1364
+ # gr.Markdown(
1365
+ # """<p style="text-align: center;">[NOTE] Run will be enabled after the previous steps have been completed</p>"""
1366
+ # )
1367
+ run = gr.Button(value="Run", interactive=False)
1368
+ gr.Markdown(
1369
+ """<p style="text-align: center;">⚠️ ~20s per generation with RTX3090. ~50s with A100. <br>(For example, if you set Number of generations to 2, it would take around 40s.)</p>"""
1370
+ )
1371
+ results = gr.Gallery(
1372
+ type="numpy",
1373
+ label="Results",
1374
+ show_label=True,
1375
+ height=LENGTH,
1376
+ min_width=LENGTH,
1377
+ columns=MAX_N,
1378
+ interactive=False,
1379
+ preview=True,
1380
+ )
1381
+ results_pose = gr.Gallery(
1382
+ type="numpy",
1383
+ label="Results Pose",
1384
+ show_label=True,
1385
+ height=LENGTH,
1386
+ min_width=LENGTH,
1387
+ columns=MAX_N,
1388
+ interactive=False,
1389
+ preview=True,
1390
+ )
1391
+ gr.Markdown(
1392
+ """<p style="text-align: center;">✨ Hit &quot;Clear&quot; to restart from the beginning</p>"""
1393
+ )
1394
+ clear = gr.ClearButton()
1395
+
1396
+ # gr.Markdown(
1397
+ # """<p style="text-align: left; font-size: 25px;"><b>More options</b></p>"""
1398
+ # )
1399
+ with gr.Tab("More options"):
1400
+ with gr.Row():
1401
+ n_generation = gr.Slider(
1402
+ label="Number of generations",
1403
+ value=1,
1404
+ minimum=1,
1405
+ maximum=MAX_N,
1406
+ step=1,
1407
+ randomize=False,
1408
+ interactive=True,
1409
+ )
1410
+ seed = gr.Slider(
1411
+ label="Seed",
1412
+ value=42,
1413
+ minimum=0,
1414
+ maximum=10000,
1415
+ step=1,
1416
+ randomize=False,
1417
+ interactive=True,
1418
+ )
1419
+ cfg = gr.Slider(
1420
+ label="Classifier free guidance scale",
1421
+ value=2.5,
1422
+ minimum=0.0,
1423
+ maximum=10.0,
1424
+ step=0.1,
1425
+ randomize=False,
1426
+ interactive=True,
1427
+ )
1428
+
1429
+ ref.change(enable_component, [ref, ref], ref_finish_crop)
1430
+ # ref_finish_crop.click(get_ref_anno, [ref], [ref_img, ref_pose, ref_cond])
1431
+ ref_finish_crop.click(prepare_ref_anno, [ref], [ref_im_raw, ref_kp_raw])
1432
+ # ref_kp_raw.change(make_change, [ref_kp_raw, ref_kp_watcher], ref_kp_watcher)
1433
+ # ref_kp_raw.change(set_no_hands, [ref_kp_raw, ref_pose], ref_pose)
1434
+ ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_right)
1435
+ ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_left)
1436
+ # ref_kp_raw.change(unvisible_component, [ref_kp_raw, ref_manual_checkbox], ref_manual_checkbox)
1437
+ # ref_kp_raw.change(unvisible_component, [ref_kp_raw, ref_manual_checkbox_info], ref_manual_checkbox_info)
1438
+ # ref_kp_raw.change(unvisible_component, [ref_kp_raw, ref_manual_openpose], ref_manual_openpose)
1439
+ # ref_kp_raw.change(unvisible_component, [ref_kp_raw, ref_manual_instruct], ref_manual_instruct)
1440
+ # ref_kp_raw.change(lambda x: x, ref_kp_raw, ref_kp_got)
1441
+ ref_manual_checkbox.select(
1442
+ set_visible,
1443
+ [ref_manual_checkbox, ref_kp_got, ref_im_raw, ref_manual_kp_right, ref_manual_kp_left, ref_manual_done],
1444
+ [
1445
+ ref_kp_got,
1446
+ ref_manual_kp_right,
1447
+ ref_manual_kp_left,
1448
+ ref_manual_kp_right,
1449
+ ref_manual_undo_right,
1450
+ ref_manual_reset_right,
1451
+ ref_manual_kp_left,
1452
+ ref_manual_undo_left,
1453
+ ref_manual_reset_left,
1454
+ ref_manual_kp_r_info,
1455
+ ref_manual_kp_l_info,
1456
+ ref_manual_done,
1457
+ ref_manual_done_info
1458
+ ]
1459
+ )
1460
+ ref_manual_kp_right.select(
1461
+ get_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got]
1462
+ )
1463
+ ref_manual_undo_right.click(
1464
+ undo_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got]
1465
+ )
1466
+ ref_manual_reset_right.click(
1467
+ reset_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got]
1468
+ )
1469
+ ref_manual_kp_left.select(
1470
+ get_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got]
1471
+ )
1472
+ ref_manual_undo_left.click(
1473
+ undo_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got]
1474
+ )
1475
+ ref_manual_reset_left.click(
1476
+ reset_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got]
1477
+ )
1478
+ # ref_manual_done.click(lambda x: ~x, ref_kp_watcher, ref_kp_watcher)
1479
+ ref_manual_done.click(get_ref_anno, [ref_im_raw, ref_kp_got], [ref_img, ref_manual_pose, ref_manual_cond])
1480
+ ref_manual_cond.change(lambda x: x, ref_manual_cond, ref_cond)
1481
+ ref_use_manual.click(lambda x: x, ref_manual_cond, ref_cond)
1482
+ ref_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3))
1483
+ ref_manual_done.click(lambda x: gr.update(visible=True), ref_manual_pose, ref_manual_pose)
1484
+ ref_manual_done.click(lambda x: gr.update(visible=True), ref_use_manual, ref_use_manual)
1485
+ ref_manual_pose.change(enable_component, [ref_manual_pose, ref_manual_pose], ref_manual_done)
1486
+ # ref_pose.change(enable_component, [ref_pose, gr.State(value=True)], ref_ok)
+ ref_kp_raw.change(get_ref_anno, [ref_im_raw, ref_kp_raw], [ref_img, ref_pose, ref_auto_cond])
+ ref_auto_cond.change(lambda x: x, ref_auto_cond, ref_cond)
+ ref_use_auto.click(lambda x: x, ref_auto_cond, ref_cond)
+ ref_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Reference'", duration=3))
+ ref_pose.change(enable_component, [ref_kp_raw, ref_pose], ref_use_auto)
+ ref_pose.change(enable_component, [ref_img, ref_pose], ref_flip)
+ ref_manual_pose.change(enable_component, [ref_img, ref_manual_pose], ref_flip)
+ ref_flip.select(
+ flip_hand, [ref, ref_pose, ref_cond, gr.State(value=None), ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left], [ref, ref_pose, ref_cond, dump, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left]
+ )
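+ # Target image wiring: finishing the crop runs get_target_anno to extract the
+ # target pose, conditioning, and keypoints; the flip toggle mirrors them.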
+ target.change(enable_component, [target, target], target_finish_crop)
+ target_finish_crop.click(
+ get_target_anno,
+ [target],
+ [target_img, target_pose, target_cond, target_keypts],
+ )
+ target_pose.change(enable_component, [target_img, target_pose], target_flip)
+ target_flip.select(
+ flip_hand,
+ [target, target_pose, target_cond, target_keypts],
+ [target, target_pose, target_cond, target_keypts],
+ )
+ ref_pose.change(enable_component, [ref_pose, target_pose], run)
+ ref_manual_pose.change(enable_component, [ref_manual_pose, target_pose], run)
+ target_pose.change(enable_component, [ref_pose, target_pose], run)
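+ # "Run" becomes clickable once both a reference pose and a target pose exist,
+ # and triggers generation (sample_diff) with the chosen options.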
+ run.click(
+ sample_diff,
+ [ref_cond, target_cond, target_keypts, n_generation, seed, cfg],
+ [results, results_pose],
+ )
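+ # "Clear" resets every component of this tab and hides the manual-keypoint UI.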
+ clear.click(
+ clear_all,
+ [],
+ [
+ ref,
+ ref_manual_kp_right,
+ ref_manual_kp_left,
+ ref_pose,
+ ref_manual_pose,
+ ref_flip,
+ target,
+ target_pose,
+ target_flip,
+ results,
+ results_pose,
+ ref_img,
+ ref_cond,
+ # mask,
+ target_img,
+ target_cond,
+ target_keypts,
+ n_generation,
+ seed,
+ cfg,
+ ref_kp_raw,
+ ref_manual_checkbox
+ ],
+ )
+ clear.click(
+ set_unvisible,
+ [],
+ [
+ # ref_manual_checkbox,
+ # ref_manual_instruct,
+ # ref_manual_openpose,
+ ref_manual_kp_r_info,
+ ref_manual_kp_l_info,
+ ref_manual_undo_left,
+ ref_manual_undo_right,
+ ref_manual_reset_left,
+ ref_manual_reset_right,
+ ref_manual_done,
+ ref_manual_done_info,
+ ref_manual_pose,
+ ref_use_manual,
+ ref_manual_kp_right,
+ ref_manual_kp_left
+ ]
+ )
+
+ # gr.Markdown("""<p style="font-size: 25px; font-weight: bold;">Examples</p>""")
+ # with gr.Tab("Reference"):
+ # with gr.Row():
+ # gr.Examples(example_imgs, [ref], examples_per_page=20)
+ # with gr.Tab("Target"):
+ # with gr.Row():
+ # gr.Examples(example_imgs, [target], examples_per_page=20)
+ with gr.Tab("Fix Hands"):
+ fix_inpaint_mask = gr.State(value=None)
+ fix_original = gr.State(value=None)
+ fix_img = gr.State(value=None)
+ fix_kpts = gr.State(value=None)
+ fix_kpts_np = gr.State(value=None)
+ fix_ref_cond = gr.State(value=None)
+ fix_target_cond = gr.State(value=None)
+ fix_latent = gr.State(value=None)
+ fix_inpaint_latent = gr.State(value=None)
+ # fix_size_memory = gr.State(value=(0, 0))
+ # gr.Markdown("""<p style="text-align: center; font-size: 25px; font-weight: bold; ">⚠️ Note</p>""")
+ # gr.Markdown("""<p>"Fix Hands" with A100 needs around 6 mins, which is beyond the ZeroGPU quota (5 mins). Please either purchase additional gpus from Hugging Face or wait for us to open-source our code soon so that you can use your own gpus🙏 </p>""")
+ with gr.Row():
+ with gr.Column():
+ # gr.Markdown(
+ # """<p style="text-align: center; font-size: 25px; font-weight: bold; ">1. Image Cropping & Brushing</p>"""
+ # )
+ gr.Markdown(
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">1. Upload a malformed hand image to fix 📥</p>"""
+ )
+ gr.Markdown(
+ """<p style="text-align: center;">&#9312; Optionally crop the image around the hand</p>"""
+ )
+ # gr.Markdown(
+ # """<p style="text-align: center; font-size: 20px; font-weight: bold; ">A. Crop</p>"""
+ # )
+ fix_crop = gr.ImageEditor(
+ type="numpy",
+ sources=["upload", "webcam", "clipboard"],
+ label="Image crop",
+ show_label=True,
+ height=LENGTH,
+ width=LENGTH,
+ layers=False,
+ crop_size="1:1",
+ brush=False,
+ image_mode="RGBA",
+ container=False,
+ )
+ fix_example = gr.Examples(
+ fix_example_imgs,
+ inputs=[fix_crop],
+ examples_per_page=20,
+ )
+ gr.Markdown(
+ """<p style="text-align: center;">&#9313; Brush over the area that needs to be fixed (e.g., a wrong finger). This will serve as the inpaint mask</p>"""
+ )
+ # gr.Markdown(
+ # """<p style="text-align: center; font-size: 20px; font-weight: bold; ">B. Brush</p>"""
+ # )
+ fix_ref = gr.ImageEditor(
+ type="numpy",
+ label="Image brush",
+ sources=(),
+ show_label=True,
+ height=LENGTH,
+ width=LENGTH,
+ layers=False,
+ transforms=("brush"),
+ brush=gr.Brush(
+ colors=["rgb(255, 255, 255)"], default_size=20
+ ), # 204, 50, 50
+ image_mode="RGBA",
+ container=False,
+ interactive=False,
+ )
+ fix_finish_crop = gr.Button(
+ value="Finish Cropping & Brushing", interactive=False
+ )
+ with gr.Column():
+ # gr.Markdown(
+ # """<p style="text-align: center; font-size: 25px; font-weight: bold; ">2. Keypoint Selection</p>"""
+ # )
+ gr.Markdown(
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">2. Click on the hand to specify the target hand pose</p>"""
+ )
+ # gr.Markdown(
+ # """<p style="text-align: center;">On the hand, select 21 keypoints that you hope the output to be. <br>Please see the \"OpenPose keypoints convention\"</p>"""
+ # )
+ gr.Markdown(
+ """<p style="text-align: center;">&#9312; Tell us whether this is a right hand, a left hand, or both</p>"""
+ )
+ fix_checkbox = gr.CheckboxGroup(
+ ["Right hand", "Left hand"],
+ # value=["Right hand", "Left hand"],
+ # label="Hand side",
+ # info="Which side this hand is? Could be both.",
+ show_label=False,
+ interactive=False,
+ )
+ gr.Markdown(
+ """<p style="text-align: center;">&#9313; On the image, click 21 hand keypoints. These will serve as the target hand pose. See the \"OpenPose keypoints convention\" below for guidance.</p>"""
+ )
+ fix_kp_r_info = gr.Markdown(
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">Select keypoints for the right hand only</p>""",
+ visible=False,
+ )
+ fix_kp_right = gr.Image(
+ type="numpy",
+ label="Keypoint Selection (right hand)",
+ show_label=True,
+ height=LENGTH,
+ width=LENGTH,
+ interactive=False,
+ visible=False,
+ sources=[],
+ )
+ with gr.Row():
+ fix_undo_right = gr.Button(
+ value="Undo", interactive=False, visible=False
+ )
+ fix_reset_right = gr.Button(
+ value="Reset", interactive=False, visible=False
+ )
+ fix_kp_l_info = gr.Markdown(
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">Select keypoints for the left hand only</p>""",
+ visible=False
+ )
+ fix_kp_left = gr.Image(
+ type="numpy",
+ label="Keypoint Selection (left hand)",
+ show_label=True,
+ height=LENGTH,
+ width=LENGTH,
+ interactive=False,
+ visible=False,
+ sources=[],
+ )
+ with gr.Row():
+ fix_undo_left = gr.Button(
+ value="Undo", interactive=False, visible=False
+ )
+ fix_reset_left = gr.Button(
+ value="Reset", interactive=False, visible=False
+ )
+ gr.Markdown(
+ """<p style="text-align: left; font-weight: bold; ">OpenPose keypoints convention</p>"""
+ )
+ fix_openpose = gr.Image(
+ value="openpose.png",
+ type="numpy",
+ # label="OpenPose keypoints convention",
+ show_label=False,
+ height=LENGTH // 2,
+ width=LENGTH // 2,
+ interactive=False,
+ )
+ with gr.Column():
+ # gr.Markdown(
+ # """<p style="text-align: center; font-size: 25px; font-weight: bold; ">3. Prepare Mask</p>"""
+ # )
+ gr.Markdown(
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">3. Press &quot;Ready&quot; to start pre-processing</p>"""
+ )
+ fix_ready = gr.Button(value="Ready", interactive=False)
+ # fix_mask_size = gr.Radio(
+ # ["256x256", "latent size (32x32)"],
+ # label="Visualized inpaint mask size",
+ # interactive=False,
+ # value="256x256",
+ # )
+ gr.Markdown(
+ """<p style="text-align: center; font-weight: bold; ">Visualized (256, 256) Inpaint Mask</p>"""
+ )
+ fix_vis_mask32 = gr.Image(
+ type="numpy",
+ label=f"Visualized {opts.latent_size} Inpaint Mask",
+ show_label=True,
+ height=opts.latent_size,
+ width=opts.latent_size,
+ interactive=False,
+ visible=False,
+ )
+ fix_vis_mask256 = gr.Image(
+ type="numpy",
+ # label=f"Visualized {opts.image_size} Inpaint Mask",
+ visible=True,
+ show_label=False,
+ height=opts.image_size,
+ width=opts.image_size,
+ interactive=False,
+ )
+ gr.Markdown(
+ """<p style="text-align: center;">[NOTE] The image above should be the inpaint mask that you brushed, NOT a segmentation mask of the entire hand.</p>"""
+ )
+ with gr.Column():
+ # gr.Markdown(
+ # """<p style="text-align: center; font-size: 25px; font-weight: bold; ">4. Results</p>"""
+ # )
+ gr.Markdown(
+ """<p style="text-align: center; font-size: 20px; font-weight: bold;">4. Press &quot;Run&quot; to get the fixed hand image 🎯</p>"""
+ )
+ fix_run = gr.Button(value="Run", interactive=False)
+ gr.Markdown(
+ """<p style="text-align: center;">⚠️ Each generation takes &gt;3 min and ~24GB of GPU memory</p>"""
+ )
+ fix_result = gr.Gallery(
+ type="numpy",
+ label="Results",
+ show_label=True,
+ height=LENGTH,
+ min_width=LENGTH,
+ columns=FIX_MAX_N,
+ interactive=False,
+ preview=True,
+ )
+ fix_result_pose = gr.Gallery(
+ type="numpy",
+ label="Results Pose",
+ show_label=True,
+ height=LENGTH,
+ min_width=LENGTH,
+ columns=FIX_MAX_N,
+ interactive=False,
+ preview=True,
+ )
+ gr.Markdown(
+ """<p style="text-align: center;">✨ Hit &quot;Clear&quot; to restart from the beginning</p>"""
+ )
+ fix_clear = gr.ClearButton()
+
+ gr.Markdown(
+ """<p style="text-align: left; font-size: 25px;"><b>More options</b></p>"""
+ )
+ gr.Markdown(
+ "⚠️ Currently, setting the number of generations > 1 can lead to out-of-memory errors"
+ )
+ with gr.Row():
+ fix_n_generation = gr.Slider(
+ label="Number of generations",
+ value=1,
+ minimum=1,
+ maximum=FIX_MAX_N,
+ step=1,
+ randomize=False,
+ interactive=True,
+ )
+ fix_seed = gr.Slider(
+ label="Seed",
+ value=42,
+ minimum=0,
+ maximum=10000,
+ step=1,
+ randomize=False,
+ interactive=True,
+ )
+ fix_cfg = gr.Slider(
+ label="Classifier-free guidance scale",
+ value=3.0,
+ minimum=0.0,
+ maximum=10.0,
+ step=0.1,
+ randomize=False,
+ interactive=True,
+ )
+ fix_quality = gr.Slider(
+ label="Quality",
+ value=10,
+ minimum=1,
+ maximum=10,
+ step=1,
+ randomize=False,
+ interactive=True,
+ )
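+ # Event wiring for "Fix Hands": cropping enables the brush editor, brushing
+ # enables "Finish Cropping & Brushing", which extracts the inpaint mask
+ # (get_mask_inpaint), keeps the un-brushed original, and visualizes the
+ # brushed reference (visualize_ref).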
+ fix_crop.change(enable_component, [fix_crop, fix_crop], fix_ref)
+ fix_crop.change(resize_to_full, fix_crop, fix_ref)
+ fix_ref.change(enable_component, [fix_ref, fix_ref], fix_finish_crop)
+ fix_finish_crop.click(get_mask_inpaint, [fix_ref], [fix_inpaint_mask])
+ # fix_finish_crop.click(lambda x: x["background"], [fix_ref], [fix_kp_right])
+ # fix_finish_crop.click(lambda x: x["background"], [fix_ref], [fix_kp_left])
+ fix_finish_crop.click(lambda x: x["background"], [fix_crop], [fix_original])
+ fix_finish_crop.click(visualize_ref, [fix_crop, fix_ref], [fix_img])
+ fix_img.change(lambda x: x, [fix_img], [fix_kp_right])
+ fix_img.change(lambda x: x, [fix_img], [fix_kp_left])
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_checkbox
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_right
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_right
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_right
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_left
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_left
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_left
+ )
+ fix_inpaint_mask.change(
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_ready
+ )
+ # fix_inpaint_mask.change(
+ # enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_run
+ # )
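+ # Selecting the hand side reveals the corresponding keypoint canvases; clicks
+ # add keypoints (get_kps) and "Undo"/"Reset" edit them, as in the reference tab.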
+ fix_checkbox.select(
+ set_visible,
+ [fix_checkbox, fix_kpts, fix_img, fix_kp_right, fix_kp_left],
+ [
+ fix_kpts,
+ fix_kp_right,
+ fix_kp_left,
+ fix_kp_right,
+ fix_undo_right,
+ fix_reset_right,
+ fix_kp_left,
+ fix_undo_left,
+ fix_reset_left,
+ fix_kp_r_info,
+ fix_kp_l_info,
+ ],
+ )
+ fix_kp_right.select(
+ get_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts]
+ )
+ fix_undo_right.click(
+ undo_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts]
+ )
+ fix_reset_right.click(
+ reset_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts]
+ )
+ fix_kp_left.select(
+ get_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts]
+ )
+ fix_undo_left.click(
+ undo_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts]
+ )
+ fix_reset_left.click(
+ reset_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts]
+ )
+ # fix_kpts.change(check_keypoints, [fix_kpts], [fix_kp_right, fix_kp_left, fix_run])
+ # fix_run.click(lambda x:gr.update(value=None), [], [fix_result, fix_result_pose])
+ fix_vis_mask32.change(
+ enable_component, [fix_vis_mask32, fix_vis_mask256], fix_run
+ )
+ # fix_vis_mask32.change(
+ # enable_component, [fix_vis_mask32, fix_vis_mask256], fix_mask_size
+ # )
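+ # "Ready" pre-computes everything sampling needs: reference/target conditioning,
+ # the image latent, the inpainting latent, the keypoint array, and the latent-size
+ # and full-size visualizations of the inpaint mask.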
+ fix_ready.click(
+ ready_sample,
+ [fix_original, fix_inpaint_mask, fix_kpts],
+ [
+ fix_ref_cond,
+ fix_target_cond,
+ fix_latent,
+ fix_inpaint_latent,
+ fix_kpts_np,
+ fix_vis_mask32,
+ fix_vis_mask256,
+ ],
+ )
+ # fix_mask_size.select(
+ # switch_mask_size, [fix_mask_size], [fix_vis_mask32, fix_vis_mask256]
+ # )
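+ # "Run" becomes clickable once the mask visualizations exist and calls
+ # sample_inpaint with the pre-computed latents and the generation options.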
+ fix_run.click(
+ sample_inpaint,
+ [
+ fix_ref_cond,
+ fix_target_cond,
+ fix_latent,
+ fix_inpaint_latent,
+ fix_kpts_np,
+ fix_n_generation,
+ fix_seed,
+ fix_cfg,
+ fix_quality,
+ ],
+ [fix_result, fix_result_pose],
+ )
+ fix_clear.click(
+ fix_clear_all,
+ [],
+ [
+ fix_crop,
+ fix_ref,
+ fix_kp_right,
+ fix_kp_left,
+ fix_result,
+ fix_result_pose,
+ fix_inpaint_mask,
+ fix_original,
+ fix_img,
+ fix_vis_mask32,
+ fix_vis_mask256,
+ fix_kpts,
+ fix_kpts_np,
+ fix_ref_cond,
+ fix_target_cond,
+ fix_latent,
+ fix_inpaint_latent,
+ fix_n_generation,
+ # fix_size_memory,
+ fix_seed,
+ fix_cfg,
+ fix_quality,
+ ],
+ )
+
+ # gr.Markdown("""<p style="font-size: 25px; font-weight: bold;">Examples</p>""")
+ # fix_dump_ex = gr.Image(value=None, label="Original Image", visible=False)
+ # fix_dump_ex_masked = gr.Image(value=None, label="After Brushing", visible=False)
+ # with gr.Column():
+ # fix_example = gr.Examples(
+ # fix_example_imgs,
+ # # run_on_click=True,
+ # # fn=parse_fix_example,
+ # # inputs=[fix_dump_ex, fix_dump_ex_masked],
+ # # outputs=[fix_original, fix_ref, fix_img, fix_inpaint_mask],
+ # inputs=[fix_crop],
+ # examples_per_page=20,
+ # )
+
+ gr.Markdown("<h1>Citation</h1>")
+ gr.Markdown(
+ """<p style="text-align: left;">If this was useful, please cite us! ❤️</p>"""
+ )
+ gr.Markdown(_CITE_)
+
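+ # Launch with request queueing enabled; a shareable URL is created and the app
+ # is served on all interfaces at port 7739.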
+ print("Ready to launch..")
+ _, _, shared_url = demo.queue().launch(
+ share=True, server_name="0.0.0.0", server_port=7739
+ )
+ # demo.launch(share=True)
no_hands.png ADDED

Git LFS Details

  • SHA256: 80a0c25741912005bc5619e9965d89cfa28832a898292320a8929bdb64a95229
  • Pointer size: 129 Bytes
  • Size of remote file: 9.31 kB