# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

import os
import sys
import time
import subprocess
from cog import BasePredictor, Input, Path
import cv2
import torch
import numpy as np
from PIL import Image
from diffusers.utils import load_image
from diffusers.models import ControlNetModel
from insightface.app import FaceAnalysis
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from pipeline_stable_diffusion_xl_instantid import (
StableDiffusionXLInstantIDPipeline,
draw_kps,
)


# for `ip-adapter`, `ControlNetModel`, and `stable-diffusion-xl-base-1.0`
CHECKPOINTS_CACHE = "./checkpoints"
CHECKPOINTS_URL = (
"https://weights.replicate.delivery/default/InstantID/checkpoints.tar"
)

# for `models/antelopev2`
MODELS_CACHE = "./models"
MODELS_URL = "https://weights.replicate.delivery/default/InstantID/models.tar"


def resize_img(
input_image,
max_side=1280,
min_side=1024,
size=None,
pad_to_max_side=False,
mode=Image.BILINEAR,
base_pixel_number=64,
):
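    """Resize to SDXL-friendly dimensions: scale the image relative to
    min_side/max_side, then snap both sides down to multiples of
    base_pixel_number. If `size` is given it is used directly;
    `pad_to_max_side` centers the result on a white max_side x max_side
    canvas."""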
w, h = input_image.size
if size is not None:
w_resize_new, h_resize_new = size
else:
ratio = min_side / min(h, w)
w, h = round(ratio * w), round(ratio * h)
ratio = max_side / max(h, w)
input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
input_image = input_image.resize([w_resize_new, h_resize_new], mode)
if pad_to_max_side:
res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
offset_x = (max_side - w_resize_new) // 2
offset_y = (max_side - h_resize_new) // 2
res[
offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
] = np.array(input_image)
input_image = Image.fromarray(res)
return input_image


def download_weights(url, dest):
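    """Download `url` to `dest` with pget; `-x` extracts the tarball in place."""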
start = time.time()
print("downloading url: ", url)
print("downloading to: ", dest)
subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
def setup(self) -> None:
"""Load the model into memory to make running multiple predictions efficient"""
if not os.path.exists(CHECKPOINTS_CACHE):
download_weights(CHECKPOINTS_URL, CHECKPOINTS_CACHE)
if not os.path.exists(MODELS_CACHE):
download_weights(MODELS_URL, MODELS_CACHE)
self.width, self.height = 640, 640
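        # antelopev2 bundles InsightFace's detection and recognition models;
        # the recognition model produces the identity embedding InstantID needs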
self.app = FaceAnalysis(
name="antelopev2",
root="./",
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
self.app.prepare(ctx_id=0, det_size=(self.width, self.height))
# Path to InstantID models
        face_adapter = "./checkpoints/ip-adapter.bin"
        controlnet_path = "./checkpoints/ControlNetModel"
# Load pipeline
self.controlnet = ControlNetModel.from_pretrained(
controlnet_path,
torch_dtype=torch.float16,
cache_dir=CHECKPOINTS_CACHE,
local_files_only=True,
)
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
self.pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
base_model_path,
controlnet=self.controlnet,
torch_dtype=torch.float16,
cache_dir=CHECKPOINTS_CACHE,
local_files_only=True,
)
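        # Move the pipeline to the GPU and attach the InstantID IP-Adapter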
self.pipe.cuda()
self.pipe.load_ip_adapter_instantid(face_adapter)

    def predict(
self,
image: Path = Input(description="Input image"),
prompt: str = Input(
description="Input prompt",
default="analog film photo of a man. faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage, masterpiece, best quality",
),
negative_prompt: str = Input(
description="Input Negative Prompt",
default="",
),
width: int = Input(
description="Width of output image",
default=640,
ge=512,
le=2048,
),
height: int = Input(
description="Height of output image",
default=640,
ge=512,
le=2048,
),
ip_adapter_scale: float = Input(
description="Scale for IP adapter",
default=0.8,
ge=0,
le=1,
),
controlnet_conditioning_scale: float = Input(
description="Scale for ControlNet conditioning",
default=0.8,
ge=0,
le=1,
),
num_inference_steps: int = Input(
description="Number of denoising steps",
default=30,
ge=1,
le=500,
),
guidance_scale: float = Input(
description="Scale for classifier-free guidance",
default=5,
ge=1,
le=50,
),
) -> Path:
"""Run a single prediction on the model"""
if self.width != width or self.height != height:
print(f"[!] Resizing output to {width}x{height}")
self.width = width
self.height = height
self.app.prepare(ctx_id=0, det_size=(self.width, self.height))
face_image = load_image(str(image))
face_image = resize_img(face_image)
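        # Detect faces; InsightFace expects BGR, hence the RGB -> BGR conversion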
face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
        if len(face_info) == 0:
            raise ValueError("No face detected in the input image")
        # only use the largest detected face (by bounding-box area)
        face_info = max(
            face_info,
            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
        )
face_emb = face_info["embedding"]
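        # Render the five facial keypoints; this image is the ControlNet condition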
face_kps = draw_kps(face_image, face_info["kps"])
self.pipe.set_ip_adapter_scale(ip_adapter_scale)
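        # Generate: the identity embedding drives the IP-Adapter cross-attention,
        # while the keypoint image drives the ControlNet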
        output_image = self.pipe(
prompt=prompt,
negative_prompt=negative_prompt,
image_embeds=face_emb,
image=face_kps,
controlnet_conditioning_scale=controlnet_conditioning_scale,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
).images[0]
output_path = "result.jpg"
        output_image.save(output_path)
return Path(output_path)
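
# Example invocation (illustrative; assumes a face image at ./face.jpg):
#   cog predict -i image=@face.jpg -i prompt="analog film photo of a man"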