# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

import os
import sys
import time
import subprocess

from cog import BasePredictor, Input, Path

import cv2
import torch
import numpy as np
from PIL import Image
from diffusers.utils import load_image
from diffusers.models import ControlNetModel
from insightface.app import FaceAnalysis

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from pipeline_stable_diffusion_xl_instantid import (
    StableDiffusionXLInstantIDPipeline,
    draw_kps,
)

# for `ip-adapter`, `ControlNetModel`, and `stable-diffusion-xl-base-1.0`
CHECKPOINTS_CACHE = "./checkpoints"
CHECKPOINTS_URL = (
    "https://weights.replicate.delivery/default/InstantID/checkpoints.tar"
)

# for `models/antelopev2`
MODELS_CACHE = "./models"
MODELS_URL = "https://weights.replicate.delivery/default/InstantID/models.tar"


def resize_img(
    input_image,
    max_side=1280,
    min_side=1024,
    size=None,
    pad_to_max_side=False,
    mode=Image.BILINEAR,
    base_pixel_number=64,
):
    # Resize so the short side is at least min_side and the long side is at
    # most max_side, then snap both dimensions down to multiples of
    # base_pixel_number (SDXL expects dimensions divisible by 64).
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        # Center the resized image on a white max_side x max_side canvas.
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[
            offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
        ] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        if not os.path.exists(CHECKPOINTS_CACHE):
            download_weights(CHECKPOINTS_URL, CHECKPOINTS_CACHE)

        if not os.path.exists(MODELS_CACHE):
            download_weights(MODELS_URL, MODELS_CACHE)

        self.width, self.height = 640, 640
        self.app = FaceAnalysis(
            name="antelopev2",
            root="./",
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        self.app.prepare(ctx_id=0, det_size=(self.width, self.height))

        # Paths to the InstantID models
        face_adapter = "./checkpoints/ip-adapter.bin"
        controlnet_path = "./checkpoints/ControlNetModel"

        # Load pipeline
        self.controlnet = ControlNetModel.from_pretrained(
            controlnet_path,
            torch_dtype=torch.float16,
            cache_dir=CHECKPOINTS_CACHE,
            local_files_only=True,
        )

        base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
        self.pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
            base_model_path,
            controlnet=self.controlnet,
            torch_dtype=torch.float16,
            cache_dir=CHECKPOINTS_CACHE,
            local_files_only=True,
        )
        self.pipe.cuda()
        self.pipe.load_ip_adapter_instantid(face_adapter)

    def predict(
        self,
        image: Path = Input(description="Input image"),
        prompt: str = Input(
            description="Input prompt",
            default="analog film photo of a man. faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage, masterpiece, best quality",
        ),
        negative_prompt: str = Input(
            description="Input negative prompt",
            default="",
        ),
        width: int = Input(
            description="Width of output image",
            default=640,
            ge=512,
            le=2048,
        ),
        height: int = Input(
            description="Height of output image",
            default=640,
            ge=512,
            le=2048,
        ),
        ip_adapter_scale: float = Input(
            description="Scale for IP adapter",
            default=0.8,
            ge=0,
            le=1,
        ),
        controlnet_conditioning_scale: float = Input(
            description="Scale for ControlNet conditioning",
            default=0.8,
            ge=0,
            le=1,
        ),
        num_inference_steps: int = Input(
            description="Number of denoising steps",
            default=30,
            ge=1,
            le=500,
        ),
        guidance_scale: float = Input(
            description="Scale for classifier-free guidance",
            default=5,
            ge=1,
            le=50,
        ),
    ) -> Path:
        """Run a single prediction on the model"""
        # Re-run face-detector preparation only when the requested size changes.
        if self.width != width or self.height != height:
            print(f"[!] Resizing output to {width}x{height}")
            self.width = width
            self.height = height
            self.app.prepare(ctx_id=0, det_size=(self.width, self.height))

        face_image = load_image(str(image))
        face_image = resize_img(face_image)

        # Detect faces and keep only the largest one by bounding-box area.
        face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
        face_info = sorted(
            face_info,
            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
            reverse=True,
        )[0]
        face_emb = face_info["embedding"]
        face_kps = draw_kps(face_image, face_info["kps"])

        self.pipe.set_ip_adapter_scale(ip_adapter_scale)
        image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image_embeds=face_emb,
            image=face_kps,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]

        output_path = "result.jpg"
        image.save(output_path)
        return Path(output_path)
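

# --- Local smoke test ----------------------------------------------------
# A minimal sketch of exercising the predictor outside Cog's HTTP server.
# It assumes a CUDA GPU, that the weight tarballs above have already been
# fetched, and an example image at ./examples/face.jpg (a hypothetical path).
# In normal use Cog drives setup()/predict() itself, e.g.:
#   cog predict -i image=@examples/face.jpg -i prompt="a watercolor portrait"
if __name__ == "__main__":
    predictor = Predictor()
    predictor.setup()
    result = predictor.predict(
        image=Path("examples/face.jpg"),
        prompt="analog film photo of a man",
        negative_prompt="",
        width=640,
        height=640,
        ip_adapter_scale=0.8,
        controlnet_conditioning_scale=0.8,
        num_inference_steps=30,
        guidance_scale=5,
    )
    print("saved to:", result)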