Spaces: Build error

RamAnanth1 committed · b6c5945
1 parent: 9d3a8c0

Update app.py

app.py CHANGED
@@ -20,6 +20,8 @@ from langchain.llms.openai import OpenAI
 import re
 import uuid
 from diffusers import StableDiffusionInpaintPipeline
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers import UniPCMultistepScheduler
 from PIL import Image
 import numpy as np
 from omegaconf import OmegaConf
@@ -28,16 +30,6 @@ import cv2
 import einops
 from pytorch_lightning import seed_everything
 import random
-from ldm.util import instantiate_from_config
-from ControlNet.cldm.model import create_model, load_state_dict
-from ControlNet.cldm.ddim_hacked import DDIMSampler
-from ControlNet.annotator.canny import CannyDetector
-from ControlNet.annotator.mlsd import MLSDdetector
-from ControlNet.annotator.util import HWC3, resize_image
-from ControlNet.annotator.hed import HEDdetector, nms
-from ControlNet.annotator.openpose import OpenposeDetector
-from ControlNet.annotator.uniformer import UniformerDetector
-from ControlNet.annotator.midas import MidasDetector

 VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

@@ -223,7 +215,6 @@ class ImageCaptioning:
 class image2canny:
     def __init__(self):
         print("Direct detect canny.")
-        self.detector = CannyDetector()
         self.low_thresh = 100
         self.high_thresh = 200

@@ -231,558 +222,58 @@ class image2canny:
         print("===>Starting image2canny Inference")
         image = Image.open(inputs)
         image = np.array(image)
-
-
-        image =
+
+        image = cv2.Canny(image, low_threshold, high_threshold)
+        image = image[:, :, None]
+        image = np.concatenate([image, image, image], axis=2)
+        canny_image = Image.fromarray(image)
         updated_image_path = get_new_image_name(inputs, func_name="edge")
-
+        canny_image.save(updated_image_path)
         return updated_image_path

 class canny2image:
     def __init__(self, device):
         print("Initialize the canny2image model.")
-
-
-
-
-
-        self.
-
-
-        self.
-
-
-
-        self.
-
-        self.
-
+        low_threshold = 100
+        high_threshold = 200
+
+        # Models
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
+        )
+        self.pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+        # This command loads the individual model components on GPU on-demand. So, we don't
+        # need to explicitly call pipe.to("cuda").
+        self.pipe.enable_model_cpu_offload()
+
+        self.pipe.enable_xformers_memory_efficient_attention()
+
+        # Generator seed,
+        self.generator = torch.manual_seed(0)
+
+
+    def get_canny_filter(self,image):
+        if not isinstance(image, np.ndarray):
+            image = np.array(image)
+        image = cv2.Canny(image, low_threshold, high_threshold)
+        image = image[:, :, None]
+        image = np.concatenate([image, image, image], axis=2)
+        canny_image = Image.fromarray(image)
+        return canny_image
+
     def inference(self, inputs):
         print("===>Starting canny2image Inference")
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        image = 255 - image
         prompt = instruct_text
-
-
-
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        canny_image = self.get_canny_filter(image)
+        output = self.pipe(prompt,canny_image,generator=self.generator,num_images_per_prompt=1,num_inference_steps=20)
+
         updated_image_path = get_new_image_name(image_path, func_name="canny2image")
-        real_image = Image.fromarray(
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-class image2line:
-    def __init__(self):
-        print("Direct detect straight line...")
-        self.detector = MLSDdetector()
-        self.value_thresh = 0.1
-        self.dis_thresh = 0.1
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2hough Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        hough = self.detector(resize_image(image, self.resolution), self.value_thresh, self.dis_thresh)
-        updated_image_path = get_new_image_name(inputs, func_name="line-of")
-        hough = 255 - cv2.dilate(hough, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
-        image = Image.fromarray(hough)
-        image.save(updated_image_path)
-        return updated_image_path
-
-
-class line2image:
-    def __init__(self, device):
-        print("Initialize the line2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_mlsd.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting line2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        image = 255 - image
-        prompt = instruct_text
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).\
-            cpu().numpy().clip(0,255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="line2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-
-class image2hed:
-    def __init__(self):
-        print("Direct detect soft HED boundary...")
-        self.detector = HEDdetector()
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2hed Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        hed = self.detector(resize_image(image, self.resolution))
-        updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
-        image = Image.fromarray(hed)
-        image.save(updated_image_path)
-        return updated_image_path
-
-
-class hed2image:
-    def __init__(self, device):
-        print("Initialize the hed2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_hed.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting hed2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        prompt = instruct_text
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="hed2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-class image2scribble:
-    def __init__(self):
-        print("Direct detect scribble.")
-        self.detector = HEDdetector()
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2scribble Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        detected_map = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
-        image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
-        detected_map = nms(detected_map, 127, 3.0)
-        detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
-        detected_map[detected_map > 4] = 255
-        detected_map[detected_map < 255] = 0
-        detected_map = 255 - detected_map
-        updated_image_path = get_new_image_name(inputs, func_name="scribble")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
-        return updated_image_path
-
-class scribble2image:
-    def __init__(self, device):
-        print("Initialize the scribble2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_scribble.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting scribble2image Inference")
-        print(f'sketch device {self.device}')
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        prompt = instruct_text
-        image = 255 - image
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-class image2pose:
-    def __init__(self):
-        print("Direct human pose.")
-        self.detector = OpenposeDetector()
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2pose Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        detected_map, _ = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
-        image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
-        updated_image_path = get_new_image_name(inputs, func_name="human-pose")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
-        return updated_image_path
-
-class pose2image:
-    def __init__(self, device):
-        print("Initialize the pose2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_openpose.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting pose2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        prompt = instruct_text
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [ self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="pose2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-class image2seg:
-    def __init__(self):
-        print("Direct segmentations.")
-        self.detector = UniformerDetector()
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2seg Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        detected_map = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
-        image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
-        updated_image_path = get_new_image_name(inputs, func_name="segmentation")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
-        return updated_image_path
-
-class seg2image:
-    def __init__(self, device):
-        print("Initialize the seg2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_seg.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting seg2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        prompt = instruct_text
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="segment2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-class image2depth:
-    def __init__(self):
-        print("Direct depth estimation.")
-        self.detector = MidasDetector()
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2depth Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        detected_map, _ = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
-        image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
-        updated_image_path = get_new_image_name(inputs, func_name="depth")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
-        return updated_image_path
-
-class depth2image:
-    def __init__(self, device):
-        print("Initialize depth2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_depth.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting depth2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        prompt = instruct_text
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [ self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="depth2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-
-class image2normal:
-    def __init__(self):
-        print("Direct normal estimation.")
-        self.detector = MidasDetector()
-        self.resolution = 512
-        self.bg_threshold = 0.4
-
-    def inference(self, inputs):
-        print("===>Starting image2 normal Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        _, detected_map = self.detector(resize_image(image, self.resolution), bg_th=self.bg_threshold)
-        detected_map = HWC3(detected_map)
-        image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
-        updated_image_path = get_new_image_name(inputs, func_name="normal-map")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
-        return updated_image_path
-
-class normal2image:
-    def __init__(self, device):
-        print("Initialize normal2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_normal.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting normal2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        prompt = instruct_text
-        img = image[:, :, ::-1].copy()
-        img = resize_image(HWC3(img), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="normal2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
+        real_image = Image.fromarray(output.images[0]) # get default the index0 image
         real_image.save(updated_image_path)
         return updated_image_path

@@ -961,4 +452,4 @@ with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
     clear.click(bot.memory.clear)
     clear.click(lambda: [], None, chatbot)
     clear.click(lambda: [], None, state)
-demo.launch()
+demo.launch()
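For reference, here is a minimal, self-contained sketch of the diffusers ControlNet flow that the rewritten canny2image class wires up above. The model IDs, Canny thresholds, scheduler, and CPU-offload call mirror the diff; the prompt string and file paths are placeholders, and it assumes the accelerate package is installed so enable_model_cpu_offload() is available.

import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler

low_threshold, high_threshold = 100, 200

# Build the canny control image the same way get_canny_filter() does:
# an edge map replicated to three channels and wrapped as a PIL image.
image = np.array(Image.open("input.png"))  # placeholder input path
edges = cv2.Canny(image, low_threshold, high_threshold)
canny_image = Image.fromarray(np.concatenate([edges[:, :, None]] * 3, axis=2))

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()  # streams components to the GPU on demand instead of pipe.to("cuda")

generator = torch.manual_seed(0)
output = pipe("a placeholder prompt", canny_image, generator=generator,
              num_images_per_prompt=1, num_inference_steps=20)
output.images[0].save("output.png")

Note that output.images[0] is already a PIL.Image, so it can be saved directly without an Image.fromarray() round trip.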