import gradio as gr
import numpy as np
import open_clip
import torch
import torchvision
from PIL import Image, ImageColor
from tqdm import tqdm

# My back_task code being imported (provides device, scheduler, image_pipe,
# clip_model, color_loss, clip_loss, n_cuts, ...)
from back_task import *


# The function that does the hard work
def generate(radio, color, prompt, guidance_loss_scale):
    print(color)  # Debug: log the selected colour
    if radio == "color guidance":
        target_color = ImageColor.getcolor(color, "RGB")  # Target color as RGB
        target_color = [a / 255 for a in target_color]  # Rescale from (0, 255) to (0, 1)
    elif radio == "text guidance":
        # We embed a prompt with CLIP as our target
        text = open_clip.tokenize([prompt]).to(device)
        with torch.no_grad(), torch.cuda.amp.autocast():
            text_features = clip_model.encode_text(text)

    x = torch.randn(1, 3, 256, 256).to(device)
    for i, t in tqdm(enumerate(scheduler.timesteps)):
        model_input = scheduler.scale_model_input(x, t)
        with torch.no_grad():
            noise_pred = image_pipe.unet(model_input, t)["sample"]

        if radio == "color guidance":
            x = x.detach().requires_grad_()
            x0 = scheduler.step(noise_pred, t, x).pred_original_sample
            loss = color_loss(x0, target_color) * guidance_loss_scale
            cond_grad = -torch.autograd.grad(loss, x)[0]
            x = x.detach() + cond_grad
        elif radio == "text guidance":
            cond_grad = 0
            for cut in range(n_cuts):
                # Set requires grad on x
                x = x.detach().requires_grad_()
                # Get the predicted x0:
                x0 = scheduler.step(noise_pred, t, x).pred_original_sample
                # Calculate loss
                loss = clip_loss(x0, text_features) * guidance_loss_scale
                # Get gradient (scale by n_cuts since we want the average)
                cond_grad -= torch.autograd.grad(loss, x)[0] / n_cuts
            # Modify x based on this gradient
            alpha_bar = scheduler.alphas_cumprod[i]
            x = x.detach() + cond_grad * alpha_bar.sqrt()  # Note the additional scaling factor here!

        x = scheduler.step(noise_pred, t, x).prev_sample

    grid = torchvision.utils.make_grid(x, nrow=4)
    im = grid.permute(1, 2, 0).cpu().clip(-1, 1) * 0.5 + 0.5
    im = Image.fromarray(np.array(im * 255).astype(np.uint8))
    # im.save("test.jpeg")
    return im
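
# ---------------------------------------------------------------------------
# The loss functions used above come from back_task. For reference, here is a
# minimal sketch of what they are assumed to look like, based on the guidance
# notebook from the HF diffusion course. The `_example_*` names are
# hypothetical; the real implementations live in back_task and may differ.
# ---------------------------------------------------------------------------
_example_tfms = torchvision.transforms.Compose(
    [
        torchvision.transforms.RandomResizedCrop(224),  # CLIP-sized random crops ("cuts")
        torchvision.transforms.RandomHorizontalFlip(),
    ]
)


def _example_color_loss(images, target_color):
    # Mean absolute error between every pixel and the target colour;
    # the target is first mapped from (0, 1) to the model's (-1, 1) range.
    target = torch.tensor(target_color).to(images.device) * 2 - 1
    target = target[None, :, None, None]  # Broadcast against (B, C, H, W)
    return torch.abs(images - target).mean()


def _example_clip_loss(image, text_features):
    # Squared great-circle distance between the CLIP embeddings of augmented
    # image crops and the (precomputed) prompt embedding.
    image_features = clip_model.encode_image(_example_tfms(image))
    input_normed = torch.nn.functional.normalize(image_features.unsqueeze(1), dim=2)
    embed_normed = torch.nn.functional.normalize(text_features.unsqueeze(0), dim=2)
    dists = input_normed.sub(embed_normed).norm(dim=2).div(2).arcsin().pow(2).mul(2)
    return dists.mean()
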
title = """
Try-out of an exercise from HF Learn [Diffusion Course]
😅 Inference is very, very slow 🐌 since I am using HF's free CPU 😉