Spaces:

snap-research
/

weights2weights

Running on Zero

App Files Community

amildravid4292 committed on Jul 20, 2024

Commit

9935195

verified ·

1 Parent(s): d169e6d

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -178

app.py CHANGED Viewed

@@ -88,166 +88,166 @@ class main():
         thick = debias(thick, "Heavy_Makeup", df, pinverse, device)
         self.thick = thick
-        def sample_model(self):
-            self.unet, _, _, _, _ = load_models(self.device)
-            self.network = sample_weights(self.unet, self.proj, self.mean, self.std, self.v[:, :1000], self.device, factor = 1.00)
-        @torch.no_grad()
-        @spaces.GPU
-        def inference(self, prompt, negative_prompt, guidance_scale, ddim_steps, seed):
-            generator = torch.Generator(device=device).manual_seed(seed)
-            latents = torch.randn(
                 (1, self.unet.in_channels, 512 // 8, 512 // 8),
                 generator = generator,
                 device = self.device
             ).bfloat16()
-            text_input = self.tokenizer(prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
-            text_embeddings = self.text_encoder(text_input.input_ids.to(device))[0]
-            max_length = text_input.input_ids.shape[-1]
-            uncond_input = self.tokenizer(
                                     [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                                 )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-            self.noise_scheduler.set_timesteps(ddim_steps)
-            latents = latents * self.noise_scheduler.init_noise_sigma
-            for i,t in enumerate(tqdm.tqdm(self.noise_scheduler.timesteps)):
-                latent_model_input = torch.cat([latents] * 2)
-                latent_model_input = self.noise_scheduler.scale_model_input(latent_model_input, timestep=t)
-                with self.network:
-                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
                 #guidance
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-                latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
-            latents = 1 / 0.18215 * latents
-            image = self.vae.decode(latents).sample
-            image = (image / 2 + 0.5).clamp(0, 1)
-            image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
-            image = Image.fromarray((image * 255).round().astype("uint8"))
-            return image
-        @torch.no_grad()
-        @spaces.GPU
-        def edit_inference(self, prompt, negative_prompt, guidance_scale, ddim_steps, seed, start_noise, a1, a2, a3, a4):
-            original_weights = self,network.proj.clone()
-            #pad to same number of PCs
-            pcs_original = original_weights.shape[1]
-            pcs_edits = self.young.shape[1]
-            padding =  torch.zeros((1,pcs_original-pcs_edits)).to(device)
-            young_pad = torch.cat((self.young, padding), 1)
-            pointy_pad = torch.cat((self.pointy, padding), 1)
-            wavy_pad = torch.cat((self.wavy, padding), 1)
-            thick_pad = torch.cat((self.thick, padding), 1)
-            edited_weights = original_weights+a1*1e6*young_pad+a2*1e6*pointy_pad+a3*1e6*wavy_pad+a4*2e6*thick_pad
-            generator = torch.Generator(device=device).manual_seed(seed)
-            latents = torch.randn(
                 (1, self.unet.in_channels, 512 // 8, 512 // 8),
                 generator = generator,
                 device = self.device
             ).bfloat16()
-            text_input = self.tokenizer(prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
-            text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
-            max_length = text_input.input_ids.shape[-1]
-            uncond_input = tokenizer(
                                     [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                                 )
-            uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-            noise_scheduler.set_timesteps(ddim_steps)
-            latents = latents * noise_scheduler.init_noise_sigma
-            for i,t in enumerate(tqdm.tqdm(self.noise_scheduler.timesteps)):
-                latent_model_input = torch.cat([latents] * 2)
-                latent_model_input = self.noise_scheduler.scale_model_input(latent_model_input, timestep=t)
-                if t>start_noise:
-                    pass
-                elif t<=start_noise:
-                    self.network.proj = torch.nn.Parameter(edited_weights)
-                    self.network.reset()
-                with self.network:
-                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
-                #guidance
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-                latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
-            latents = 1 / 0.18215 * latents
-            image = self.vae.decode(latents).sample
-            image = (image / 2 + 0.5).clamp(0, 1)
-            image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
-            image = Image.fromarray((image * 255).round().astype("uint8"))
-            #reset weights back to original
-            self.network.proj = torch.nn.Parameter(original_weights)
-            self.network.reset()
-            return image
-        @spaces.GPU
-        def sample_then_run(self):
-            sample_model()
-            prompt = "sks person"
-            negative_prompt = "low quality, blurry, unfinished, nudity, weapon"
-            seed = 5
-            cfg = 3.0
-            steps = 25
-            image = inference( prompt, negative_prompt, cfg, steps, seed)
-            torch.save(self.network.proj, "model.pt" )
-            return image, "model.pt"
-        class CustomImageDataset(Dataset):
-            def __init__(self, images, transform=None):
-                self.images = images
-                self.transform = transform
-            def __len__(self):
-                return len(self.images)
-            def __getitem__(self, idx):
-                image = self.images[idx]
-                if self.transform:
-                    image = self.transform(image)
-                return image
-        @spaces.GPU
-        def invert(self, image, mask, pcs=10000, epochs=400, weight_decay = 1e-10, lr=1e-1):
-            del unet
-            del network
-            unet, _, _, _, _ = load_models(device)
-            proj = torch.zeros(1,pcs).bfloat16().to(device)
-            network = LoRAw2w( proj, mean, std, v[:, :pcs],
                                 unet,
                                 rank=1,
                                 multiplier=1.0,
@@ -255,87 +255,87 @@ class main():
                                 train_method="xattn-strict"
                             ).to(device, torch.bfloat16)
-            ### load mask
-            mask = transforms.Resize((64,64), interpolation=transforms.InterpolationMode.BILINEAR)(mask)
-            mask = torchvision.transforms.functional.pil_to_tensor(mask).unsqueeze(0).to(device).bfloat16()[:,0,:,:].unsqueeze(1)
-            ### check if an actual mask was draw, otherwise mask is just all ones
-            if torch.sum(mask) == 0:
-                mask = torch.ones((1,1,64,64)).to(device).bfloat16()
-            ### single image dataset
-            image_transforms = transforms.Compose([transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR),
                                                         transforms.RandomCrop(512),
                                                         transforms.ToTensor(),
                                                         transforms.Normalize([0.5], [0.5])])
-            train_dataset = CustomImageDataset(image, transform=image_transforms)
-            train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True)
-            ### optimizer
-            optim = torch.optim.Adam(network.parameters(), lr=lr, weight_decay=weight_decay)
-            ### training loop
-            unet.train()
-            for epoch in tqdm.tqdm(range(epochs)):
-                for batch in train_dataloader:
-                    ### prepare inputs
-                    batch = batch.to(device).bfloat16()
-                    latents = vae.encode(batch).latent_dist.sample()
-                    latents = latents*0.18215
-                    noise = torch.randn_like(latents)
-                    bsz = latents.shape[0]
-                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
-                    timesteps = timesteps.long()
-                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-                    text_input = tokenizer("sks person", padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
-                    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
-                    ### loss + sgd step
-                    with network:
-                        model_pred = unet(noisy_latents, timesteps, text_embeddings).sample
-                        loss = torch.nn.functional.mse_loss(mask*model_pred.float(), mask*noise.float(), reduction="mean")
-                        optim.zero_grad()
-                        loss.backward()
-                        optim.step()
-            ### return optimized network
-            return network
-        @spaces.GPU
-        def run_inversion(self, dict, pcs, epochs, weight_decay,lr):
-            init_image = dict["image"].convert("RGB").resize((512, 512))
-            mask = dict["mask"].convert("RGB").resize((512, 512))
-            network = invert([init_image], mask, pcs, epochs, weight_decay,lr)
-            #sample an image
-            prompt = "sks person"
-            negative_prompt = "low quality, blurry, unfinished, nudity"
-            seed = 5
-            cfg = 3.0
-            steps = 25
-            image = inference( prompt, negative_prompt, cfg, steps, seed)
-            torch.save(network.proj, "model.pt" )
-            return image, "model.pt"
-        @spaces.GPU
-        def file_upload(self, file):
-            proj = torch.load(file.name).to(device)
-            #pad to 10000 Principal components to keep everything consistent
-            pcs = proj.shape[1]
-            padding =  torch.zeros((1,10000-pcs)).to(device)
-            proj = torch.cat((proj, padding), 1)
-            unet, _, _, _, _ = load_models(device)
-            network = LoRAw2w( proj, mean, std, v[:, :10000],
                                 unet,
                                 rank=1,
                                 multiplier=1.0,
@@ -344,13 +344,13 @@ class main():
                             ).to(device, torch.bfloat16)
-            prompt = "sks person"
-            negative_prompt = "low quality, blurry, unfinished, nudity"
-            seed = 5
-            cfg = 3.0
-            steps = 25
-            image = inference( prompt, negative_prompt, cfg, steps, seed)
-            return image

         thick = debias(thick, "Heavy_Makeup", df, pinverse, device)
         self.thick = thick
+    def sample_model(self):
+        """Load a UNet and build a freshly sampled weights network around it.
+
+        The first value returned by ``load_models`` is stored on ``self.unet``;
+        the result of ``sample_weights`` (given the first 1000 columns of
+        ``self.v``) is stored on ``self.network``.
+        """
+        self.unet, _, _, _, _ = load_models(self.device)
+        basis = self.v[:, :1000]
+        self.network = sample_weights(
+            self.unet,
+            self.proj,
+            self.mean,
+            self.std,
+            basis,
+            self.device,
+            factor=1.00,
+        )
+    @torch.no_grad()
+    @spaces.GPU
+    def inference(self, prompt, negative_prompt, guidance_scale, ddim_steps, seed):
+        """Sample one image for ``prompt`` using classifier-free guidance.
+
+        Args:
+            prompt: Text the image is conditioned on.
+            negative_prompt: Text encoded for the unconditional guidance branch.
+            guidance_scale: Weight applied to the (text - unconditional) noise
+                difference.
+            ddim_steps: Number of timesteps handed to the scheduler.
+            seed: Seed for the generator that draws the initial latents.
+
+        Returns:
+            The image produced by ``Image.fromarray`` from the decoded latents.
+        """
+        # Everything is read from ``self``: the bare ``device`` and
+        # ``noise_scheduler`` names are not defined inside this method.
+        device = self.device
+        scheduler = self.noise_scheduler
+        generator = torch.Generator(device=device).manual_seed(seed)
+        latents = torch.randn(
+            (1, self.unet.in_channels, 512 // 8, 512 // 8),
+            generator=generator,
+            device=device,
+        ).bfloat16()
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(device))[0]
+        max_length = text_input.input_ids.shape[-1]
+        uncond_input = self.tokenizer(
+            [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
+        )
+        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+        # Unconditional first, text second: the order ``chunk(2)`` relies on below.
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        scheduler.set_timesteps(ddim_steps)
+        latents = latents * scheduler.init_noise_sigma
+        for t in tqdm.tqdm(scheduler.timesteps):
+            latent_model_input = torch.cat([latents] * 2)
+            latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+            with self.network:
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond=None).sample
+            # guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+        # Latents are divided by 0.18215 before being decoded by the VAE.
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
+        image = Image.fromarray((image * 255).round().astype("uint8"))
+        return image
+    @torch.no_grad()
+    @spaces.GPU
+    def edit_inference(self, prompt, negative_prompt, guidance_scale, ddim_steps, seed, start_noise, a1, a2, a3, a4):
+        """Sample one image while applying edited projection weights.
+
+        The edited weights are the current ``self.network.proj`` plus scaled
+        copies of ``self.young`` (``a1``), ``self.pointy`` (``a2``),
+        ``self.wavy`` (``a3``) and ``self.thick`` (``a4``). They replace the
+        original weights for every timestep ``t <= start_noise``.
+
+        Args:
+            prompt: Text the image is conditioned on.
+            negative_prompt: Text encoded for the unconditional guidance branch.
+            guidance_scale: Weight applied to the (text - unconditional) noise
+                difference.
+            ddim_steps: Number of timesteps handed to the scheduler.
+            seed: Seed for the generator that draws the initial latents.
+            start_noise: Timestep at or below which the edited weights are used.
+            a1, a2, a3, a4: Strengths of the four edit directions.
+
+        Returns:
+            The image produced by ``Image.fromarray`` from the decoded latents.
+        """
+        device = self.device
+        scheduler = self.noise_scheduler
+        # Was ``self,network`` (a tuple expression), which cannot be sliced below.
+        original_weights = self.network.proj.clone()
+        # pad to same number of PCs
+        pcs_original = original_weights.shape[1]
+        pcs_edits = self.young.shape[1]
+        padding = torch.zeros((1, pcs_original - pcs_edits)).to(device)
+        young_pad = torch.cat((self.young, padding), 1)
+        pointy_pad = torch.cat((self.pointy, padding), 1)
+        wavy_pad = torch.cat((self.wavy, padding), 1)
+        thick_pad = torch.cat((self.thick, padding), 1)
+        edited_weights = original_weights+a1*1e6*young_pad+a2*1e6*pointy_pad+a3*1e6*wavy_pad+a4*2e6*thick_pad
+        generator = torch.Generator(device=device).manual_seed(seed)
+        latents = torch.randn(
+            (1, self.unet.in_channels, 512 // 8, 512 // 8),
+            generator=generator,
+            device=device,
+        ).bfloat16()
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(device))[0]
+        max_length = text_input.input_ids.shape[-1]
+        uncond_input = self.tokenizer(
+            [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
+        )
+        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        scheduler.set_timesteps(ddim_steps)
+        latents = latents * scheduler.init_noise_sigma
+        try:
+            for t in tqdm.tqdm(scheduler.timesteps):
+                latent_model_input = torch.cat([latents] * 2)
+                latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+                # Above ``start_noise`` the original weights stay in place.
+                if t <= start_noise:
+                    self.network.proj = torch.nn.Parameter(edited_weights)
+                    self.network.reset()
+                with self.network:
+                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond=None).sample
+                # guidance
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                latents = scheduler.step(noise_pred, t, latents).prev_sample
+            latents = 1 / 0.18215 * latents
+            image = self.vae.decode(latents).sample
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
+            image = Image.fromarray((image * 255).round().astype("uint8"))
+        finally:
+            # reset weights back to original, even when sampling fails part-way,
+            # so a failed edit cannot leak into later calls
+            self.network.proj = torch.nn.Parameter(original_weights)
+            self.network.reset()
+        return image
+    @spaces.GPU
+    def sample_then_run(self):
+        """Sample new weights, render a preview image, and save the projection.
+
+        Returns:
+            ``(image, "model.pt")``: the preview returned by ``inference`` and
+            the path ``self.network.proj`` was saved to.
+        """
+        # ``sample_model`` and ``inference`` are methods of this class, so they
+        # have to be called through ``self``; the bare names are undefined here.
+        self.sample_model()
+        prompt = "sks person"
+        negative_prompt = "low quality, blurry, unfinished, nudity, weapon"
+        seed = 5
+        cfg = 3.0
+        steps = 25
+        image = self.inference(prompt, negative_prompt, cfg, steps, seed)
+        torch.save(self.network.proj, "model.pt")
+        return image, "model.pt"
+    class CustomImageDataset(Dataset):
+        """Dataset over an in-memory sequence of images with an optional transform."""
+        def __init__(self, images, transform=None):
+            # ``transform`` is applied per item in ``__getitem__`` when truthy.
+            self.transform = transform
+            self.images = images
+        def __len__(self):
+            """Return the number of stored images."""
+            return len(self.images)
+        def __getitem__(self, idx):
+            """Return image ``idx``, passed through the transform when one is set."""
+            sample = self.images[idx]
+            if not self.transform:
+                return sample
+            return self.transform(sample)
+    @spaces.GPU
+    def invert(self, image, mask, pcs=10000, epochs=400, weight_decay = 1e-10, lr=1e-1):
+        """Fit a fresh ``LoRAw2w`` projection to ``image`` and return the network.
+
+        Args:
+            image: Sequence of images handed to ``CustomImageDataset``.
+            mask: PIL image resized to 64x64; its first channel weights the
+                loss. An all-zero mask is replaced by all ones.
+            pcs: Width of the zero-initialised projection and the number of
+                columns taken from ``self.v``.
+            epochs: Number of passes over the dataset.
+            weight_decay: Weight decay given to Adam.
+            lr: Learning rate given to Adam.
+
+        Returns:
+            The optimised network. It is also stored on ``self.network`` (and
+            its UNet on ``self.unet``) so that ``inference`` samples from it.
+        """
+        device = self.device
+        # The old ``del unet`` / ``del network`` targeted locals that were not
+        # yet assigned, which raises UnboundLocalError. Drop the previous
+        # objects held on ``self`` instead, before loading replacements.
+        self.unet = None
+        self.network = None
+        self.unet, _, _, _, _ = load_models(device)
+        unet = self.unet
+        proj = torch.zeros(1, pcs).bfloat16().to(device)
+        # NOTE(review): a diff hunk boundary falls inside this call, so an
+        # argument may be hidden from this view; confirm against app.py.
+        self.network = LoRAw2w(proj, self.mean, self.std, self.v[:, :pcs],
+                                unet,
+                                rank=1,
+                                multiplier=1.0,
+                                train_method="xattn-strict"
+                            ).to(device, torch.bfloat16)
+        network = self.network
+        ### load mask
+        mask = transforms.Resize((64,64), interpolation=transforms.InterpolationMode.BILINEAR)(mask)
+        mask = torchvision.transforms.functional.pil_to_tensor(mask).unsqueeze(0).to(device).bfloat16()[:,0,:,:].unsqueeze(1)
+        ### check if an actual mask was drawn, otherwise mask is just all ones
+        if torch.sum(mask) == 0:
+            mask = torch.ones((1,1,64,64)).to(device).bfloat16()
+        ### single image dataset
+        image_transforms = transforms.Compose([transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR),
+                                               transforms.RandomCrop(512),
+                                               transforms.ToTensor(),
+                                               transforms.Normalize([0.5], [0.5])])
+        # The dataset class is nested in this class, so reach it through ``self``.
+        train_dataset = self.CustomImageDataset(image, transform=image_transforms)
+        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True)
+        ### optimizer
+        optim = torch.optim.Adam(network.parameters(), lr=lr, weight_decay=weight_decay)
+        # The prompt is constant and only ``network.parameters()`` are
+        # optimised, so encode it once, outside autograd, not every step.
+        with torch.no_grad():
+            text_input = self.tokenizer("sks person", padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
+            text_embeddings = self.text_encoder(text_input.input_ids.to(device))[0]
+        ### training loop
+        unet.train()
+        for _ in tqdm.tqdm(range(epochs)):
+            for batch in train_dataloader:
+                ### prepare inputs
+                batch = batch.to(device).bfloat16()
+                latents = self.vae.encode(batch).latent_dist.sample()
+                latents = latents*0.18215
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                timesteps = torch.randint(0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+                noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
+                ### loss + sgd step
+                with network:
+                    model_pred = unet(noisy_latents, timesteps, text_embeddings).sample
+                    loss = torch.nn.functional.mse_loss(mask*model_pred.float(), mask*noise.float(), reduction="mean")
+                    optim.zero_grad()
+                    loss.backward()
+                    optim.step()
+        ### return optimized network
+        return network
+    @spaces.GPU
+    def run_inversion(self, dict, pcs, epochs, weight_decay,lr):
+        """Invert the drawn image/mask pair, render a preview, and save the projection.
+
+        Args:
+            dict: Mapping with ``"image"`` and ``"mask"`` entries; both are
+                converted to RGB and resized to 512x512. (The name shadows
+                the builtin but is kept for callers.)
+            pcs, epochs, weight_decay, lr: Forwarded unchanged to ``invert``.
+
+        Returns:
+            ``(image, "model.pt")``: the preview returned by ``inference`` and
+            the path the fitted ``network.proj`` was saved to.
+        """
+        init_image = dict["image"].convert("RGB").resize((512, 512))
+        mask = dict["mask"].convert("RGB").resize((512, 512))
+        # ``invert`` and ``inference`` are methods of this class; the bare
+        # names are undefined here, so call them through ``self``.
+        network = self.invert([init_image], mask, pcs, epochs, weight_decay, lr)
+        #sample an image
+        prompt = "sks person"
+        negative_prompt = "low quality, blurry, unfinished, nudity"
+        seed = 5
+        cfg = 3.0
+        steps = 25
+        image = self.inference(prompt, negative_prompt, cfg, steps, seed)
+        torch.save(network.proj, "model.pt")
+        return image, "model.pt"
+    @spaces.GPU
+    def file_upload(self, file):
+        """Load projection weights from an uploaded file and render a preview.
+
+        The loaded projection is zero-padded to 10000 columns, wrapped in a
+        new ``LoRAw2w`` network around a freshly loaded UNet, and stored on
+        ``self`` so that ``inference`` samples from the uploaded weights.
+
+        Args:
+            file: Upload object whose ``name`` attribute is the path to load.
+
+        Returns:
+            The preview image returned by ``inference``.
+        """
+        device = self.device
+        # NOTE(security): ``torch.load`` unpickles the uploaded file, and
+        # unpickling untrusted data can execute arbitrary code. Consider
+        # ``weights_only=True`` once confirmed to load the saved projection.
+        proj = torch.load(file.name).to(device)
+        #pad to 10000 Principal components to keep everything consistent
+        pcs = proj.shape[1]
+        padding = torch.zeros((1, 10000 - pcs)).to(device)
+        proj = torch.cat((proj, padding), 1)
+        # Keep the new UNet and network on ``self``; as locals they were
+        # discarded and the preview was rendered from the previous network.
+        self.unet, _, _, _, _ = load_models(device)
+        # NOTE(review): a diff hunk boundary falls inside this call, so
+        # arguments may be hidden from this view; confirm against app.py.
+        self.network = LoRAw2w(proj, self.mean, self.std, self.v[:, :10000],
+                                self.unet,
+                                rank=1,
+                                multiplier=1.0,
+                            ).to(device, torch.bfloat16)
+        prompt = "sks person"
+        negative_prompt = "low quality, blurry, unfinished, nudity"
+        seed = 5
+        cfg = 3.0
+        steps = 25
+        image = self.inference(prompt, negative_prompt, cfg, steps, seed)
+        return image