Spaces:

snap-research
/

weights2weights

Running on Zero

App Files Files Community

amildravid4292 committed on Jul 20, 2024

Commit

ad7da82

verified ·

1 Parent(s): df9e08f

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -102

app.py CHANGED Viewed

@@ -42,53 +42,85 @@ pinverse = torch.load(f"{models_path}/files/pinverse_1000pc.pt", map_location=to
 unet.value, vae.value, text_encoder.value, tokenizer.value, noise_scheduler.value = load_models(device.value)
 def sample_model():
-    unet.value, _, _, _, _ = load_models(device)
-    network.value = sample_weights(unet, proj, mean, std, v[:, :1000], device, factor = 1.00)
 @torch.no_grad()
 @spaces.GPU
 def inference( prompt, negative_prompt, guidance_scale, ddim_steps, seed):
-    global device
-    #global generator
-    global unet
-    global vae
-    global text_encoder
-    global tokenizer
-    global noise_scheduler
-    generator = torch.Generator(device=device).manual_seed(seed)
     latents = torch.randn(
         (1, unet.in_channels, 512 // 8, 512 // 8),
         generator = generator,
-        device = device
     ).bfloat16()
-    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
-    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
     max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
                             [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                         )
-    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-    noise_scheduler.set_timesteps(ddim_steps)
-    latents = latents * noise_scheduler.init_noise_sigma
-    for i,t in enumerate(tqdm.tqdm(noise_scheduler.timesteps)):
         latent_model_input = torch.cat([latents] * 2)
-        latent_model_input = noise_scheduler.scale_model_input(latent_model_input, timestep=t)
-        with network:
-            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
         latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
     latents = 1 / 0.18215 * latents
-    image = vae.decode(latents).sample
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
@@ -100,78 +132,66 @@ def inference( prompt, negative_prompt, guidance_scale, ddim_steps, seed):
 @torch.no_grad()
 @spaces.GPU
 def edit_inference(prompt, negative_prompt, guidance_scale, ddim_steps, seed, start_noise, a1, a2, a3, a4):
-    start_items()
-    global device
-    #global generator
-    global unet
-    global vae
-    global text_encoder
-    global tokenizer
-    global noise_scheduler
-    global young
-    global pointy
-    global wavy
-    global thick
-    original_weights = network.proj.clone()
     #pad to same number of PCs
     pcs_original = original_weights.shape[1]
-    pcs_edits = young.shape[1]
     padding =  torch.zeros((1,pcs_original-pcs_edits)).to(device)
-    young_pad = torch.cat((young, padding), 1)
-    pointy_pad = torch.cat((pointy, padding), 1)
-    wavy_pad = torch.cat((wavy, padding), 1)
-    thick_pad = torch.cat((thick, padding), 1)
     edited_weights = original_weights+a1*1e6*young_pad+a2*1e6*pointy_pad+a3*1e6*wavy_pad+a4*2e6*thick_pad
-    generator = torch.Generator(device=device).manual_seed(seed)
     latents = torch.randn(
         (1, unet.in_channels, 512 // 8, 512 // 8),
         generator = generator,
-        device = device
     ).bfloat16()
-    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
-    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
     max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
                             [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                         )
-    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-    noise_scheduler.set_timesteps(ddim_steps)
-    latents = latents * noise_scheduler.init_noise_sigma
-    for i,t in enumerate(tqdm.tqdm(noise_scheduler.timesteps)):
         latent_model_input = torch.cat([latents] * 2)
-        latent_model_input = noise_scheduler.scale_model_input(latent_model_input, timestep=t)
         if t>start_noise:
             pass
         elif t<=start_noise:
-            network.proj = torch.nn.Parameter(edited_weights)
-            network.reset()
         with network:
-            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-        latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
     latents = 1 / 0.18215 * latents
-    image = vae.decode(latents).sample
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
@@ -179,8 +199,8 @@ def edit_inference(prompt, negative_prompt, guidance_scale, ddim_steps, seed, st
     image = Image.fromarray((image * 255).round().astype("uint8"))
     #reset weights back to original
-    network.proj = torch.nn.Parameter(original_weights)
-    network.reset()
     return image
@@ -193,52 +213,9 @@ def sample_then_run():
     cfg = 3.0
     steps = 25
     image = inference( prompt, negative_prompt, cfg, steps, seed)
-    torch.save(network.proj, "model.pt" )
     return image, "model.pt"
-#@spaces.GPU
-def start_items():
-    print("Starting items")
-    global young
-    global pointy
-    global wavy
-    global thick
-    young = get_direction(df, "Young", pinverse, 1000, device)
-    young = debias(young, "Male", df, pinverse, device)
-    young = debias(young, "Pointy_Nose", df, pinverse, device)
-    young = debias(young, "Wavy_Hair", df, pinverse, device)
-    young = debias(young, "Chubby", df, pinverse, device)
-    young = debias(young, "No_Beard", df, pinverse, device)
-    young = debias(young, "Mustache", df, pinverse, device)
-    pointy = get_direction(df, "Pointy_Nose", pinverse, 1000, device)
-    pointy = debias(pointy, "Young", df, pinverse, device)
-    pointy = debias(pointy, "Male", df, pinverse, device)
-    pointy = debias(pointy, "Wavy_Hair", df, pinverse, device)
-    pointy = debias(pointy, "Chubby", df, pinverse, device)
-    pointy = debias(pointy, "Heavy_Makeup", df, pinverse, device)
-    wavy = get_direction(df, "Wavy_Hair", pinverse, 1000, device)
-    wavy = debias(wavy, "Young", df, pinverse, device)
-    wavy = debias(wavy, "Male", df, pinverse, device)
-    wavy = debias(wavy, "Pointy_Nose", df, pinverse, device)
-    wavy = debias(wavy, "Chubby", df, pinverse, device)
-    wavy = debias(wavy, "Heavy_Makeup", df, pinverse, device)
-    thick = get_direction(df, "Bushy_Eyebrows", pinverse, 1000, device)
-    thick = debias(thick, "Male", df, pinverse, device)
-    thick = debias(thick, "Young", df, pinverse, device)
-    thick = debias(thick, "Pointy_Nose", df, pinverse, device)
-    thick = debias(thick, "Wavy_Hair", df, pinverse, device)
-    thick = debias(thick, "Mustache", df, pinverse, device)
-    thick = debias(thick, "No_Beard", df, pinverse, device)
-    thick = debias(thick, "Sideburns", df, pinverse, device)
-    thick = debias(thick, "Big_Nose", df, pinverse, device)
-    thick = debias(thick, "Big_Lips", df, pinverse, device)
-    thick = debias(thick, "Black_Hair", df, pinverse, device)
-    thick = debias(thick, "Brown_Hair", df, pinverse, device)
-    thick = debias(thick, "Pale_Skin", df, pinverse, device)
-    thick = debias(thick, "Heavy_Makeup", df, pinverse, device)
 class CustomImageDataset(Dataset):
     def __init__(self, images, transform=None):

 unet.value, vae.value, text_encoder.value, tokenizer.value, noise_scheduler.value = load_models(device.value)
 # Edit directions used by edit_inference. Each one starts from get_direction(...)
 # and is then passed through debias(...) once per listed attribute, in the order
 # given. The order is kept exactly as written because every debias call consumes
 # the result of the previous one.
 def _debias_in_place(direction, attributes):
     """Replace ``direction.value`` with the result of ``debias`` for each attribute, in order."""
     for attribute in attributes:
         direction.value = debias(direction.value, attribute, df, pinverse, device.value)
 # NOTE: `gr.State(name) = ...` cannot be parsed (assignment to a call), so each
 # holder is created by wrapping the initial direction in a gr.State and binding
 # it to the module-level name that the rest of the file reads via `.value`.
 young = gr.State(get_direction(df, "Young", pinverse, 1000, device.value))
 _debias_in_place(young, ("Male", "Pointy_Nose", "Wavy_Hair", "Chubby", "No_Beard", "Mustache"))
 pointy = gr.State(get_direction(df, "Pointy_Nose", pinverse, 1000, device.value))
 _debias_in_place(pointy, ("Young", "Male", "Wavy_Hair", "Chubby", "Heavy_Makeup"))
 wavy = gr.State(get_direction(df, "Wavy_Hair", pinverse, 1000, device.value))
 _debias_in_place(wavy, ("Young", "Male", "Pointy_Nose", "Chubby", "Heavy_Makeup"))
 thick = gr.State(get_direction(df, "Bushy_Eyebrows", pinverse, 1000, device.value))
 _debias_in_place(
     thick,
     (
         "Male",
         "Young",
         "Pointy_Nose",
         "Wavy_Hair",
         "Mustache",
         "No_Beard",
         "Sideburns",
         "Big_Nose",
         "Big_Lips",
         "Black_Hair",
         "Brown_Hair",
         "Pale_Skin",
         "Heavy_Makeup",
     ),
 )
 def sample_model():
     """Reload the UNet and store a freshly sampled network on the shared holders.

     Only the first item returned by ``load_models`` is kept; it is written to
     ``unet.value``. ``network.value`` then receives whatever ``sample_weights``
     returns for that UNet, using the first 1000 columns of ``v``.
     """
     target_device = device.value
     reloaded_unet, _, _, _, _ = load_models(target_device)
     unet.value = reloaded_unet
     principal_components = v[:, :1000]
     network.value = sample_weights(
         unet.value,
         proj.value,
         mean.value,
         std.value,
         principal_components,
         target_device,
         factor = 1.00,
     )
 @torch.no_grad()
 @spaces.GPU
 def inference( prompt, negative_prompt, guidance_scale, ddim_steps, seed):
     """Denoise random latents under the current network and decode them with the VAE.

     Args:
         prompt: Text tokenized for the conditional branch.
         negative_prompt: Text tokenized for the unconditional branch.
         guidance_scale: Weight applied to (text - unconditional) noise difference.
         ddim_steps: Step count handed to the scheduler's ``set_timesteps``.
         seed: Seed for the ``torch.Generator`` that draws the initial latents.
     """
     # Read the shared holders once; the objects live on `.value`, not on the
     # holder itself, so every use below goes through these locals.
     dev = device.value
     scheduler = noise_scheduler.value
     generator = torch.Generator(device=dev).manual_seed(seed)
     latents = torch.randn(
         (1, unet.value.config.in_channels, 512 // 8, 512 // 8),
         generator = generator,
         device = dev
     ).bfloat16()
     text_input = tokenizer.value(prompt, padding="max_length", max_length=tokenizer.value.model_max_length, truncation=True, return_tensors="pt")
     text_embeddings = text_encoder.value(text_input.input_ids.to(dev))[0]
     max_length = text_input.input_ids.shape[-1]
     uncond_input = tokenizer.value(
                             [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                         )
     uncond_embeddings = text_encoder.value(uncond_input.input_ids.to(dev))[0]
     # Unconditional embeddings go first so that chunk(2) below yields (uncond, text).
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
     scheduler.set_timesteps(ddim_steps)
     latents = latents * scheduler.init_noise_sigma
     for t in tqdm.tqdm(scheduler.timesteps):
         # Duplicate the latents so one UNet call covers both guidance branches.
         latent_model_input = torch.cat([latents] * 2)
         latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
         with network.value:
             noise_pred = unet.value(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
         latents = scheduler.step(noise_pred, t, latents).prev_sample
     # presumably the Stable Diffusion VAE latent scaling constant - TODO confirm
     latents = 1 / 0.18215 * latents
     image = vae.value.decode(latents).sample
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
     # NOTE(review): the remaining lines of this function fall between diff hunks and are not shown here.
 @torch.no_grad()
 @spaces.GPU
 def edit_inference(prompt, negative_prompt, guidance_scale, ddim_steps, seed, start_noise, a1, a2, a3, a4):
     """Denoise with the current network, switching to edited weights at low timesteps.

     The edited weights are a clone of ``network.value.proj`` plus the four
     zero-padded directions (young, pointy, wavy, thick) scaled by ``a1``..``a4``.
     They are installed on every step whose timestep is ``<= start_noise``.
     The original weights are always put back before returning, even if
     sampling raises, because the network object is shared.

     Args:
         prompt: Text tokenized for the conditional branch.
         negative_prompt: Text tokenized for the unconditional branch.
         guidance_scale: Weight applied to (text - unconditional) noise difference.
         ddim_steps: Step count handed to the scheduler's ``set_timesteps``.
         seed: Seed for the ``torch.Generator`` that draws the initial latents.
         start_noise: Timestep threshold at or below which the edit is applied.
         a1: Strength of the ``young`` direction (multiplied by 1e6).
         a2: Strength of the ``pointy`` direction (multiplied by 1e6).
         a3: Strength of the ``wavy`` direction (multiplied by 1e6).
         a4: Strength of the ``thick`` direction (multiplied by 2e6).

     Returns:
         The image object produced by ``Image.fromarray`` from the decoded latents.
     """
     # The shared holders keep their payload on `.value`; resolve them once.
     net = network.value
     dev = device.value
     scheduler = noise_scheduler.value
     original_weights = net.proj.clone()
     #pad to same number of PCs
     pcs_original = original_weights.shape[1]
     pcs_edits = young.value.shape[1]
     padding = torch.zeros((1, pcs_original - pcs_edits)).to(dev)
     young_pad = torch.cat((young.value, padding), 1)
     pointy_pad = torch.cat((pointy.value, padding), 1)
     wavy_pad = torch.cat((wavy.value, padding), 1)
     thick_pad = torch.cat((thick.value, padding), 1)
     edited_weights = original_weights+a1*1e6*young_pad+a2*1e6*pointy_pad+a3*1e6*wavy_pad+a4*2e6*thick_pad
     generator = torch.Generator(device=dev).manual_seed(seed)
     latents = torch.randn(
         (1, unet.value.config.in_channels, 512 // 8, 512 // 8),
         generator = generator,
         device = dev
     ).bfloat16()
     text_input = tokenizer.value(prompt, padding="max_length", max_length=tokenizer.value.model_max_length, truncation=True, return_tensors="pt")
     text_embeddings = text_encoder.value(text_input.input_ids.to(dev))[0]
     max_length = text_input.input_ids.shape[-1]
     uncond_input = tokenizer.value(
                             [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                         )
     uncond_embeddings = text_encoder.value(uncond_input.input_ids.to(dev))[0]
     # Unconditional embeddings go first so that chunk(2) below yields (uncond, text).
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
     scheduler.set_timesteps(ddim_steps)
     latents = latents * scheduler.init_noise_sigma
     try:
         for t in tqdm.tqdm(scheduler.timesteps):
             latent_model_input = torch.cat([latents] * 2)
             latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
             # Steps above the threshold keep the original weights untouched.
             if t <= start_noise:
                 net.proj = torch.nn.Parameter(edited_weights)
                 net.reset()
             with net:
                 noise_pred = unet.value(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
             #guidance
             noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
             latents = scheduler.step(noise_pred, t, latents).prev_sample
         # presumably the Stable Diffusion VAE latent scaling constant - TODO confirm
         latents = 1 / 0.18215 * latents
         image = vae.value.decode(latents).sample
     finally:
         #reset weights back to original
         net.proj = torch.nn.Parameter(original_weights)
         net.reset()
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
     image = Image.fromarray((image * 255).round().astype("uint8"))
     return image
     cfg = 3.0
     steps = 25
     image = inference( prompt, negative_prompt, cfg, steps, seed)
+    torch.save(network.value.proj, "model.pt" )
     return image, "model.pt"
 class CustomImageDataset(Dataset):
     def __init__(self, images, transform=None):