MohamedRashad committed on
Commit
8f812c4
·
1 Parent(s): 2f3fed1

Update CUDA device usage in app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -74,7 +74,7 @@ def encode_cropped_prompt_77tokens(txt: str):
74
  padding="max_length",
75
  max_length=tokenizer.model_max_length,
76
  truncation=True,
77
- return_tensors="pt").input_ids.to(device=text_encoder.device)
78
  text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
79
  return text_cond
80
 
@@ -117,15 +117,15 @@ def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed,
117
  rng = torch.Generator(device="cuda").manual_seed(int(seed))
118
 
119
  fg = resize_and_center_crop(input_fg, image_width, image_height)
120
- concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
121
  concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
122
 
123
  conds = encode_cropped_prompt_77tokens(prompt)
124
  unconds = encode_cropped_prompt_77tokens(n_prompt)
125
 
126
- fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
127
  initial_latents = torch.zeros_like(concat_conds)
128
- concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
129
  latents = k_sampler(
130
  initial_latent=initial_latents,
131
  strength=1.0,
@@ -169,13 +169,13 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
169
  positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
170
  negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")
171
 
172
- input_frames = input_frames.to(device=video_pipe.image_encoder.device, dtype=video_pipe.image_encoder.dtype)
173
  positive_image_cond = video_pipe.encode_clip_vision(input_frames)
174
  positive_image_cond = video_pipe.image_projection(positive_image_cond)
175
  negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
176
  negative_image_cond = video_pipe.image_projection(negative_image_cond)
177
 
178
- input_frames = input_frames.to(device=video_pipe.vae.device, dtype=video_pipe.vae.dtype)
179
  input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
180
  first_frame = input_frame_latents[:, :, 0]
181
  last_frame = input_frame_latents[:, :, 1]
 
74
  padding="max_length",
75
  max_length=tokenizer.model_max_length,
76
  truncation=True,
77
+ return_tensors="pt").input_ids.to(device="cuda")
78
  text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
79
  return text_cond
80
 
 
117
  rng = torch.Generator(device="cuda").manual_seed(int(seed))
118
 
119
  fg = resize_and_center_crop(input_fg, image_width, image_height)
120
+ concat_conds = numpy2pytorch([fg]).to(device="cuda", dtype=vae.dtype)
121
  concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
122
 
123
  conds = encode_cropped_prompt_77tokens(prompt)
124
  unconds = encode_cropped_prompt_77tokens(n_prompt)
125
 
126
+ fs = torch.tensor(input_undo_steps).to(device="cuda", dtype=torch.long)
127
  initial_latents = torch.zeros_like(concat_conds)
128
+ concat_conds = concat_conds.to(device="cuda", dtype=unet.dtype)
129
  latents = k_sampler(
130
  initial_latent=initial_latents,
131
  strength=1.0,
 
169
  positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
170
  negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")
171
 
172
+ input_frames = input_frames.to(device="cuda", dtype=video_pipe.image_encoder.dtype)
173
  positive_image_cond = video_pipe.encode_clip_vision(input_frames)
174
  positive_image_cond = video_pipe.image_projection(positive_image_cond)
175
  negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
176
  negative_image_cond = video_pipe.image_projection(negative_image_cond)
177
 
178
+ input_frames = input_frames.to(device="cuda", dtype=video_pipe.vae.dtype)
179
  input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
180
  first_frame = input_frame_latents[:, :, 0]
181
  last_frame = input_frame_latents[:, :, 1]