Spaces: Running on Zero
Commit · 2f3fed1
Parent(s): 453686a
chore: Update CUDA device usage in app.py

Changed files:
- app.py (+7, -23)
- memory_management.py (+0, -67, deleted)

app.py CHANGED
@@ -12,7 +12,6 @@ import gradio as gr
 import numpy as np
 import torch
 import wd14tagger
-import memory_management
 import uuid

 from PIL import Image
@@ -37,9 +36,9 @@ class ModifiedUNet(UNet2DConditionModel):

 model_name = 'lllyasviel/paints_undo_single_frame'
 tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
-text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16)
-vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16)  # bfloat16 vae
-unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16)
+text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16).to("cuda")
+vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16).to("cuda")  # bfloat16 vae
+unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16).to("cuda")

 unet.set_attn_processor(AttnProcessor2_0())
 vae.set_attn_processor(AttnProcessor2_0())
@@ -47,12 +46,7 @@ vae.set_attn_processor(AttnProcessor2_0())
 video_pipe = LatentVideoDiffusionPipeline.from_pretrained(
     'lllyasviel/paints_undo_multi_frame',
     fp16=True
-)
-
-memory_management.unload_all_models([
-    video_pipe.unet, video_pipe.vae, video_pipe.text_encoder, video_pipe.image_projection, video_pipe.image_encoder,
-    unet, vae, text_encoder
-])
+).to("cuda")

 k_sampler = KDiffusionSampler(
     unet=unet,
@@ -76,7 +70,6 @@ def find_best_bucket(h, w, options):

 @torch.inference_mode()
 def encode_cropped_prompt_77tokens(txt: str):
-    memory_management.load_models_to_gpu(text_encoder)
     cond_ids = tokenizer(txt,
                          padding="max_length",
                          max_length=tokenizer.model_max_length,
@@ -111,28 +104,25 @@ def resize_without_crop(image, target_width, target_height):


 @torch.inference_mode()
-@spaces.GPU(
+@spaces.GPU()
 def interrogator_process(x):
     image_description = wd14tagger.default_interrogator(x)
     return image_description, image_description


 @torch.inference_mode()
-@spaces.GPU(
+@spaces.GPU()
 def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg,
             progress=gr.Progress()):
-    rng = torch.Generator(device=
+    rng = torch.Generator(device="cuda").manual_seed(int(seed))

-    memory_management.load_models_to_gpu(vae)
     fg = resize_and_center_crop(input_fg, image_width, image_height)
     concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
     concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

-    memory_management.load_models_to_gpu(text_encoder)
     conds = encode_cropped_prompt_77tokens(prompt)
     unconds = encode_cropped_prompt_77tokens(n_prompt)

-    memory_management.load_models_to_gpu(unet)
     fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
     initial_latents = torch.zeros_like(concat_conds)
     concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
@@ -150,7 +140,6 @@ def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed,
         progress_tqdm=functools.partial(progress.tqdm, desc='Generating Key Frames')
     ).to(vae.dtype) / vae.config.scaling_factor

-    memory_management.load_models_to_gpu(vae)
     pixels = vae.decode(latents).sample
     pixels = pytorch2numpy(pixels)
     pixels = [fg] + pixels + [np.zeros_like(fg) + 255]
@@ -177,25 +166,21 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
     input_frames = numpy2pytorch([image_1, image_2])
     input_frames = input_frames.unsqueeze(0).movedim(1, 2)

-    memory_management.load_models_to_gpu(video_pipe.text_encoder)
     positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
     negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")

-    memory_management.load_models_to_gpu([video_pipe.image_projection, video_pipe.image_encoder])
     input_frames = input_frames.to(device=video_pipe.image_encoder.device, dtype=video_pipe.image_encoder.dtype)
     positive_image_cond = video_pipe.encode_clip_vision(input_frames)
     positive_image_cond = video_pipe.image_projection(positive_image_cond)
     negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
     negative_image_cond = video_pipe.image_projection(negative_image_cond)

-    memory_management.load_models_to_gpu([video_pipe.vae])
     input_frames = input_frames.to(device=video_pipe.vae.device, dtype=video_pipe.vae.dtype)
     input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
     first_frame = input_frame_latents[:, :, 0]
     last_frame = input_frame_latents[:, :, 1]
     concat_cond = torch.stack([first_frame] + [torch.zeros_like(first_frame)] * (frames - 2) + [last_frame], dim=2)

-    memory_management.load_models_to_gpu([video_pipe.unet])
     latents = video_pipe(
         batch_size=1,
         steps=int(steps),
@@ -209,7 +194,6 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
         progress_tqdm=progress_tqdm
     )

-    memory_management.load_models_to_gpu([video_pipe.vae])
     video = video_pipe.decode_latents(latents, vae_hidden_states)
     return video, image_1, image_2

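Note on the pattern above: after this commit, app.py keeps every model resident on "cuda" at import time and wraps each GPU-bound entry point in @spaces.GPU(), which on Hugging Face ZeroGPU Spaces requests a GPU for the duration of that call. Below is a minimal sketch of the same pattern, assuming the standard spaces package; the encode_image helper is illustrative and not part of this commit.

import spaces
import torch
from diffusers import AutoencoderKL

# Load once at import time and keep the weights on "cuda"; on ZeroGPU a
# physical GPU is attached only while a @spaces.GPU-decorated function runs.
vae = AutoencoderKL.from_pretrained(
    'lllyasviel/paints_undo_single_frame', subfolder="vae"
).to(torch.bfloat16).to("cuda")


@torch.inference_mode()
@spaces.GPU()  # request a GPU for the duration of this call
def encode_image(image_tensor: torch.Tensor) -> torch.Tensor:
    # Move the input to the model's device inside the decorated function.
    image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
    return vae.encode(image_tensor).latent_dist.mode() * vae.config.scaling_factor
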
memory_management.py DELETED
@@ -1,67 +0,0 @@
-import torch
-from contextlib import contextmanager
-
-
-high_vram = False
-gpu = torch.device('cuda')
-cpu = torch.device('cpu')
-
-torch.zeros((1, 1)).to(gpu, torch.float32)
-torch.cuda.empty_cache()
-
-models_in_gpu = []
-
-
-@contextmanager
-def movable_bnb_model(m):
-    if hasattr(m, 'quantization_method'):
-        m.quantization_method_backup = m.quantization_method
-        del m.quantization_method
-    try:
-        yield None
-    finally:
-        if hasattr(m, 'quantization_method_backup'):
-            m.quantization_method = m.quantization_method_backup
-            del m.quantization_method_backup
-    return
-
-
-def load_models_to_gpu(models):
-    global models_in_gpu
-
-    if not isinstance(models, (tuple, list)):
-        models = [models]
-
-    models_to_remain = [m for m in set(models) if m in models_in_gpu]
-    models_to_load = [m for m in set(models) if m not in models_in_gpu]
-    models_to_unload = [m for m in set(models_in_gpu) if m not in models_to_remain]
-
-    if not high_vram:
-        for m in models_to_unload:
-            with movable_bnb_model(m):
-                m.to(cpu)
-            print('Unload to CPU:', m.__class__.__name__)
-        models_in_gpu = models_to_remain
-
-    for m in models_to_load:
-        with movable_bnb_model(m):
-            m.to(gpu)
-        print('Load to GPU:', m.__class__.__name__)
-
-    models_in_gpu = list(set(models_in_gpu + models))
-    torch.cuda.empty_cache()
-    return
-
-
-def unload_all_models(extra_models=None):
-    global models_in_gpu
-
-    if extra_models is None:
-        extra_models = []
-
-    if not isinstance(extra_models, (tuple, list)):
-        extra_models = [extra_models]
-
-    models_in_gpu = list(set(models_in_gpu + extra_models))
-
-    return load_models_to_gpu([])
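For reference, the deleted module implemented on-demand CPU/GPU swapping, and the calls removed from app.py above followed roughly the usage pattern sketched below. This is an illustrative sketch only, assuming the old memory_management.py is still importable and a CUDA device is available; the Linear modules stand in for the real models.

import torch
import memory_management  # the module deleted by this commit

unet = torch.nn.Linear(8, 8)          # stand-in for the real UNet
text_encoder = torch.nn.Linear(8, 8)  # stand-in for the real text encoder

# Start with everything parked on the CPU.
memory_management.unload_all_models([unet, text_encoder])

# Pull only the text encoder onto the GPU for prompt encoding; with
# high_vram=False, anything else currently on the GPU is moved back to CPU.
memory_management.load_models_to_gpu(text_encoder)

# Swap the UNet in for sampling (the text encoder is swapped back out).
memory_management.load_models_to_gpu(unet)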