fbnnb committed (verified)
Commit 349445d · Parent(s): 418cea4

Update gradio_app.py

Files changed (1)
  1. gradio_app.py +120 -133
gradio_app.py CHANGED
@@ -126,141 +126,123 @@ def untranspose(tensor):
     return tensor.transpose(ndim-1, ndim-2)
 
 @spaces.GPU(duration=200)
-def get_image(image, sketch, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, control_scale=0.6):
-    print("enter fn")
-    # control_frames = extract_frames(frame_guides)
-    print("extract frames")
+def image_guided_synthesis(model, prompts, image1, image2, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1., \
+                           unconditional_guidance_scale=1.0, cfg_img=None, fs=None, seed=123, text_input=False, multiple_cond_cfg=False, \
+                           loop=False, interp=False, timestep_spacing='uniform', guidance_rescale=0.0, **kwargs):
+
     seed_everything(seed)
-    transform = transforms.Compose([
-        transforms.Resize(min(resolution)),
-        transforms.CenterCrop(resolution),
-        ])
-    print("before empty cache")
-    torch.cuda.empty_cache()
-    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))
-    start = time.time()
-    gpu_id=0
-    if steps > 60:
-        steps = 60
-
-    global model
-    # model = model_list[gpu_id]
-    model = model.cuda()
+    # image1 = Image.open(file_list[2*idx]).convert('RGB')
+    image_tensor1 = transform(image1).unsqueeze(1) # [c,1,h,w]
+    # image2 = Image.open(file_list[2*idx+1]).convert('RGB')
+    image_tensor2 = transform(image2).unsqueeze(1) # [c,1,h,w]
+    frame_tensor1 = repeat(image_tensor1, 'c t h w -> c (repeat t) h w', repeat=15)
+    frame_tensor2 = repeat(image_tensor2, 'c t h w -> c (repeat t) h w', repeat=1)
+    videos = torch.cat([frame_tensor1, frame_tensor2], dim=1)
+    # frame_tensor = torch.cat([frame_tensor1, frame_tensor1], dim=1)
+    # _, filename = os.path.split(file_list[idx*2])
 
-    batch_size=1
-    channels = model.model.diffusion_model.out_channels
-    frames = model.temporal_length
-    h, w = resolution[0] // 8, resolution[1] // 8
-    noise_shape = [batch_size, channels, frames, h, w]
-
-    # text cond
-    transposed = False
-    with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float16):
-        text_emb = model.get_learned_conditioning([prompt])
-        print("before control")
-        # control cond
-        # if frame_guides is not None:
-        #     cn_videos = []
-        #     for frame in control_frames:
-        #         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        #         frame = cv2.bitwise_not(frame)
-        #         cn_tensor = torch.from_numpy(frame).unsqueeze(2).permute(2, 0, 1).float().to(model.device)
-
-        #         #cn_tensor = (cn_tensor / 255. - 0.5) * 2
-        #         cn_tensor = ( cn_tensor/255.0 )
-        #         cn_tensor = transpose_if_needed(cn_tensor)
-        #         cn_tensor_resized = transform(cn_tensor) #3,h,w
-
-        #         cn_video = cn_tensor_resized.unsqueeze(0).unsqueeze(2) # bc1hw
-        #         cn_videos.append(cn_video)
-
-        #     cn_videos = torch.cat(cn_videos, dim=2)
-        #     if cn_videos.shape[2] > frames:
-        #         idxs = []
-        #         for i in range(frames):
-        #             index = int((i + 0.5) * cn_videos.shape[2] / frames)
-        #             idxs.append(min(index, cn_videos.shape[2] - 1))
-        #         cn_videos = cn_videos[:, :, idxs, :, :]
-        #         print("cn_videos.shape after slicing", cn_videos.shape)
-        #     model_list = []
-        #     for model in model_list:
-        #         model.control_scale = control_scale
-        #         model_list.append(model)
-
-        # else:
-        cn_videos = None
-
-        print("image cond")
-
-        # img cond
-        img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
-        input_h, input_w = img_tensor.shape[1:]
-        img_tensor = (img_tensor / 255. - 0.5) * 2
-        img_tensor = transpose_if_needed(img_tensor)
-
-        image_tensor_resized = transform(img_tensor) #3,h,w
-        videos = image_tensor_resized.unsqueeze(0).unsqueeze(2) # bc1hw
-        print("get latent z")
-        # z = get_latent_z(model, videos) #bc,1,hw
-        videos = repeat(videos, 'b c t h w -> b c (repeat t) h w', repeat=frames//2)
-
-        if sketch is not None:
-            img_tensor2 = torch.from_numpy(sketch).permute(2, 0, 1).float().to(model.device)
-            img_tensor2 = (img_tensor2 / 255. - 0.5) * 2
-            img_tensor2 = transpose_if_needed(img_tensor2)
-            image_tensor_resized2 = transform(img_tensor2) #3,h,w
-            videos2 = image_tensor_resized2.unsqueeze(0).unsqueeze(2) # bchw
-            videos2 = repeat(videos2, 'b c t h w -> b c (repeat t) h w', repeat=frames//2)
-
-            videos = torch.cat([videos, videos2], dim=2)
+    ddim_sampler = DDIMSampler(model) if not multiple_cond_cfg else DDIMSampler_multicond(model)
+    batch_size = noise_shape[0]
+    fs = torch.tensor([fs] * batch_size, dtype=torch.long, device=model.device)
+
+    if not text_input:
+        prompts = [""]*batch_size
+
+    img = videos[:,:,0] #bchw
+    img_emb = model.embedder(img) ## blc
+    img_emb = model.image_proj_model(img_emb)
+
+    cond_emb = model.get_learned_conditioning(prompts)
+    cond = {"c_crossattn": [torch.cat([cond_emb,img_emb], dim=1)]}
+    if model.model.conditioning_key == 'hybrid':
+        z, hs = get_latent_z_with_hidden_states(model, videos) # b c t h w
+        if loop or interp:
+            img_cat_cond = torch.zeros_like(z)
+            img_cat_cond[:,:,0,:,:] = z[:,:,0,:,:]
+            img_cat_cond[:,:,-1,:,:] = z[:,:,-1,:,:]
         else:
-            videos = torch.cat([videos, videos], dim=2)
-
-        z, hs = get_latent_z_with_hidden_states(model, videos)
-
-        img_tensor_repeat = torch.zeros_like(z)
-
-        img_tensor_repeat[:,:,:1,:,:] = z[:,:,:1,:,:]
-        img_tensor_repeat[:,:,-1:,:,:] = z[:,:,-1:,:,:]
-
-        print("image embedder")
-        cond_images = model.embedder(img_tensor.unsqueeze(0)) ## blc
-        img_emb = model.image_proj_model(cond_images)
-
-        imtext_cond = torch.cat([text_emb, img_emb], dim=1)
-
-        fs = torch.tensor([fs], dtype=torch.long, device=model.device)
-        # print("cn videos:",cn_videos.shape, "img emb:", img_emb.shape)
-        cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat], "control_cond": cn_videos}
-
-        print("before sample loop")
-        ## inference
-        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale, hs=hs)
-
-        ## remove the last frame
-        # if image2 is None:
-        batch_samples = batch_samples[:,:,:,:-1,...]
-        ## b,samples,c,t,h,w
-        prompt_str = prompt.replace("/", "_slash_") if "/" in prompt else prompt
-        prompt_str = prompt_str.replace(" ", "_") if " " in prompt else prompt_str
-        prompt_str=prompt_str[:40]
-        if len(prompt_str) == 0:
-            prompt_str = 'empty_prompt'
-
-        global result_dir
-        global save_fps
-        if input_h > input_w:
-            batch_samples = untranspose(batch_samples)
-
-        save_videos(batch_samples, result_dir, filenames=[prompt_str], fps=save_fps)
-        print(f"Saved in {prompt_str}. Time used: {(time.time() - start):.2f} seconds")
-        model = model.cpu()
-        saved_result_dir = os.path.join(result_dir, f"{prompt_str}.mp4")
-        print("result saved to:", saved_result_dir)
-        return saved_result_dir
+            img_cat_cond = z[:,:,:1,:,:]
+            img_cat_cond = repeat(img_cat_cond, 'b c t h w -> b c (repeat t) h w', repeat=z.shape[2])
+        cond["c_concat"] = [img_cat_cond] # b c 1 h w
+
+    if unconditional_guidance_scale != 1.0:
+        if model.uncond_type == "empty_seq":
+            prompts = batch_size * [""]
+            uc_emb = model.get_learned_conditioning(prompts)
+        elif model.uncond_type == "zero_embed":
+            uc_emb = torch.zeros_like(cond_emb)
+        uc_img_emb = model.embedder(torch.zeros_like(img)) ## b l c
+        uc_img_emb = model.image_proj_model(uc_img_emb)
+        uc = {"c_crossattn": [torch.cat([uc_emb,uc_img_emb],dim=1)]}
+        if model.model.conditioning_key == 'hybrid':
+            uc["c_concat"] = [img_cat_cond]
+    else:
+        uc = None
+    #
+    # for i, h in enumerate(hs):
+    #     print("h:", h.shape)
+    #     hs[i] = hs[i][:,:,0,:,:].unsqueeze(2)
+    additional_decode_kwargs = {'ref_context': hs}
+    # additional_decode_kwargs = {'ref_context': None}
+
+    ## we need one more unconditioning image=yes, text=""
+    if multiple_cond_cfg and cfg_img != 1.0:
+        uc_2 = {"c_crossattn": [torch.cat([uc_emb,img_emb],dim=1)]}
+        if model.model.conditioning_key == 'hybrid':
+            uc_2["c_concat"] = [img_cat_cond]
+        kwargs.update({"unconditional_conditioning_img_nonetext": uc_2})
+    else:
+        kwargs.update({"unconditional_conditioning_img_nonetext": None})
 
+    z0 = None
+    cond_mask = None
 
-# @spaces.GPU
+    batch_variants = []
+    for _ in range(n_samples):
+
+        if z0 is not None:
+            cond_z0 = z0.clone()
+            kwargs.update({"clean_cond": True})
+        else:
+            cond_z0 = None
+        if ddim_sampler is not None:
+
+            samples, _ = ddim_sampler.sample(S=ddim_steps,
+                                             conditioning=cond,
+                                             batch_size=batch_size,
+                                             shape=noise_shape[1:],
+                                             verbose=False,
+                                             unconditional_guidance_scale=unconditional_guidance_scale,
+                                             unconditional_conditioning=uc,
+                                             eta=ddim_eta,
+                                             cfg_img=cfg_img,
+                                             mask=cond_mask,
+                                             x0=cond_z0,
+                                             fs=fs,
+                                             timestep_spacing=timestep_spacing,
+                                             guidance_rescale=guidance_rescale,
+                                             **kwargs
+                                             )
+
+            ## reconstruct from latent to pixel space
+            batch_images = model.decode_first_stage(samples, **additional_decode_kwargs)
+
+            index = list(range(samples.shape[2]))
+            del index[1]
+            del index[-2]
+            samples = samples[:,:,index,:,:]
+            ## reconstruct from latent to pixel space
+            batch_images_middle = model.decode_first_stage(samples, **additional_decode_kwargs)
+            batch_images[:,:,batch_images.shape[2]//2-1:batch_images.shape[2]//2+1] = batch_images_middle[:,:,batch_images.shape[2]//2-2:batch_images.shape[2]//2]
+
+            batch_variants.append(batch_images)
+    ## variants, batch, c, t, h, w
+    batch_variants = torch.stack(batch_variants)
+    return batch_variants.permute(1, 0, 2, 3, 4, 5)
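
Compared with the removed get_image, the new image_guided_synthesis above no longer builds a transform, moves the model onto the GPU, or writes the video file; it only prepares the conditioning and runs the DDIM sampler, so the surrounding app has to supply those pieces. The sketch below is a minimal, hypothetical wrapper showing how they could fit together; it is not part of this commit, and the transform definition, the fixed output file name, and the flag choices (text_input=True, interp=True) are assumptions.

# Hypothetical glue code, NOT part of this commit: one way the Gradio app could
# call the new image_guided_synthesis(). It assumes module-level objects that the
# rest of gradio_app.py already defines (model, resolution, result_dir, save_fps,
# save_videos) plus a module-level `transform`, which the new function references
# but no longer builds itself.
import os
import torch
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),                       # HWC uint8 -> CHW float in [0, 1]
    transforms.Lambda(lambda x: x * 2.0 - 1.0),  # rescale to [-1, 1], as the old get_image did
    transforms.Resize(min(resolution)),
    transforms.CenterCrop(resolution),
])

def get_image(image, image2, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
    """Hypothetical wrapper, reusing the get_image name the UI below still binds to fn=."""
    channels = model.model.diffusion_model.out_channels
    frames = model.temporal_length
    noise_shape = [1, channels, frames, resolution[0] // 8, resolution[1] // 8]
    with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float16):
        batch_samples = image_guided_synthesis(
            model, [prompt], image, image2, noise_shape,
            ddim_steps=steps, ddim_eta=eta,
            unconditional_guidance_scale=cfg_scale,
            fs=fs, seed=seed, text_input=True, interp=True,
        )
    # fixed file name for the sketch; the old code derived it from the prompt
    save_videos(batch_samples, result_dir, filenames=["sample"], fps=save_fps)
    return os.path.join(result_dir, "sample.mp4")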
 
@@ -314,17 +296,22 @@ def dynamicrafter_demo(result_dir='./tmp/', res=1024):
                         i2v_end_btn = gr.Button("Generate")
                     with gr.Column():
                         with gr.Row():
-                            i2v_input_sketch = gr.Image(label="Input End SKetch",elem_id="input_img2")
+                            i2v_input_image2 = gr.Image(label="Input Image 2",elem_id="input_img2")
                         with gr.Row():
                             i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
 
+
+                # s(model, prompts, image1, image2, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1., \
+                #   unconditional_guidance_scale=1.0, cfg_img=None, fs=None, text_input=False, multiple_cond_cfg=False, \
+                #   loop=False, interp=False, timestep_spacing='uniform', guidance_rescale=0.0, **kwargs):
+
                 gr.Examples(examples=i2v_examples_interp_1024,
-                            inputs=[i2v_input_image, i2v_input_sketch, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, control_scale],
+                            inputs=[i2v_input_image, i2v_input_text, i2v_input_image, i2v_input_image2, [72, 108], 1, i2v_steps, i2v_eta, 1.0, None, i2v_motion, i2v_seed],
                             outputs=[i2v_output_video],
                             fn = get_image,
                            cache_examples=False,
                 )
-                i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_sketch, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, control_scale],
+                i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_input_image, i2v_input_image2, [72, 108], 1, i2v_steps, i2v_eta, 1.0, None, i2v_motion, i2v_seed],
                                   outputs=[i2v_output_video],
                                   fn = get_image
                 )
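
A note on the wiring in this last hunk: Gradio's `inputs=` argument expects a list of components only and forwards their current values to the callback positionally, so raw literals such as `[72, 108]`, `1`, `1.0`, and `None` in the new lists are not passed through that way (and `i2v_input_image` appears twice). A common pattern is to bind such constants ahead of time and keep only components in `inputs=`. The sketch below does that with functools.partial; `run_interp` is a hypothetical handler standing in for whatever wrapper ends up calling image_guided_synthesis, so treat this as an illustration rather than the commit's intended fix.

# Hypothetical alternative wiring, NOT part of this commit: constants from the
# new inputs= lists are bound with functools.partial so that inputs= contains
# Gradio components only. run_interp is a made-up handler name.
from functools import partial

bound_fn = partial(
    run_interp,
    noise_shape_hw=[72, 108],          # constant latent height/width from the inputs list
    n_samples=1,
    unconditional_guidance_scale=1.0,
    cfg_img=None,
)

i2v_end_btn.click(
    fn=bound_fn,
    inputs=[i2v_input_image, i2v_input_text, i2v_input_image2,
            i2v_steps, i2v_eta, i2v_motion, i2v_seed],
    outputs=[i2v_output_video],
)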