Spaces:

Ryukijano
/

Fastest-image-generation

Runtime error

App Files Files Community

Ryukijano committed on Dec 9, 2024

Commit

c8f2370

verified ·

1 Parent(s): f819c94

Update custom_pipeline.py

Browse files

Files changed (1) hide show

custom_pipeline.py +50 -22

custom_pipeline.py CHANGED Viewed

@@ -130,29 +130,57 @@ class FluxWithCFGPipeline(FluxPipeline):
         # Handle guidance
         guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float16).expand(latents.shape[0]) if self.transformer.config.guidance_embeds else None
-        # 6. Denoising loop
-        for i, t in enumerate(timesteps):
-            if self.interrupt:
-                continue
-            timestep = t.expand(latents.shape[0]).to(latents.dtype)
-            noise_pred = self.transformer(
-                hidden_states=latents,
-                timestep=timestep / 1000,
-                guidance=guidance,
-                pooled_projections=pooled_prompt_embeds,
-                encoder_hidden_states=prompt_embeds,
-                txt_ids=text_ids,
-                img_ids=latent_image_ids,
-                joint_attention_kwargs=self.joint_attention_kwargs,
-                return_dict=False,
-            )[0]
-             # Yield intermediate result
-            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-            torch.cuda.empty_cache()
         # Final image
         return self._decode_latents_to_image(latents, height, width, output_type)

         # Handle guidance
         guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float16).expand(latents.shape[0]) if self.transformer.config.guidance_embeds else None
+        # Denoising loop as a nested helper, decorated with @staticmethod and @torch.jit.script
+        @staticmethod
+        @torch.jit.script
+        def _denoising_loop_static(latents, timesteps, pooled_prompt_embeds, prompt_embeds, text_ids, latent_image_ids, guidance, transformer, scheduler):
+            for i, t in enumerate(timesteps):
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+                noise_pred = transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    return_dict=False,
+                )[0]
+                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                torch.cuda.empty_cache()
+            return latents
+        # Wrap the denoising loop with torch.cuda.make_graphed_callables, passing sample inputs
+        self._denoising_loop = torch.cuda.make_graphed_callables(
+            _denoising_loop_static,
+            (
+                latents.clone(),  # Example inputs for warmup
+                timesteps.clone(),
+                pooled_prompt_embeds.clone(),
+                prompt_embeds.clone(),
+                text_ids.clone(),
+                latent_image_ids.clone(),
+                guidance.clone(),
+                self.transformer,
+                self.scheduler
+            )
+        )
+        # Run the wrapped denoising loop (self._denoising_loop) on the actual inputs
+        latents = self._denoising_loop(
+            latents,
+            timesteps,
+            pooled_prompt_embeds,
+            prompt_embeds,
+            text_ids,
+            latent_image_ids,
+            guidance,
+            self.transformer,
+            self.scheduler
+        )
         # Final image
         return self._decode_latents_to_image(latents, height, width, output_type)