nyanko7 committed
Commit e11ace5
Parent: 33db744

Update app.py

Files changed (1):
  app.py  +53 -29
app.py CHANGED
@@ -1,6 +1,5 @@
-import os
+# import os
 import spaces
-from dataclasses import dataclass
 
 import gradio as gr
 import torch
@@ -22,6 +21,7 @@ from torch import Tensor, nn
 from transformers import CLIPTextModel, CLIPTokenizer
 from transformers import T5EncoderModel, T5Tokenizer
 from safetensors.torch import load_file
+# from torch.profiler import profile, record_function, ProfilerActivity
 # from optimum.quanto import freeze, qfloat8, quantize
 
 
@@ -216,18 +216,27 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
     q, k = apply_rope(q, k, pe)
 
     x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
-    x = rearrange(x, "B H L D -> B L (H D)")
+    # x = rearrange(x, "B H L D -> B L (H D)")
+    x = x.permute(0, 2, 1, 3).contiguous().reshape(x.size(0), x.size(2), -1)
 
     return x
 
 
-def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
-    assert dim % 2 == 0
+def rope(pos, dim, theta):
     scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
-    omega = 1.0 / (theta**scale)
-    out = torch.einsum("...n,d->...nd", pos, omega)
-    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
-    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
+    omega = 1.0 / (theta ** scale)
+
+    # out = torch.einsum("...n,d->...nd", pos, omega)
+    out = pos.unsqueeze(-1) * omega.unsqueeze(0)
+
+    cos_out = torch.cos(out)
+    sin_out = torch.sin(out)
+    out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
+
+    # out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
+    b, n, d, _ = out.shape
+    out = out.view(b, n, d, 2, 2)
+
     return out.float()
 
 
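Note (not part of the commit): the permute/reshape above is intended to reproduce the removed einops rearrange, and the new rope body builds the same 2x2 rotation table as the einsum/stack/rearrange version it replaces. A minimal sanity-check sketch, assuming torch and einops are installed:

import torch
from einops import rearrange

# attention output merge: B H L D -> B L (H*D)
x = torch.randn(2, 4, 8, 16)
a = rearrange(x, "B H L D -> B L (H D)")
b = x.permute(0, 2, 1, 3).contiguous().reshape(x.size(0), x.size(2), -1)
assert torch.equal(a, b)

# rope frequency table: the broadcasted product matches the einsum formulation
pos = torch.arange(8, dtype=torch.float64).unsqueeze(0)  # (batch=1, n=8) positions
dim, theta = 16, 10_000
scale = torch.arange(0, dim, 2, dtype=torch.float64) / dim
omega = 1.0 / (theta ** scale)
assert torch.allclose(torch.einsum("...n,d->...nd", pos, omega),
                      pos.unsqueeze(-1) * omega.unsqueeze(0))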
@@ -267,9 +276,12 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     """
     t = time_factor * t
     half = dim // 2
-    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
-        t.device
-    )
+
+    # Does not block the CUDA stream, but differs from the official Flux code by about 1e-4:
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
+
+    # Blocks the CUDA stream, but is consistent with the official code:
+    # freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device)
 
     args = t[:, None].float() * freqs[None]
     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
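Note (not part of the commit): both freqs variants compute the same values; building the arange directly on t.device simply avoids allocating it on the CPU and copying it to the GPU on every call. A quick comparison sketch:

import math
import torch

max_period, half = 10000, 128
device = torch.device("cpu")  # the same comparison applies with device="cuda"

f1 = torch.exp(-math.log(max_period) * torch.arange(0, half, dtype=torch.float32, device=device) / half)
f2 = torch.exp(-math.log(max_period) * torch.arange(0, half, dtype=torch.float32) / half).to(device)
print((f1 - f2).abs().max())  # 0 on CPU; on CUDA only tiny rounding-level differences from exp are possible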
@@ -327,7 +339,10 @@ class SelfAttention(nn.Module):
 
     def forward(self, x: Tensor, pe: Tensor) -> Tensor:
         qkv = self.qkv(x)
-        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        # q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = qkv.shape
+        qkv = qkv.view(B, L, 3, self.num_heads, -1)
+        q, k, v = qkv.permute(2, 0, 3, 1, 4)
         q, k = self.norm(q, k, v)
         x = attention(q, k, v, pe=pe)
         x = self.proj(x)
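Note (not part of the commit): this view/permute pattern, repeated below in DoubleStreamBlock and SingleStreamBlock, is meant to be equivalent to the removed rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=num_heads). A minimal check, assuming einops is installed:

import torch
from einops import rearrange

B, L, H, D = 2, 10, 4, 16
qkv = torch.randn(B, L, 3 * H * D)

q0, k0, v0 = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=H)
q1, k1, v1 = qkv.view(B, L, 3, H, D).permute(2, 0, 3, 1, 4)

assert torch.equal(q0, q1) and torch.equal(k0, k1) and torch.equal(v0, v1)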
@@ -394,14 +409,20 @@ class DoubleStreamBlock(nn.Module):
         img_modulated = self.img_norm1(img)
         img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
         img_qkv = self.img_attn.qkv(img_modulated)
-        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        # img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = img_qkv.shape
+        H = self.num_heads
+        D = img_qkv.shape[-1] // (3 * H)
+        img_q, img_k, img_v = img_qkv.view(B, L, 3, H, D).permute(2, 0, 3, 1, 4)
         img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
 
         # prepare txt for attention
         txt_modulated = self.txt_norm1(txt)
         txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
         txt_qkv = self.txt_attn.qkv(txt_modulated)
-        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        # txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        B, L, _ = txt_qkv.shape
+        txt_q, txt_k, txt_v = txt_qkv.view(B, L, 3, H, D).permute(2, 0, 3, 1, 4)
         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
 
         # run actual attention
@@ -460,7 +481,9 @@ class SingleStreamBlock(nn.Module):
         x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
         qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
 
-        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        # q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        qkv = qkv.view(qkv.size(0), qkv.size(1), 3, self.num_heads, self.hidden_size // self.num_heads)
+        q, k, v = qkv.permute(2, 0, 3, 1, 4)
         q, k = self.norm(q, k, v)
 
         # compute attention
@@ -677,9 +700,7 @@ def denoise(
             timesteps=t_vec,
             guidance=guidance_vec,
         )
-
         img = img + (t_prev - t_curr) * pred
-
     return img
 
 
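Note (not part of the commit): the update above is a plain Euler step of the flow-matching ODE: the model predicts a velocity and the latent moves from t_curr to t_prev along it. Schematically, with hypothetical names:

def euler_integrate(x, velocity_fn, timesteps):
    # timesteps runs from 1.0 down to 0.0; each step advances x by (t_prev - t_curr) * velocity
    for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
        x = x + (t_prev - t_curr) * velocity_fn(x, t_curr)
    return x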
@@ -723,7 +744,7 @@ from safetensors.torch import load_file
 
 sd = load_file(hf_hub_download(repo_id="lllyasviel/flux1-dev-bnb-nf4", filename="flux1-dev-bnb-nf4.safetensors"))
 sd = {k.replace("model.diffusion_model.", ""): v for k, v in sd.items() if "model.diffusion_model" in k}
-model = Flux().to(dtype=torch.float16, device="cuda")
+model = Flux().to(dtype=torch.bfloat16, device="cuda")
 result = model.load_state_dict(sd)
 print(result)
 
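Note (not part of the commit): switching the model's compute dtype from float16 to bfloat16 trades mantissa precision for float32's exponent range, so large intermediate values that would overflow float16 (max ~65504) stay finite. For example:

import torch

x = torch.tensor(70000.0)
print(x.to(torch.float16))   # inf  (past float16's maximum of ~65504)
print(x.to(torch.bfloat16))  # 70144.0 (coarser, but finite)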
@@ -731,7 +752,7 @@ print(result)
 # result = model.load_state_dict(load_file("/storage/dev/nyanko/flux-dev/flux1-dev.sft"))
 
 @spaces.GPU
-@torch.inference_mode()
+@torch.no_grad()
 def generate_image(
     prompt, width, height, guidance, seed,
     do_img2img, init_image, image2image_strength, resize_img,
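Note (not part of the commit): both decorators disable gradient tracking. torch.inference_mode() is the stricter one: tensors created under it become inference tensors that cannot later participate in autograd, which can raise errors when outputs are reused outside the decorated call, whereas torch.no_grad() only pauses grad recording. A small illustration:

import torch

w = torch.ones(3, requires_grad=True)

with torch.no_grad():
    a = w * 2  # ordinary tensor, gradients simply not recorded
with torch.inference_mode():
    b = w * 2  # inference tensor

(a * w).sum().backward()      # fine: a acts as a constant
try:
    (b * w).sum().backward()  # RuntimeError: inference tensors cannot be used in autograd
except RuntimeError as err:
    print(err)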
@@ -742,7 +763,7 @@ def generate_image(
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch_device = torch.device(device)
-
+
     global model
     model = model.to(torch_device)
 
@@ -761,7 +782,7 @@ def generate_image(
     generator = torch.Generator(device=device).manual_seed(seed)
     x = torch.randn(1, 16, 2 * math.ceil(height / 16), 2 * math.ceil(width / 16), device=device, dtype=torch.bfloat16, generator=generator)
 
-    num_steps = 25
+    num_steps = 20
     timesteps = get_schedule(num_steps, (x.shape[-1] * x.shape[-2]) // 4, shift=True)
 
     if do_img2img and init_image is not None:
@@ -770,13 +791,16 @@ def generate_image(
         timesteps = timesteps[t_idx:]
         x = t * x + (1.0 - t) * init_image.to(x.dtype)
 
-    with torch_device:
-        inp = prepare(t5=t5, clip=clip, img=x, prompt=prompt)
-        x = denoise(model, **inp, timesteps=timesteps, guidance=guidance)
-        x = unpack(x.float(), height, width)
-        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
-            x = x = (x / ae.config.scaling_factor) + ae.config.shift_factor
-            x = ae.decode(x).sample
+    inp = prepare(t5=t5, clip=clip, img=x, prompt=prompt)
+    x = denoise(model, **inp, timesteps=timesteps, guidance=guidance)
+
+    # with profile(activities=[ProfilerActivity.CPU],record_shapes=True,profile_memory=True) as prof:
+    #     print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+
+    x = unpack(x.float(), height, width)
+    with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
+        x = x = (x / ae.config.scaling_factor) + ae.config.shift_factor
+        x = ae.decode(x).sample
 
     x = x.clamp(-1, 1)
     x = rearrange(x[0], "c h w -> h w c")
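Note (not part of the commit): the tail of the pipeline now runs outside the removed `with torch_device:` block: denoised tokens are unpacked back into a latent image, rescaled with the autoencoder's scaling_factor and shift_factor, and decoded under bfloat16 autocast. A condensed sketch, assuming ae is the diffusers-style autoencoder loaded elsewhere in app.py and unpack is the helper defined above:

import torch

def decode_latents(x, ae, height, width, device_type="cuda"):
    x = unpack(x.float(), height, width)  # flattened patch tokens -> latent image
    with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
        x = x / ae.config.scaling_factor + ae.config.shift_factor
        x = ae.decode(x).sample           # latents -> pixel space
    return x.clamp(-1, 1)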
 