Spaces: Running on Zero
Update inference_coz_single.py
inference_coz_single.py CHANGED (+57 -25)
@@ -71,7 +71,7 @@ def _generate_vlm_prompt(
         return_tensors="pt",
     ).to(device)

-    # (4) Generate tokens→decode
+    # (4) Generate tokens → decode
     generated = vlm_model.generate(**inputs, max_new_tokens=128)
     # strip off the prompt tokens from each generated sequence:
     trimmed = [
@@ -86,28 +86,46 @@ def _generate_vlm_prompt(


 # -------------------------------------------------------------------
-# Main Function: recursive_multiscale_sr
+# Main Function: recursive_multiscale_sr (with multiple centers)
 # -------------------------------------------------------------------
 def recursive_multiscale_sr(
     input_png_path: str,
     upscale: int,
-
+    rec_num: int = 4,
+    centers: list[tuple[float, float]] = None,
+) -> tuple[list[Image.Image], list[str]]:
     """
-    Perform
+    Perform `rec_num` recursive_multiscale super-resolution steps on a single PNG.
     - input_png_path: path to a single .png file on disk.
     - upscale: integer up-scale factor per recursion (e.g. 4).
-
-
-
-
-
+    - rec_num: how many recursion steps to perform.
+    - centers: a list of normalized (x, y) tuples in [0, 1], one per recursion step,
+      indicating where to center the low-res crop for each step. The list
+      length must equal rec_num. If centers is None, defaults to center=(0.5, 0.5)
+      for all steps.
+
+    Returns a tuple (sr_pil_list, prompt_list), where:
+    - sr_pil_list: list of PIL.Image outputs [SR1, SR2, …, SR_rec_num] in order.
+    - prompt_list: list of the VLM prompts generated at each recursion.
     """
+    ###############################
+    # 0. Validate / fill default centers
+    ###############################
+    if centers is None:
+        # Default: use center (0.5, 0.5) for every recursion
+        centers = [(0.5, 0.5) for _ in range(rec_num)]
+    else:
+        if not isinstance(centers, (list, tuple)) or len(centers) != rec_num:
+            raise ValueError(
+                f"`centers` must be a list of {rec_num} (x,y) tuples, but got length {len(centers)}."
+            )
+
     ###############################
     # 1. Fixed hyper-parameters
     ###############################
     device = "cuda"
     process_size = 512  # same as args.process_size
-
+
     # model checkpoint paths (hard-coded to your example)
     LORA_PATH = "ckpt/SR_LoRA/model_20001.pkl"
     VAE_PATH = "ckpt/SR_VAE/vae_encoder_20001.pt"
@@ -142,7 +160,7 @@ def recursive_multiscale_sr(
     ###############################
     # 3.1 Instantiate the underlying SD3-Euler UNet/VAE/text encoders
     sd3 = SD3Euler()
-    # move all text encoders+transformer+VAE to CUDA:
+    # move all text encoders + transformer + VAE to CUDA:
     sd3.text_enc_1.to(device)
     sd3.text_enc_2.to(device)
     sd3.text_enc_3.to(device)
@@ -163,7 +181,7 @@ def recursive_multiscale_sr(
     # (by default, “model_test(...)” takes (lq_tensor, prompt=str) and returns a list[tensor])

     ###############################
-    # 4. Load the VLM (Qwen2.5-VL)
+    # 4. Load the VLM (Qwen2.5-VL)
     ###############################
     vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         VLM_NAME,
@@ -173,7 +191,7 @@ def recursive_multiscale_sr(
     vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)

     ###############################
-    # 5. Pre-allocate a Temporary Directory
+    # 5. Pre-allocate a Temporary Directory
     # to hold intermediate JPEG/PNG files
     ###############################
     unique_id = uuid.uuid4().hex
@@ -193,23 +211,37 @@ def recursive_multiscale_sr(
     prev_path = os.path.join(td, "step0_prev.png")
     img0.save(prev_path)

-    # We will maintain
+    # We will maintain lists of PIL outputs and prompts:
     sr_pil_list: list[Image.Image] = []
-    prompt_list = []
+    prompt_list: list[str] = []

     ###############################
-    # 7. Recursion loop (
+    # 7. Recursion loop (now up to rec_num times)
     ###############################
     for rec in range(rec_num):
-        # (A)
+        # (A) Load the previous SR output (or original) and compute crop window
         prev_pil = Image.open(prev_path).convert("RGB")
-        w, h = prev_pil.size
+        w, h = prev_pil.size  # should be (512×512) each time
+
+        # (1) Compute the “low-res” window size:
         new_w, new_h = w // upscale, h // upscale  # e.g. 128×128 for upscale=4
-
-
-
+
+        # (2) Map normalized center → pixel center, then clamp so crop stays in bounds:
+        cx_norm, cy_norm = centers[rec]
+        cx = int(cx_norm * w)
+        cy = int(cy_norm * h)
+        half_w = new_w // 2
+        half_h = new_h // 2
+
+        # If center in pixels is too close to left/top, clamp so left=0 or top=0; same on right/bottom
+        left = cx - half_w
+        top = cy - half_h
+        # clamp left ∈ [0, w - new_w], top ∈ [0, h - new_h]
+        left = max(0, min(left, w - new_w))
+        top = max(0, min(top, h - new_h))
         right = left + new_w
         bottom = top + new_h
+
         cropped = prev_pil.crop((left, top, right, bottom))

         # (B) Resize that crop back up to (512×512) via BICUBIC → zoomed
@@ -228,7 +260,7 @@ def recursive_multiscale_sr(
         )
         # (By default, no extra user prompt is appended.)

-        # (D) Prepare the low-res tensor for SR: convert zoomed→Tensor→[0,1]→[−1,1]
+        # (D) Prepare the low-res tensor for SR: convert zoomed → Tensor → [0,1] → [−1,1]
         to_tensor = transforms.ToTensor()
         lq = to_tensor(zoomed).unsqueeze(0).to(device)  # shape (1,3,512,512)
         lq = (lq * 2.0) - 1.0
@@ -252,7 +284,7 @@ def recursive_multiscale_sr(
     # end for(rec)

     ###############################
-    # 8. Return the
+    # 8. Return the SR outputs & prompts
     ###############################
-    # The list sr_pil_list = [ SR1, SR2,
-    return sr_pil_list, prompt_list
+    # The list sr_pil_list = [ SR1, SR2, …, SR_rec_num ] in order.
+    return sr_pil_list, prompt_list
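
A note on the new crop-window math: each step cuts a (w/upscale)×(h/upscale) window out of the previous 512×512 output, centered at the normalized (x, y) from `centers`, and clamps the window so it never leaves the image. A minimal standalone sketch of that arithmetic, assuming the script's 512×512 working size, upscale=4, and a hypothetical center of (0.9, 0.1):

w = h = 512
upscale = 4
new_w, new_h = w // upscale, h // upscale      # 128×128 low-res window

cx_norm, cy_norm = (0.9, 0.1)                  # hypothetical per-step center
cx, cy = int(cx_norm * w), int(cy_norm * h)    # pixel center (460, 51)
half_w, half_h = new_w // 2, new_h // 2        # 64, 64

left = max(0, min(cx - half_w, w - new_w))     # 396, clamped to 384
top = max(0, min(cy - half_h, h - new_h))      # -13, clamped to 0
right, bottom = left + new_w, top + new_h
print((left, top, right, bottom))              # (384, 0, 512, 128): flush against the top-right edge

So a center requested near an edge degrades gracefully to an edge-aligned crop rather than raising an error.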
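And a hedged usage sketch of the updated entry point, written against the signature shown in the diff (not part of the commit; "input.png" and the output filenames are placeholders, and the call assumes the hard-coded ckpt/ checkpoints and a CUDA device are available):

from inference_coz_single import recursive_multiscale_sr

# Four recursion steps, drifting the zoom center from the middle
# toward the upper-left quadrant; omit `centers` for a pure center zoom.
sr_images, prompts = recursive_multiscale_sr(
    "input.png",
    upscale=4,
    rec_num=4,
    centers=[(0.5, 0.5), (0.4, 0.4), (0.3, 0.3), (0.25, 0.25)],
)
for i, (img, prompt) in enumerate(zip(sr_images, prompts), start=1):
    img.save(f"sr_step{i}.png")   # SR1 … SR4, each 512×512
    print(f"step {i} VLM prompt: {prompt}")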