Update app.py
app.py CHANGED
@@ -31,117 +31,117 @@ os.environ["NCCL_P2P_DISABLE"]="1"
 os.environ["NCCL_IB_DISABLE"]="1"

 import src.flux.generate
-… (removed import lines not preserved in this view)
+from src.flux.generate import generate_from_test_sample, seed_everything
+from src.flux.pipeline_tools import CustomFluxPipeline, load_modulation_adapter, load_dit_lora
+from src.utils.data_utils import get_train_config, image_grid, pil2tensor, json_dump, pad_to_square, cv2pil, merge_bboxes
+from eval.tools.face_id import FaceID
+from eval.tools.florence_sam import ObjectDetector
+import shutil
+import yaml
+import numpy as np
+from huggingface_hub import snapshot_download, hf_hub_download
 import torch

-… (removed setup lines not preserved in this view)
+# FLUX.1-dev
+snapshot_download(
+    repo_id="black-forest-labs/FLUX.1-dev",
+    local_dir="./checkpoints/FLUX.1-dev",
+    local_dir_use_symlinks=False
+)
+
+# Florence-2-large
+snapshot_download(
+    repo_id="microsoft/Florence-2-large",
+    local_dir="./checkpoints/Florence-2-large",
+    local_dir_use_symlinks=False
+)
+
+# CLIP ViT Large
+snapshot_download(
+    repo_id="openai/clip-vit-large-patch14",
+    local_dir="./checkpoints/clip-vit-large-patch14",
+    local_dir_use_symlinks=False
+)
+
+# DINO ViT-s16
+snapshot_download(
+    repo_id="facebook/dino-vits16",
+    local_dir="./checkpoints/dino-vits16",
+    local_dir_use_symlinks=False
+)
+
+# mPLUG Visual Question Answering
+snapshot_download(
+    repo_id="xingjianleng/mplug_visual-question-answering_coco_large_en",
+    local_dir="./checkpoints/mplug_visual-question-answering_coco_large_en",
+    local_dir_use_symlinks=False
+)
+
+# XVerse
+snapshot_download(
+    repo_id="ByteDance/XVerse",
+    local_dir="./checkpoints/XVerse",
+    local_dir_use_symlinks=False
+)
+
+hf_hub_download(
+    repo_id="facebook/sam2.1-hiera-large",
+    local_dir="./checkpoints/",
+    filename="sam2.1_hiera_large.pt",
+)
+
+
+os.environ["FLORENCE2_MODEL_PATH"] = "./checkpoints/Florence-2-large"
+os.environ["SAM2_MODEL_PATH"] = "./checkpoints/sam2.1_hiera_large.pt"
+os.environ["FACE_ID_MODEL_PATH"] = "./checkpoints/model_ir_se50.pth"
+os.environ["CLIP_MODEL_PATH"] = "./checkpoints/clip-vit-large-patch14"
+os.environ["FLUX_MODEL_PATH"] = "./checkpoints/FLUX.1-dev"
+os.environ["DPG_VQA_MODEL_PATH"] = "./checkpoints/mplug_visual-question-answering_coco_large_en"
+os.environ["DINO_MODEL_PATH"] = "./checkpoints/dino-vits16"
+
+dtype = torch.bfloat16
+device = "cuda"
+
+config_path = "train/config/XVerse_config_demo.yaml"
+
+config = config_train = get_train_config(config_path)
+# config["model"]["dit_quant"] = "int8-quanto"
+config["model"]["use_dit_lora"] = False
+model = CustomFluxPipeline(
+    config, device, torch_dtype=dtype,
+)
+model.pipe.set_progress_bar_config(leave=False)
+
+face_model = FaceID(device)
+detector = ObjectDetector(device)
+
+config = get_train_config(config_path)
+model.config = config
+
+run_mode = "mod_only"  # orig_only, mod_only, both
+store_attn_map = False
+run_name = time.strftime("%m%d-%H%M")
+
+num_inputs = 6
+
+ckpt_root = "./checkpoints/XVerse"
+model.clear_modulation_adapters()
+model.pipe.unload_lora_weights()
+if not os.path.exists(ckpt_root):
+    print("Checkpoint root does not exist.")
+
+modulation_adapter = load_modulation_adapter(model, config, dtype, device, f"{ckpt_root}/modulation_adapter", is_training=False)
+model.add_modulation_adapter(modulation_adapter)
+if config["model"]["use_dit_lora"]:
+    load_dit_lora(model, model.pipe, config, dtype, device, f"{ckpt_root}", is_training=False)
+
+vae_skip_iter = None
+attn_skip_iter = 0
+
+
+def clear_images():
+    return [None, ]*num_inputs

 @spaces.GPU()
 def det_seg_img(image, label):
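
Note on the hunk above: every model is pinned to a local path through environment variables, but `model_ir_se50.pth` (FACE_ID_MODEL_PATH) is not fetched by any of the download calls; it presumably ships inside the ByteDance/XVerse snapshot. A minimal, hypothetical fail-fast guard (not part of the commit) that could run right after the `os.environ` block:

import os

def assert_checkpoints_present():
    # Hypothetical guard: verify every path exported above exists
    # before any model is constructed.
    keys = [
        "FLORENCE2_MODEL_PATH", "SAM2_MODEL_PATH", "FACE_ID_MODEL_PATH",
        "CLIP_MODEL_PATH", "FLUX_MODEL_PATH", "DPG_VQA_MODEL_PATH",
        "DINO_MODEL_PATH",
    ]
    missing = [k for k in keys if not os.path.exists(os.environ.get(k, ""))]
    if missing:
        raise FileNotFoundError(f"missing checkpoints for: {', '.join(missing)}")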
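The commented-out `dit_quant` line in the hunk above hints at an optional int8 path through quanto. A hedged sketch of enabling it, assuming `CustomFluxPipeline` reads that config key (the commit itself leaves the DiT unquantized in bf16):

# Assumption: pipeline_tools honors config["model"]["dit_quant"].
config["model"]["dit_quant"] = "int8-quanto"
config["model"]["use_dit_lora"] = False
model = CustomFluxPipeline(config, device, torch_dtype=dtype)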
@@ -211,145 +211,143 @@ def generate_image(
     indexs,  # newly added parameter
     # *images_captions_faces, # Combine all unpacked arguments into one tuple
 ):
-    … (previous implementation removed; not preserved in this view)
+    torch.cuda.empty_cache()
+    num_images = 1
+
+    # Determine the number of images, captions, and faces based on the indexs length
+    images = list(images_captions_faces[:num_inputs])
+    captions = list(images_captions_faces[num_inputs:2 * num_inputs])
+    idips_checkboxes = list(images_captions_faces[2 * num_inputs:3 * num_inputs])
+    images = [images[i] for i in indexs]
+    captions = [captions[i] for i in indexs]
+    idips_checkboxes = [idips_checkboxes[i] for i in indexs]
+
+    print(f"Length of images: {len(images)}")
+    print(f"Length of captions: {len(captions)}")
+    print(f"Indexs: {indexs}")

+    print(f"Control weight lambda: {control_weight_lambda}")
+    if control_weight_lambda != "no":
+        parts = control_weight_lambda.split(',')
+        new_parts = []
+        for part in parts:
+            if ':' in part:
+                left, right = part.split(':')
+                values = right.split('/')
+                # keep the overall value
+                global_value = values[0]
+                id_value = values[1]
+                ip_value = values[2]
+                new_values = [global_value]
+                for is_id in idips_checkboxes:
+                    if is_id:
+                        new_values.append(id_value)
+                    else:
+                        new_values.append(ip_value)
+                new_part = f"{left}:{('/'.join(new_values))}"
+                new_parts.append(new_part)
+            else:
+                new_parts.append(part)
+        control_weight_lambda = ','.join(new_parts)

+    print(f"Control weight lambda: {control_weight_lambda}")
+
+    src_inputs = []
+    use_words = []
+    cur_run_time = time.strftime("%m%d-%H%M%S")
+    tmp_dir_root = f"tmp/gradio_demo/{run_name}"
+    temp_dir = f"{tmp_dir_root}/{cur_run_time}_{generate_random_string(4)}"
+    os.makedirs(temp_dir, exist_ok=True)
+    print(f"Temporary directory created: {temp_dir}")
+    for i, (image_path, caption) in enumerate(zip(images, captions)):
+        if image_path:
+            if caption.startswith("a ") or caption.startswith("A "):
+                word = caption[2:]
+            else:
+                word = caption

+            if f"ENT{i+1}" in prompt:
+                prompt = prompt.replace(f"ENT{i+1}", caption)

+            image = resize_keep_aspect_ratio(Image.open(image_path), 768)
+            save_path = f"{temp_dir}/tmp_resized_input_{i}.png"
+            image.save(save_path)

+            input_image_path = save_path
+
+            src_inputs.append(
+                {
+                    "image_path": input_image_path,
+                    "caption": caption
+                }
+            )
+            use_words.append((i, word, word))
+
+
+    test_sample = dict(
+        input_images=[], position_delta=[0, -32],
+        prompt=prompt,
+        target_height=target_height,
+        target_width=target_width,
+        seed=seed,
+        cond_size=cond_size,
+        vae_skip_iter=vae_skip_iter,
+        lora_scale=ip_scale,
+        control_weight_lambda=control_weight_lambda,
+        latent_sblora_scale=latent_sblora_scale_str,
+        condition_sblora_scale=vae_lora_scale,
+        double_attention=double_attention,
+        single_attention=single_attention,
+    )
+    if len(src_inputs) > 0:
+        test_sample["modulation"] = [
+            dict(
+                type="adapter",
+                src_inputs=src_inputs,
+                use_words=use_words,
+            ),
+        ]

+    json_dump(test_sample, f"{temp_dir}/test_sample.json", 'utf-8')
+    assert single_attention == True
+    target_size = int(round((target_width * target_height) ** 0.5) // 16 * 16)
+    print(test_sample)

+    model.config["train"]["dataset"]["val_condition_size"] = cond_size
+    model.config["train"]["dataset"]["val_target_size"] = target_size

+    if control_weight_lambda == "no":
+        control_weight_lambda = None
+    if vae_skip_iter == "no":
+        vae_skip_iter = None
+    use_condition_sblora_control = True
+    use_latent_sblora_control = True
+    image = generate_from_test_sample(
+        test_sample, model.pipe, model.config,
+        num_images=num_images,
+        target_height=target_height,
+        target_width=target_width,
+        seed=seed,
+        store_attn_map=store_attn_map,
+        vae_skip_iter=vae_skip_iter,  # use the new parameter
+        control_weight_lambda=control_weight_lambda,  # pass the new parameter
+        double_attention=double_attention,  # newly added parameter
+        single_attention=single_attention,  # newly added parameter
+        ip_scale=ip_scale,
+        use_latent_sblora_control=use_latent_sblora_control,
+        latent_sblora_scale=latent_sblora_scale_str,
+        use_condition_sblora_control=use_condition_sblora_control,
+        condition_sblora_scale=vae_lora_scale,
+    )
+    if isinstance(image, list):
+        num_cols = 2
+        num_rows = int(math.ceil(num_images / num_cols))
+        image = image_grid(image, num_rows, num_cols)
+
+    save_path = f"{temp_dir}/tmp_result.png"
+    image.save(save_path)
+
-    return None
+    return image
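
`generate_image` above receives every Gradio component value as one flat sequence (`images_captions_faces`, bound outside this hunk) and slices it into thirds before filtering by `indexs`. A standalone trace with toy values:

num_inputs = 6
# Toy stand-ins for six image paths, six captions, six ID checkboxes.
flat = (
    [f"img_{i}.png" for i in range(num_inputs)]
    + [f"caption {i}" for i in range(num_inputs)]
    + [i % 2 == 0 for i in range(num_inputs)]
)
images = list(flat[:num_inputs])                              # first third
captions = list(flat[num_inputs:2 * num_inputs])              # second third
idips_checkboxes = list(flat[2 * num_inputs:3 * num_inputs])  # last third

indexs = [0, 2]  # only the slots the user actually filled
images = [images[i] for i in indexs]      # ['img_0.png', 'img_2.png']
captions = [captions[i] for i in indexs]  # ['caption 0', 'caption 2']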
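The `control_weight_lambda` rewrite above expands a compact `range:global/ID/IP` spec into one weight per selected subject, choosing the ID weight when that subject's checkbox is ticked and the IP weight otherwise. The same logic as a pure function, with a worked example (the spec format is inferred from the code, not documented in the diff):

def expand_weights(spec: str, id_flags: list) -> str:
    # The caller handles spec == "no"; each part is "range:global/id/ip".
    out = []
    for part in spec.split(','):
        if ':' not in part:
            out.append(part)
            continue
        rng, weights = part.split(':')
        glob, id_w, ip_w = weights.split('/')
        vals = [glob] + [id_w if flag else ip_w for flag in id_flags]
        out.append(f"{rng}:{'/'.join(vals)}")
    return ','.join(out)

# Three subjects, the first two identity-checked:
assert expand_weights("0-1:1/0.9/0.6", [True, True, False]) == "0-1:1/0.9/0.9/0.6"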
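The per-subject loop above swaps `ENT1`…`ENTn` placeholders in the prompt for the corresponding captions. In isolation:

prompt = "ENT1 and ENT2 standing on a beach"
captions = ["a corgi", "a red robot"]
for i, caption in enumerate(captions):
    if f"ENT{i+1}" in prompt:
        prompt = prompt.replace(f"ENT{i+1}", caption)
assert prompt == "a corgi and a red robot standing on a beach"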
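`resize_keep_aspect_ratio` is called above with a 768-pixel budget but defined elsewhere in app.py; a plausible minimal version, assuming the second argument caps the longer edge:

from PIL import Image

def resize_keep_aspect_ratio(img: Image.Image, max_side: int) -> Image.Image:
    # Hypothetical reimplementation: scale so the longer edge equals
    # max_side while preserving the aspect ratio.
    w, h = img.size
    scale = max_side / max(w, h)
    return img.resize((max(1, round(w * scale)), max(1, round(h * scale))))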
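`target_size` above snaps the geometric mean of the requested dimensions down to a multiple of 16 (FLUX latents are patchified at 16-pixel granularity: 8x VAE downsampling times 2x2 patches). Traced for a 768x1024 request:

target_width, target_height = 768, 1024
target_size = int(round((target_width * target_height) ** 0.5) // 16 * 16)
# sqrt(786432) ~= 886.8 -> round -> 887 -> 887 // 16 * 16 -> 880
assert target_size == 880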
@@ -533,7 +531,7 @@ if __name__ == "__main__":
     )

     # # Update the outputs of the clear function
-    … (removed line not preserved in this view)
+    clear_btn.click(clear_images, outputs=images)

     face_btn_1.click(crop_face_img, inputs=[image_1], outputs=[image_1])
     det_btn_1.click(det_seg_img, inputs=[image_1, caption_1], outputs=[image_1])
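
The re-enabled `clear_btn.click` wiring pairs the module-level `clear_images` from the first hunk, which returns one `None` per slot, with the list of image components. A self-contained sketch of the same pattern (component construction here is illustrative; the real layout lives outside this diff):

import gradio as gr

num_inputs = 6

def clear_images():
    return [None] * num_inputs  # one value per output component

with gr.Blocks() as demo:
    images = [gr.Image(type="filepath", label=f"Image {i+1}") for i in range(num_inputs)]
    clear_btn = gr.Button("Clear Images")
    clear_btn.click(clear_images, outputs=images)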