Spaces:

TongkunGuan
/

Token-level_Text_Image_Foundation_Model

Running

App Files Files Community

TongkunGuan committed on Mar 11

Commit

b6f1806

verified ·

1 Parent(s): 77bca7f

Update app.py

Browse files

Files changed (1) hide show

app.py +228 -72

app.py CHANGED Viewed

@@ -1,3 +1,222 @@
 import os
 import argparse
 import numpy as np
@@ -23,12 +242,9 @@ CHECKPOINTS = {
 HF_TOKEN = os.getenv("HF_TOKEN")
 current_vis = []
 current_bpe = []
-current_index = 0
 def load_model(check_type):
-    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    device = torch.device("cuda")
     if check_type == 'R50':
         tokenizer = load_tokenizer('tokenizer_path')
         model = build_model(argparse.Namespace()).eval()
@@ -55,7 +271,7 @@ def load_model(check_type):
     return model.to(device), tokenizer, transform, device
 def process_image(model, tokenizer, transform, device, check_type, image, text):
-    global current_vis, current_bpe, current_index
     src_size = image.size
     if 'TokenOCR' in check_type:
         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
@@ -80,10 +296,6 @@ def process_image(model, tokenizer, transform, device, check_type, image, text):
             text_embeds = model.tok_embeddings(input_ids)
         vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
-        print("vit_embeds",vit_embeds)
-        print("vit_embeds,shape",vit_embeds.shape)
-        print("target_ratio",target_ratio)
-        print("check_type",check_type)
         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
         # 计算相似度
@@ -92,43 +304,20 @@ def process_image(model, tokenizer, transform, device, check_type, image, text):
         similarity = text_embeds @ vit_embeds.T
         resized_size = size1 if size1 is not None else size2
-    # print(f"text_embeds shape: {text_embeds.shape}, numel: {text_embeds.numel()}") # text_embeds shape: torch.Size([4, 2048]), numel: 8192
-    # print(f"vit_embeds shape: {vit_embeds.shape}, numel: {vit_embeds.numel()}") # vit_embeds shape: torch.Size([9728, 2048]), numel: 19922944
-    # print(f"similarity shape: {similarity.shape}, numel: {similarity.numel()}")# similarity shape: torch.Size([4, 9728]), numel: 38912
     # 生成可视化
     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
-    # attn_map = similarity.reshape(len(text_embeds), *target_ratio)
     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
     current_vis = generate_similiarity_map([image], attn_map,
                                            [tokenizer.decode([i]) for i in input_ids],
                                            [], target_ratio, src_size)
     current_bpe = [tokenizer.decode([i]) for i in input_ids]
-    # current_bpe[-1] = 'Input text'
     current_bpe[-1] = text
-    print("current_vis",len(current_vis))
-    print("current_bpe",len(current_bpe))
-    return image, current_vis[0], current_bpe[0]
-# 事件处理函数
-def update_index(change):
-    global current_vis, current_bpe, current_index
-    current_index = max(0, min(len(current_vis) - 1, current_index + change))
-    return current_vis[current_index], format_bpe_display(current_bpe[current_index])
 def format_bpe_display(bpe):
-    # 使用HTML标签来设置字体大小、颜色，加粗，并居中
     return f"<div style='text-align:center; font-size:20px;'><strong>Current BPE: <span style='color:red;'>{bpe}</span></strong></div>"
-def update_slider_index(x):
-    print(f"x: {x}, current_vis length: {len(current_vis)}, current_bpe length: {len(current_bpe)}")
-    if 0 <= x < len(current_vis) and 0 <= x < len(current_bpe):
-        return current_vis[x], format_bpe_display(current_bpe[x])
-    else:
-        return None, "索引超出范围"
 # Gradio界面
 with gr.Blocks(title="BPE Visualization Demo") as demo:
     gr.Markdown("## BPE Visualization Demo - TokenFD基座模型能力可视化")
@@ -138,7 +327,7 @@ with gr.Blocks(title="BPE Visualization Demo") as demo:
             model_type = gr.Dropdown(
                 choices=["TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg", "R50", "R50_siglip"],
                 label="Select model type",
-                value="TokenOCR_4096_English_seg"  # 设置默认值为第一个选项
             )
             image_input = gr.Image(label="Upload images", type="pil")
             text_input = gr.Textbox(label="Input text")
@@ -162,57 +351,24 @@ with gr.Blocks(title="BPE Visualization Demo") as demo:
                 orig_img = gr.Image(label="Original picture", interactive=False)
                 heatmap = gr.Image(label="BPE visualization", interactive=False)
-            with gr.Row() as controls:
-                prev_btn = gr.Button("⬅ Last", visible=False)
-                index_slider = gr.Slider(0, 1, value=0, step=1, label="BPE index", visible=False)
-                next_btn = gr.Button("⮕ Next", visible=False)
             bpe_display = gr.Markdown("Current BPE: ", visible=False)
     # 事件处理
     @spaces.GPU
     def on_run_clicked(model_type, image, text):
-        global current_vis, current_bpe, current_index
-        current_index = 0  # Reset index when new image is processed
         image, vis, bpe = process_image(*load_model(model_type), model_type, image, text)
-        # Update the slider range and set value to 0
-        slider_max_val = len(current_bpe) - 1
         bpe_text = format_bpe_display(bpe)
-        print("current_vis",len(current_vis))
-        print("current_bpe",len(current_bpe))
-        return image, vis, bpe_text, slider_max_val
     run_btn.click(
         on_run_clicked,
         inputs=[model_type, image_input, text_input],
-        outputs=[orig_img, heatmap, bpe_display, index_slider],
     ).then(
-        lambda max_val: (gr.update(visible=True), gr.update(visible=True, maximum=max_val, value=0), gr.update(visible=True), gr.update(visible=True)),
-        inputs=index_slider,
-        outputs=[prev_btn, index_slider, next_btn, bpe_display],
     )
-    prev_btn.click(
-        lambda: (*update_index(-1), current_index),
-        outputs=[heatmap, bpe_display, index_slider]
-    )
-    next_btn.click(
-        lambda: (*update_index(1), current_index),
-        outputs=[heatmap, bpe_display, index_slider]
-    )
-    # index_slider.change(
-    #     lambda x: (current_vis[x], format_bpe_display(current_bpe[x])) if 0<=x<len(current_vis else (None,"Invaild")
-    #     inputs=index_slider,
-    #     outputs=[heatmap, bpe_display]
-    # )
-    index_slider.change(
-            update_slider_index,
-            inputs=index_slider,
-            outputs=[heatmap, bpe_display]
-        )
 if __name__ == "__main__":
     demo.launch()

+# import os
+# import argparse
+# import numpy as np
+# from PIL import Image
+# import torch
+# import torchvision.transforms as T
+# from transformers import AutoTokenizer
+# import gradio as gr
+# from resnet50 import build_model
+# from utils import generate_similiarity_map, post_process, load_tokenizer, build_transform_R50
+# from utils import IMAGENET_MEAN, IMAGENET_STD
+# from internvl.train.dataset import dynamic_preprocess
+# from internvl.model.internvl_chat import InternVLChatModel
+# import spaces
+# # 模型配置
+# CHECKPOINTS = {
+#     "TokenFD_4096_English_seg": "TongkunGuan/TokenFD_4096_English_seg",
+#     "TokenFD_2048_Bilingual_seg": "TongkunGuan/TokenFD_2048_Bilingual_seg",
+# }
+# # 全局变量
+# HF_TOKEN = os.getenv("HF_TOKEN")
+# current_vis = []
+# current_bpe = []
+# current_index = 0
+# def load_model(check_type):
+#     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#     device = torch.device("cuda")
+#     if check_type == 'R50':
+#         tokenizer = load_tokenizer('tokenizer_path')
+#         model = build_model(argparse.Namespace()).eval()
+#         model.load_state_dict(torch.load(CHECKPOINTS['R50'], map_location='cpu')['model'])
+#         transform = build_transform_R50(normalize_type='imagenet')
+#     elif check_type == 'R50_siglip':
+#         tokenizer = load_tokenizer('tokenizer_path')
+#         model = build_model(argparse.Namespace()).eval()
+#         model.load_state_dict(torch.load(CHECKPOINTS['R50_siglip'], map_location='cpu')['model'])
+#         transform = build_transform_R50(normalize_type='imagenet')
+#     elif 'TokenFD' in check_type:
+#         model_path = CHECKPOINTS[check_type]
+#         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, use_auth_token=HF_TOKEN)
+#         model = InternVLChatModel.from_pretrained(model_path, torch_dtype=torch.bfloat16).eval()
+#         transform = T.Compose([
+#             T.Lambda(lambda img: img.convert('RGB')),
+#             T.Resize((224, 224)),
+#             T.ToTensor(),
+#             T.Normalize(IMAGENET_MEAN, IMAGENET_STD)
+#         ])
+#     return model.to(device), tokenizer, transform, device
+# def process_image(model, tokenizer, transform, device, check_type, image, text):
+#     global current_vis, current_bpe, current_index
+#     src_size = image.size
+#     if 'TokenOCR' in check_type:
+#         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
+#                                                   image_size=model.config.force_image_size,
+#                                                   use_thumbnail=model.config.use_thumbnail,
+#                                                   return_ratio=True)
+#         pixel_values = torch.stack([transform(img) for img in images]).to(device)
+#     else:
+#         pixel_values = torch.stack([transform(image)]).to(device)
+#         target_ratio = (1, 1)
+#     # 文本处理
+#     text += ' '
+#     input_ids = tokenizer(text)['input_ids'][1:]
+#     input_ids = torch.tensor(input_ids, device=device)
+#     # 获取嵌入
+#     with torch.no_grad():
+#         if 'R50' in check_type:
+#             text_embeds = model.language_embedding(input_ids)
+#         else:
+#             text_embeds = model.tok_embeddings(input_ids)
+#         vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
+#         print("vit_embeds",vit_embeds)
+#         print("vit_embeds,shape",vit_embeds.shape)
+#         print("target_ratio",target_ratio)
+#         print("check_type",check_type)
+#         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
+#         # 计算相似度
+#         text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
+#         vit_embeds = vit_embeds / vit_embeds.norm(dim=-1, keepdim=True)
+#         similarity = text_embeds @ vit_embeds.T
+#         resized_size = size1 if size1 is not None else size2
+#     # print(f"text_embeds shape: {text_embeds.shape}, numel: {text_embeds.numel()}") # text_embeds shape: torch.Size([4, 2048]), numel: 8192
+#     # print(f"vit_embeds shape: {vit_embeds.shape}, numel: {vit_embeds.numel()}") # vit_embeds shape: torch.Size([9728, 2048]), numel: 19922944
+#     # print(f"similarity shape: {similarity.shape}, numel: {similarity.numel()}")# similarity shape: torch.Size([4, 9728]), numel: 38912
+#     # 生成可视化
+#     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
+#     # attn_map = similarity.reshape(len(text_embeds), *target_ratio)
+#     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
+#     current_vis = generate_similiarity_map([image], attn_map,
+#                                            [tokenizer.decode([i]) for i in input_ids],
+#                                            [], target_ratio, src_size)
+#     current_bpe = [tokenizer.decode([i]) for i in input_ids]
+#     # current_bpe[-1] = 'Input text'
+#     current_bpe[-1] = text
+#     print("current_vis",len(current_vis))
+#     print("current_bpe",len(current_bpe))
+#     return image, current_vis[0], current_bpe[0]
+# # 事件处理函数
+# def update_index(change):
+#     global current_vis, current_bpe, current_index
+#     current_index = max(0, min(len(current_vis) - 1, current_index + change))
+#     return current_vis[current_index], format_bpe_display(current_bpe[current_index])
+# def format_bpe_display(bpe):
+#     # 使用HTML标签来设置字体大小、颜色，加粗，并居中
+#     return f"<div style='text-align:center; font-size:20px;'><strong>Current BPE: <span style='color:red;'>{bpe}</span></strong></div>"
+# def update_slider_index(x):
+#     print(f"x: {x}, current_vis length: {len(current_vis)}, current_bpe length: {len(current_bpe)}")
+#     if 0 <= x < len(current_vis) and 0 <= x < len(current_bpe):
+#         return current_vis[x], format_bpe_display(current_bpe[x])
+#     else:
+#         return None, "索引超出范围"
+# # Gradio界面
+# with gr.Blocks(title="BPE Visualization Demo") as demo:
+#     gr.Markdown("## BPE Visualization Demo - TokenFD基座模型能力可视化")
+#     with gr.Row():
+#         with gr.Column(scale=0.5):
+#             model_type = gr.Dropdown(
+#                 choices=["TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg", "R50", "R50_siglip"],
+#                 label="Select model type",
+#                 value="TokenOCR_4096_English_seg"  # 设置默认值为第一个选项
+#             )
+#             image_input = gr.Image(label="Upload images", type="pil")
+#             text_input = gr.Textbox(label="Input text")
+#             run_btn = gr.Button("RUN")
+#             gr.Examples(
+#                 examples=[
+#                     [os.path.join("examples", "examples0.jpg"), "Veterans and Benefits"],
+#                     [os.path.join("examples", "examples1.jpg"), "Refreshers"],
+#                     [os.path.join("examples", "examples2.png"), "Vision Transformer"]
+#                 ],
+#                 inputs=[image_input, text_input],
+#                 label="Sample input"
+#             )
+#         with gr.Column(scale=2):
+#             gr.Markdown("<p style='font-size:20px;'><span style='color:red;'>If the input text is not included in the image</span>, the attention map will show a lot of noise (the actual response value is very low), since we normalize the attention map according to the relative value.</p>")
+#             with gr.Row():
+#                 orig_img = gr.Image(label="Original picture", interactive=False)
+#                 heatmap = gr.Image(label="BPE visualization", interactive=False)
+#             with gr.Row() as controls:
+#                 prev_btn = gr.Button("⬅ Last", visible=False)
+#                 index_slider = gr.Slider(0, 1, value=0, step=1, label="BPE index", visible=False)
+#                 next_btn = gr.Button("⮕ Next", visible=False)
+#             bpe_display = gr.Markdown("Current BPE: ", visible=False)
+#     # 事件处理
+#     @spaces.GPU
+#     def on_run_clicked(model_type, image, text):
+#         global current_vis, current_bpe, current_index
+#         current_index = 0  # Reset index when new image is processed
+#         image, vis, bpe = process_image(*load_model(model_type), model_type, image, text)
+#         # Update the slider range and set value to 0
+#         slider_max_val = len(current_bpe) - 1
+#         bpe_text = format_bpe_display(bpe)
+#         print("current_vis",len(current_vis))
+#         print("current_bpe",len(current_bpe))
+#         return image, vis, bpe_text, slider_max_val
+#     run_btn.click(
+#         on_run_clicked,
+#         inputs=[model_type, image_input, text_input],
+#         outputs=[orig_img, heatmap, bpe_display, index_slider],
+#     ).then(
+#         lambda max_val: (gr.update(visible=True), gr.update(visible=True, maximum=max_val, value=0), gr.update(visible=True), gr.update(visible=True)),
+#         inputs=index_slider,
+#         outputs=[prev_btn, index_slider, next_btn, bpe_display],
+#     )
+#     prev_btn.click(
+#         lambda: (*update_index(-1), current_index),
+#         outputs=[heatmap, bpe_display, index_slider]
+#     )
+#     next_btn.click(
+#         lambda: (*update_index(1), current_index),
+#         outputs=[heatmap, bpe_display, index_slider]
+#     )
+#     # index_slider.change(
+#     #     lambda x: (current_vis[x], format_bpe_display(current_bpe[x])) if 0<=x<len(current_vis else (None,"Invaild")
+#     #     inputs=index_slider,
+#     #     outputs=[heatmap, bpe_display]
+#     # )
+#     index_slider.change(
+#             update_slider_index,
+#             inputs=index_slider,
+#             outputs=[heatmap, bpe_display]
+#         )
+# if __name__ == "__main__":
+#     demo.launch()
 import os
 import argparse
 import numpy as np
 HF_TOKEN = os.getenv("HF_TOKEN")
 current_vis = []
 current_bpe = []
 def load_model(check_type):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if check_type == 'R50':
         tokenizer = load_tokenizer('tokenizer_path')
         model = build_model(argparse.Namespace()).eval()
     return model.to(device), tokenizer, transform, device
 def process_image(model, tokenizer, transform, device, check_type, image, text):
+    global current_vis, current_bpe
     src_size = image.size
     if 'TokenOCR' in check_type:
         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
             text_embeds = model.tok_embeddings(input_ids)
         vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
         # 计算相似度
         similarity = text_embeds @ vit_embeds.T
         resized_size = size1 if size1 is not None else size2
     # 生成可视化
     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
     current_vis = generate_similiarity_map([image], attn_map,
                                            [tokenizer.decode([i]) for i in input_ids],
                                            [], target_ratio, src_size)
     current_bpe = [tokenizer.decode([i]) for i in input_ids]
     current_bpe[-1] = text
+    return image, current_vis, current_bpe
 def format_bpe_display(bpe):
     """Build the centered HTML snippet that shows the current BPE value.

     Args:
         bpe: Value to display. It is converted with ``str()``, so a single
             token string or a list of tokens both work.

     Returns:
         An HTML ``<div>`` string with the value rendered in red, bold text.
     """
     # Local import keeps this block self-contained.
     import html

     # Escape &, < and > so values such as "<s>" or user-typed markup are
     # displayed literally instead of being parsed as HTML. quote=False leaves
     # quotes untouched: they are harmless inside element content, and the
     # output for ordinary text stays byte-identical to the previous version.
     safe_bpe = html.escape(str(bpe), quote=False)
     return (
         "<div style='text-align:center; font-size:20px;'>"
         f"<strong>Current BPE: <span style='color:red;'>{safe_bpe}</span></strong>"
         "</div>"
     )
 # Gradio界面
 with gr.Blocks(title="BPE Visualization Demo") as demo:
     gr.Markdown("## BPE Visualization Demo - TokenFD基座模型能力可视化")
             model_type = gr.Dropdown(
                 choices=["TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg", "R50", "R50_siglip"],
                 label="Select model type",
+                value="TokenOCR_4096_English_seg"
             )
             image_input = gr.Image(label="Upload images", type="pil")
             text_input = gr.Textbox(label="Input text")
                 orig_img = gr.Image(label="Original picture", interactive=False)
                 heatmap = gr.Image(label="BPE visualization", interactive=False)
             bpe_display = gr.Markdown("Current BPE: ", visible=False)
     # 事件处理
     @spaces.GPU
     def on_run_clicked(model_type, image, text):
+        global current_vis, current_bpe
         image, vis, bpe = process_image(*load_model(model_type), model_type, image, text)
         bpe_text = format_bpe_display(bpe)
+        return image, vis[0], bpe_text
     run_btn.click(
         on_run_clicked,
         inputs=[model_type, image_input, text_input],
+        outputs=[orig_img, heatmap, bpe_display],
     ).then(
+        lambda: (gr.update(visible=True)),
+        outputs=[bpe_display],
     )
 if __name__ == "__main__":
     demo.launch()