Spaces:

TongkunGuan
/

Token-level_Text_Image_Foundation_Model

Running

App Files Files Community

TongkunGuan committed on Mar 12

Commit

464ed7e

verified ·

1 Parent(s): 79d5e07

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -47

app.py CHANGED Viewed

@@ -21,6 +21,10 @@ CHECKPOINTS = {
 # 全局变量
 HF_TOKEN = os.getenv("HF_TOKEN")
 def load_model(check_type):
     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -50,7 +54,8 @@ def load_model(check_type):
     return model.to(device), tokenizer, transform, device
-def process_image(model, tokenizer, transform, device, check_type, image, text, state):
     src_size = image.size
     if 'TokenOCR' in check_type:
         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
@@ -75,6 +80,10 @@ def process_image(model, tokenizer, transform, device, check_type, image, text,
             text_embeds = model.tok_embeddings(input_ids)
         vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
         # 计算相似度
@@ -83,22 +92,47 @@ def process_image(model, tokenizer, transform, device, check_type, image, text,
         similarity = text_embeds @ vit_embeds.T
         resized_size = size1 if size1 is not None else size2
     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
-    vis = generate_similiarity_map([image], attn_map,
-                                   [tokenizer.decode([i]) for i in input_ids],
-                                   [], target_ratio, src_size)
-    bpe = [tokenizer.decode([i]) for i in input_ids]
-    bpe[-1] = text
-    # Store results in state
-    state['current_vis'] = vis
-    state['current_bpe'] = bpe
-    return image, vis[0], bpe[0], len(vis) - 1
 # Gradio界面
-with gr.Blocks() as demo:
     gr.Markdown("## BPE Visualization Demo - TokenFD基座模型能力可视化")
     with gr.Row():
@@ -106,11 +140,13 @@ with gr.Blocks() as demo:
             model_type = gr.Dropdown(
                 choices=["TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg", "R50", "R50_siglip"],
                 label="Select model type",
-                value="TokenOCR_4096_English_seg"
             )
             image_input = gr.Image(label="Upload images", type="pil")
             text_input = gr.Textbox(label="Input text")
             run_btn = gr.Button("RUN")
             gr.Examples(
                 examples=[
                     [os.path.join("examples", "examples0.jpg"), "Veterans and Benefits"],
@@ -123,58 +159,62 @@ with gr.Blocks() as demo:
         with gr.Column(scale=2):
             gr.Markdown("<p style='font-size:20px;'><span style='color:red;'>If the input text is not included in the image</span>, the attention map will show a lot of noise (the actual response value is very low), since we normalize the attention map according to the relative value.</p>")
-            orig_img = gr.Image(label="Original picture", interactive=False)
-            heatmap = gr.Image(label="BPE visualization", interactive=False)
-            prev_btn = gr.Button("⬅ Last", visible=False)
-            index_slider = gr.Slider(0, 1, value=0, step=1, label="BPE index", visible=False)
-            next_btn = gr.Button("⮕ Next", visible=False)
-            bpe_display = gr.Markdown("Current BPE: ", visible=False)
-    state = gr.State()
-    state['current_vis'] = []
-    state['current_bpe'] = []
-    state['current_index'] = 0
     @spaces.GPU
-    def on_run_clicked(model_type, image, text, state):
-        image, vis, bpe, slider_max_val = process_image(*load_model(model_type), model_type, image, text, state)
-        state['current_vis'] = vis
-        state['current_bpe'] = bpe
-        state['current_index'] = 0
         bpe_text = format_bpe_display(bpe)
         return image, vis, bpe_text, slider_max_val
     run_btn.click(
         on_run_clicked,
-        inputs=[model_type, image_input, text_input, state],
-        outputs=[orig_img, heatmap, bpe_display],
-        _js="""
-            (orig_img, heatmap, bpe_display, slider_max_val) => {
-                index_slider.update({ visible: true, maximum: slider_max_val, value: 0 });
-                prev_btn.update({ visible: true });
-                next_btn.update({ visible: true });
-                return [orig_img, heatmap, bpe_display];
-            }
-        """
     )
     prev_btn.click(
-        lambda state: update_index(-1, state),
-        inputs=[state],
         outputs=[heatmap, bpe_display, index_slider]
     )
     next_btn.click(
-        lambda state: update_index(1, state),
-        inputs=[state],
         outputs=[heatmap, bpe_display, index_slider]
     )
     index_slider.change(
-        lambda x, state: update_slider_index(x, state),
-        inputs=[index_slider, state],
-        outputs=[heatmap, bpe_display]
-    )
 if __name__ == "__main__":
     demo.launch()

 # 全局变量
 HF_TOKEN = os.getenv("HF_TOKEN")
+current_vis = []
+current_bpe = []
+current_index = 0
 def load_model(check_type):
     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     return model.to(device), tokenizer, transform, device
+def process_image(model, tokenizer, transform, device, check_type, image, text):
+    global current_vis, current_bpe, current_index
     src_size = image.size
     if 'TokenOCR' in check_type:
         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
             text_embeds = model.tok_embeddings(input_ids)
         vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
+        print("vit_embeds",vit_embeds)
+        print("vit_embeds,shape",vit_embeds.shape)
+        print("target_ratio",target_ratio)
+        print("check_type",check_type)
         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
         # 计算相似度
         similarity = text_embeds @ vit_embeds.T
         resized_size = size1 if size1 is not None else size2
+    # print(f"text_embeds shape: {text_embeds.shape}, numel: {text_embeds.numel()}") # text_embeds shape: torch.Size([4, 2048]), numel: 8192
+    # print(f"vit_embeds shape: {vit_embeds.shape}, numel: {vit_embeds.numel()}") # vit_embeds shape: torch.Size([9728, 2048]), numel: 19922944
+    # print(f"similarity shape: {similarity.shape}, numel: {similarity.numel()}")# similarity shape: torch.Size([4, 9728]), numel: 38912
+    # 生成可视化
     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
+    # attn_map = similarity.reshape(len(text_embeds), *target_ratio)
     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
+    current_vis = generate_similiarity_map([image], attn_map,
+                                           [tokenizer.decode([i]) for i in input_ids],
+                                           [], target_ratio, src_size)
+    current_bpe = [tokenizer.decode([i]) for i in input_ids]
+    # current_bpe[-1] = 'Input text'
+    current_bpe[-1] = text
+    return image, current_vis[0], current_bpe[0]
+# 事件处理函数
def update_index(change):
    """Step the selected BPE index by ``change`` and return the matching view.

    Reads the module-level ``current_vis`` / ``current_bpe`` lists and updates
    the module-level ``current_index``.

    Args:
        change: Signed offset added to ``current_index`` before clamping.

    Returns:
        A ``(visualization, html)`` pair for the new index, where ``html`` is
        produced by ``format_bpe_display``. When there is nothing to display,
        returns ``(None, "索引超出范围")``.
    """
    global current_index

    # Only positions that exist in BOTH lists can be shown.
    last_index = min(len(current_vis), len(current_bpe)) - 1
    if last_index < 0:
        # Nothing has been generated yet; avoid indexing into an empty list.
        current_index = 0
        return None, "索引超出范围"

    current_index = max(0, min(last_index, current_index + change))
    return current_vis[current_index], format_bpe_display(current_bpe[current_index])
def format_bpe_display(bpe):
    """Build the HTML snippet that shows the currently selected BPE token.

    The snippet is centered, 20px, bold, with the token text in red.

    Args:
        bpe: Token text to display. It is converted with ``str`` and
            HTML-escaped before being inserted into the markup.

    Returns:
        An HTML ``<div>`` string containing ``Current BPE: <token>``.
    """
    import html

    # Tokens and user-entered text may contain markup characters
    # (e.g. "<s>", "&"); escape them so they are displayed literally
    # instead of being parsed as HTML. Quotes are left untouched because
    # the value is placed in element content, not in an attribute.
    safe_bpe = html.escape(str(bpe), quote=False)
    return (
        "<div style='text-align:center; font-size:20px;'>"
        f"<strong>Current BPE: <span style='color:red;'>{safe_bpe}</span></strong>"
        "</div>"
    )
def update_slider_index(x):
    """Return the visualization and BPE label for slider position ``x``.

    Reads the module-level ``current_vis`` / ``current_bpe`` lists and, on a
    valid position, stores it in the module-level ``current_index`` so that
    later relative steps start from the position shown on the slider.

    Args:
        x: Position selected on the slider. Coerced to ``int``; a value that
            cannot be coerced is treated as out of range.

    Returns:
        ``(current_vis[x], html)`` when ``x`` is a valid index into both
        lists, otherwise ``(None, "索引超出范围")``.
    """
    global current_vis, current_bpe, current_index
    print(f"x: {x}, current_vis length: {len(current_vis)}, current_bpe length: {len(current_bpe)}")

    try:
        # The slider may deliver a float (or None); list indexing needs an int.
        position = int(x)
    except (TypeError, ValueError):
        return None, "索引超出范围"

    if 0 <= position < len(current_vis) and position < len(current_bpe):
        # Keep the shared index in sync with the slider.
        current_index = position
        return current_vis[position], format_bpe_display(current_bpe[position])
    return None, "索引超出范围"
 # Gradio界面
+with gr.Blocks(title="BPE Visualization Demo") as demo:
     gr.Markdown("## BPE Visualization Demo - TokenFD基座模型能力可视化")
     with gr.Row():
             model_type = gr.Dropdown(
                 choices=["TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg", "R50", "R50_siglip"],
                 label="Select model type",
+                value="TokenOCR_4096_English_seg"  # 设置默认值为第一个选项
             )
             image_input = gr.Image(label="Upload images", type="pil")
             text_input = gr.Textbox(label="Input text")
             run_btn = gr.Button("RUN")
             gr.Examples(
                 examples=[
                     [os.path.join("examples", "examples0.jpg"), "Veterans and Benefits"],
         with gr.Column(scale=2):
             gr.Markdown("<p style='font-size:20px;'><span style='color:red;'>If the input text is not included in the image</span>, the attention map will show a lot of noise (the actual response value is very low), since we normalize the attention map according to the relative value.</p>")
+            with gr.Row():
+                orig_img = gr.Image(label="Original picture", interactive=False)
+                heatmap = gr.Image(label="BPE visualization", interactive=False)
+            with gr.Row() as controls:
+                prev_btn = gr.Button("⬅ Last", visible=False)
+                index_slider = gr.Slider(0, 1, value=0, step=1, label="BPE index", visible=False)
+                next_btn = gr.Button("⮕ Next", visible=False)
+            bpe_display = gr.Markdown("Current BPE: ", visible=False)
+    # 事件处理
     @spaces.GPU
     def on_run_clicked(model_type, image, text):
         """Run the selected model on the uploaded image and query text.

         Resets the module-level ``current_index`` to 0, then calls
         ``process_image`` with the objects returned by ``load_model``.

         Args:
             model_type: Model name chosen in the dropdown.
             image: Uploaded image; must not be ``None``.
             text: Query text entered by the user.

         Returns:
             ``(image, vis, bpe_html, slider_max_val)`` where ``bpe_html`` is
             ``format_bpe_display`` applied to the first token and
             ``slider_max_val`` is ``len(current_bpe) - 1``.

         Raises:
             gr.Error: If no image was provided.
         """
         global current_vis, current_bpe, current_index

         if image is None:
             # Fail with a user-visible message instead of an AttributeError
             # when the image is dereferenced during processing.
             raise gr.Error("Please upload an image before clicking RUN.")

         current_index = 0  # Reset index when new image is processed
         image, vis, bpe = process_image(*load_model(model_type), model_type, image, text)

         # Highest valid index, used for the slider range.
         slider_max_val = len(current_bpe) - 1
         bpe_text = format_bpe_display(bpe)

         # Log sizes only; dumping the full lists floods the logs on every click.
         print("len_current_vis", len(current_vis))
         print("len_current_bpe", len(current_bpe))
         return image, vis, bpe_text, slider_max_val
     run_btn.click(
         on_run_clicked,
+        inputs=[model_type, image_input, text_input],
+        outputs=[orig_img, heatmap, bpe_display, index_slider],
+    ).then(
+        lambda max_val: (gr.update(visible=True), gr.update(visible=True, maximum=max_val, value=0), gr.update(visible=True), gr.update(visible=True)),
+        inputs=index_slider,
+        outputs=[prev_btn, index_slider, next_btn, bpe_display],
     )
     prev_btn.click(
+        lambda: (*update_index(-1), current_index),
         outputs=[heatmap, bpe_display, index_slider]
     )
     next_btn.click(
+        lambda: (*update_index(1), current_index),
         outputs=[heatmap, bpe_display, index_slider]
     )
     index_slider.change(
+            update_slider_index,
+            inputs=index_slider,
+            outputs=[heatmap, bpe_display]
+        )
 if __name__ == "__main__":
     demo.launch()