Spaces:

TongkunGuan
/

Token-level_Text_Image_Foundation_Model

Running on Zero

App Files Files Community

TongkunGuan committed on Mar 11

Commit

3d2b840

verified ·

1 Parent(s): 9c191d1

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -80

app.py CHANGED Viewed

@@ -25,11 +25,6 @@ current_vis = []
 current_bpe = []
 current_index = 0
-# 设置初始状态
-initial_state = {
-    "vis": [],
-    "bpe": []
-}
 def load_model(check_type):
     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -59,9 +54,70 @@ def load_model(check_type):
     return model.to(device), tokenizer, transform, device
 def process_image(model, tokenizer, transform, device, check_type, image, text):
     global current_vis, current_bpe, current_index
     src_size = image.size
     if 'TokenOCR' in check_type:
         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
                                                   image_size=model.config.force_image_size,
@@ -72,50 +128,33 @@ def process_image(model, tokenizer, transform, device, check_type, image, text):
         pixel_values = torch.stack([transform(image)]).to(device)
         target_ratio = (1, 1)
-    # 文本处理
     text += ' '
     input_ids = tokenizer(text)['input_ids'][1:]
     input_ids = torch.tensor(input_ids, device=device)
-    # 获取嵌入
     with torch.no_grad():
         if 'R50' in check_type:
             text_embeds = model.language_embedding(input_ids)
         else:
             text_embeds = model.tok_embeddings(input_ids)
-        vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
-        print("vit_embeds",vit_embeds)
-        print("vit_embeds,shape",vit_embeds.shape)
-        print("target_ratio",target_ratio)
-        print("check_type",check_type)
         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
-        # 计算相似度
         text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
         vit_embeds = vit_embeds / vit_embeds.norm(dim=-1, keepdim=True)
         similarity = text_embeds @ vit_embeds.T
         resized_size = size1 if size1 is not None else size2
-    # print(f"text_embeds shape: {text_embeds.shape}, numel: {text_embeds.numel()}") # text_embeds shape: torch.Size([4, 2048]), numel: 8192
-    # print(f"vit_embeds shape: {vit_embeds.shape}, numel: {vit_embeds.numel()}") # vit_embeds shape: torch.Size([9728, 2048]), numel: 19922944
-    # print(f"similarity shape: {similarity.shape}, numel: {similarity.numel()}")# similarity shape: torch.Size([4, 9728]), numel: 38912
-    # 生成可视化
     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
-    # attn_map = similarity.reshape(len(text_embeds), *target_ratio)
     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
-    current_vis = generate_similiarity_map([image], attn_map,
                                            [tokenizer.decode([i]) for i in input_ids],
                                            [], target_ratio, src_size)
     current_bpe = [tokenizer.decode([i]) for i in input_ids]
-    # current_bpe[-1] = 'Input text'
     current_bpe[-1] = text
-    print("current_vis",len(current_vis))
-    print("current_bpe",len(current_bpe))
-    return image, current_vis[0], current_bpe[0]
 # 事件处理函数
 def update_index(change):
@@ -127,24 +166,13 @@ def format_bpe_display(bpe):
     # 使用HTML标签来设置字体大小、颜色，加粗，并居中
     return f"<div style='text-align:center; font-size:20px;'><strong>Current BPE: <span style='color:red;'>{bpe}</span></strong></div>"
-# def update_slider_index(x):
-#     global current_vis, current_bpe, current_index
-#     print(f"x: {x}, current_vis length: {len(current_vis)}, current_bpe length: {len(current_bpe)}")
-#     if 0 <= x < len(current_vis) and 0 <= x < len(current_bpe):
-#         return current_vis[x], format_bpe_display(current_bpe[x])
-#     else:
-#         return None, "索引超出范围"
-# 状态更新函数，利用传递的状态（vis, bpe）
-# 使用状态信息来处理滑动条改变
-def update_slider_index(x, state):
-    vis = state['vis']
-    bpe = state['bpe']
-    if 0 <= x < len(vis):
-        return vis[x], format_bpe_display(bpe[x]), state
     else:
-        return None, "索引超出范围", state
@@ -202,37 +230,24 @@ with gr.Blocks(title="BPE Visualization Demo") as demo:
     #     return image, vis, bpe_text, slider_max_val
     @spaces.GPU
-    def on_run_clicked(model_type, image, text, state):
         model, tokenizer, transform, device = load_model(model_type)
-        current_index = 0  # Reset index when new image is processed
         image, vis, bpe = process_image(model, tokenizer, transform, device, model_type, image, text)
-        slider_max_val = len(bpe) - 1
-        bpe_text = format_bpe_display(bpe[current_index])
-        # 更新状态并返回
-        state['vis'] = vis
-        state['bpe'] = bpe
-        return image, vis[current_index], bpe_text, slider_max_val, state
-    # run_btn.click(
-    #     on_run_clicked,
-    #     inputs=[model_type, image_input, text_input],
-    #     outputs=[orig_img, heatmap, bpe_display, index_slider],
-    # ).then(
-    #     lambda max_val: (gr.update(visible=True), gr.update(visible=True, maximum=max_val, value=0), gr.update(visible=True), gr.update(visible=True)),
-    #     inputs=index_slider,
-    #     outputs=[prev_btn, index_slider, next_btn, bpe_display],
-    # )
-    # Gradio 按钮点击后的处理
-# Gradio 按钮点击后的处理
     run_btn.click(
         on_run_clicked,
-        inputs=[model_type, image_input, text_input, 'state'],
-        outputs=[orig_img, heatmap, bpe_display, index_slider, 'state'],
-        _js="{state: { vis: [], bpe: []}}"
     )
     prev_btn.click(
@@ -246,16 +261,12 @@ with gr.Blocks(title="BPE Visualization Demo") as demo:
     )
-    # index_slider.change(
-    #         update_slider_index,
-    #         inputs=index_slider,
-    #         outputs=[heatmap, bpe_display]
-    #     )
     index_slider.change(
-        update_slider_index,
-        inputs=[index_slider, 'state'],
-        outputs=[heatmap, bpe_display, 'state']
-    )
 if __name__ == "__main__":

 current_bpe = []
 current_index = 0
 def load_model(check_type):
     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     return model.to(device), tokenizer, transform, device
+# def process_image(model, tokenizer, transform, device, check_type, image, text):
+#     global current_vis, current_bpe, current_index
+#     src_size = image.size
+#     if 'TokenOCR' in check_type:
+#         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
+#                                                   image_size=model.config.force_image_size,
+#                                                   use_thumbnail=model.config.use_thumbnail,
+#                                                   return_ratio=True)
+#         pixel_values = torch.stack([transform(img) for img in images]).to(device)
+#     else:
+#         pixel_values = torch.stack([transform(image)]).to(device)
+#         target_ratio = (1, 1)
+    # # 文本处理
+    # text += ' '
+    # input_ids = tokenizer(text)['input_ids'][1:]
+    # input_ids = torch.tensor(input_ids, device=device)
+    # # 获取嵌入
+    # with torch.no_grad():
+    #     if 'R50' in check_type:
+    #         text_embeds = model.language_embedding(input_ids)
+    #     else:
+    #         text_embeds = model.tok_embeddings(input_ids)
+    #     vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
+    #     print("vit_embeds",vit_embeds)
+    #     print("vit_embeds,shape",vit_embeds.shape)
+    #     print("target_ratio",target_ratio)
+    #     print("check_type",check_type)
+    #     vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
+    #     # 计算相似度
+    #     text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
+    #     vit_embeds = vit_embeds / vit_embeds.norm(dim=-1, keepdim=True)
+    #     similarity = text_embeds @ vit_embeds.T
+    #     resized_size = size1 if size1 is not None else size2
+    # # print(f"text_embeds shape: {text_embeds.shape}, numel: {text_embeds.numel()}") # text_embeds shape: torch.Size([4, 2048]), numel: 8192
+    # # print(f"vit_embeds shape: {vit_embeds.shape}, numel: {vit_embeds.numel()}") # vit_embeds shape: torch.Size([9728, 2048]), numel: 19922944
+    # # print(f"similarity shape: {similarity.shape}, numel: {similarity.numel()}")# similarity shape: torch.Size([4, 9728]), numel: 38912
+    # # 生成可视化
+    # attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
+    # # attn_map = similarity.reshape(len(text_embeds), *target_ratio)
+    # all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
+    # current_vis = generate_similiarity_map([image], attn_map,
+    #                                        [tokenizer.decode([i]) for i in input_ids],
+    #                                        [], target_ratio, src_size)
+    # current_bpe = [tokenizer.decode([i]) for i in input_ids]
+    # # current_bpe[-1] = 'Input text'
+    # current_bpe[-1] = text
+    # print("current_vis",len(current_vis))
+    # print("current_bpe",len(current_bpe))
+    # return image, current_vis[0], current_bpe[0]
 def process_image(model, tokenizer, transform, device, check_type, image, text):
     global current_vis, current_bpe, current_index
     src_size = image.size
+    # Ensure all processing is done on the correct device
+    image = image.to(device)
     if 'TokenOCR' in check_type:
         images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
                                                   image_size=model.config.force_image_size,
         pixel_values = torch.stack([transform(image)]).to(device)
         target_ratio = (1, 1)
     text += ' '
     input_ids = tokenizer(text)['input_ids'][1:]
     input_ids = torch.tensor(input_ids, device=device)
     with torch.no_grad():
         if 'R50' in check_type:
             text_embeds = model.language_embedding(input_ids)
         else:
             text_embeds = model.tok_embeddings(input_ids)
+        vit_embeds, size1 = model.forward_tokenocr(pixel_values)
         vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)
         text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
         vit_embeds = vit_embeds / vit_embeds.norm(dim=-1, keepdim=True)
         similarity = text_embeds @ vit_embeds.T
         resized_size = size1 if size1 is not None else size2
     attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
     all_bpe_strings = [tokenizer.decode(input_id) for input_id in input_ids]
+    current_vis = generate_similiarity_map([image.cpu()], attn_map.cpu(),
                                            [tokenizer.decode([i]) for i in input_ids],
                                            [], target_ratio, src_size)
     current_bpe = [tokenizer.decode([i]) for i in input_ids]
     current_bpe[-1] = text
+    return image.cpu(), current_vis[0], current_bpe[0]
 # 事件处理函数
 def update_index(change):
     # 使用HTML标签来设置字体大小、颜色，加粗，并居中
     return f"<div style='text-align:center; font-size:20px;'><strong>Current BPE: <span style='color:red;'>{bpe}</span></strong></div>"
+def update_slider_index(x):
+    global current_vis, current_bpe, current_index
+    print(f"x: {x}, current_vis length: {len(current_vis)}, current_bpe length: {len(current_bpe)}")
+    if 0 <= x < len(current_vis) and 0 <= x < len(current_bpe):
+        return current_vis[x], format_bpe_display(current_bpe[x])
     else:
+        return None, "索引超出范围"
     #     return image, vis, bpe_text, slider_max_val
     @spaces.GPU
+    def on_run_clicked(model_type, image, text):
+        global current_vis, current_bpe, current_index
+        current_index = 0
         model, tokenizer, transform, device = load_model(model_type)
         image, vis, bpe = process_image(model, tokenizer, transform, device, model_type, image, text)
+        slider_max_val = len(current_bpe) - 1
+        bpe_text = format_bpe_display(bpe)
+        return image, vis, bpe_text, slider_max_val
     run_btn.click(
         on_run_clicked,
+        inputs=[model_type, image_input, text_input],
+        outputs=[orig_img, heatmap, bpe_display, index_slider],
+    ).then(
+        lambda max_val: (gr.update(visible=True), gr.update(visible=True, maximum=max_val, value=0), gr.update(visible=True), gr.update(visible=True)),
+        inputs=index_slider,
+        outputs=[prev_btn, index_slider, next_btn, bpe_display],
     )
     prev_btn.click(
     )
     index_slider.change(
+            update_slider_index,
+            inputs=index_slider,
+            outputs=[heatmap, bpe_display]
+        )
 if __name__ == "__main__":