Spaces:

TongkunGuan
/

Token-level_Text_Image_Foundation_Model

Running

File size: 8,495 Bytes

841bef5
 
 
 
 
 
 
 
 
 
 
 
 
0afd727
841bef5
 
 
0afd727
 
841bef5
 
 
312b679
464ed7e
 
 
 
0afd727
841bef5
333ea05
 
841bef5
 
 
 
 
 
 
 
 
 
 
 
0afd727
841bef5
312b679
841bef5
 
 
 
 
 
 
 
 
 
464ed7e
 
841bef5
 
 
 
 
 
b70aad2
841bef5
b70aad2
841bef5
 
b70aad2
841bef5
b70aad2
 
 
 
841bef5
 
 
 
 
b70aad2
 
464ed7e
 
 
 
841bef5
b70aad2
 
841bef5
 
 
 
 
464ed7e
 
 
 
 
 
841bef5
464ed7e
b70aad2
464ed7e
 
 
b70aad2
464ed7e
 
 
 
 
 
 
 
 
b2a190e
 
464ed7e
b2a190e
 
 
 
 
 
 
464ed7e
 
 
 
 
b2a190e
464ed7e
a1cdc55
333ea05
841bef5
464ed7e
758eccd
841bef5
 
 
 
f39b1b0
841bef5
464ed7e
841bef5
 
 
464ed7e
841bef5
464ed7e
841bef5
 
 
 
 
 
 
 
 
 
 
 
 
464ed7e
 
 
 
 
 
 
 
 
a1cdc55
464ed7e
0afd727
464ed7e
 
b2a190e
464ed7e
b2a190e
 
 
 
 
464ed7e
 
 
 
b2a190e
464ed7e
b2a190e
 
464ed7e
aa84990
b2a190e
 
aa84990
464ed7e
aa84990
b2a190e
 
841bef5
a1cdc55
464ed7e
 
 
3d2b840
841bef5
79d5e07

import os
import argparse
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as T
from transformers import AutoTokenizer
import gradio as gr
from resnet50 import build_model
from utils import generate_similiarity_map, post_process, load_tokenizer, build_transform_R50
from utils import IMAGENET_MEAN, IMAGENET_STD
from internvl.train.dataset import dynamic_preprocess
from internvl.model.internvl_chat import InternVLChatModel
import spaces

# Model configuration: maps each selectable TokenFD model name to the
# identifier that load_model() hands to ``from_pretrained``.
CHECKPOINTS = {
    name: f"TongkunGuan/{name}"
    for name in ("TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg")
}

# Module-level state shared by the processing and navigation callbacks.
HF_TOKEN = os.environ.get("HF_TOKEN")  # access token read from the environment; None when unset
current_vis = []   # visualizations produced by the most recent run
current_bpe = []   # labels shown alongside each entry of current_vis
current_index = 0  # position of the entry currently displayed


def load_model(check_type):
    """Build the model, tokenizer and image transform for ``check_type``.

    Args:
        check_type: Model name selected in the UI. ``'R50'`` and
            ``'R50_siglip'`` select the ``build_model`` (ResNet-50) path; any
            name containing ``'TokenFD'`` selects the ``InternVLChatModel``
            path. In both cases the checkpoint location is looked up in
            ``CHECKPOINTS``.

    Returns:
        Tuple ``(model, tokenizer, transform, device)``; the model is in eval
        mode and has already been moved to ``device``.

    Raises:
        ValueError: If ``check_type`` belongs to none of the supported
            families (previously this surfaced as ``UnboundLocalError``).
        KeyError: If ``check_type`` has no entry in ``CHECKPOINTS``.
    """
    is_r50 = check_type in ('R50', 'R50_siglip')
    is_tokenfd = isinstance(check_type, str) and 'TokenFD' in check_type
    if not (is_r50 or is_tokenfd):
        raise ValueError(
            f"Unsupported model type {check_type!r}: expected 'R50', "
            "'R50_siglip' or a name containing 'TokenFD'."
        )
    if check_type not in CHECKPOINTS:
        # Same exception type as the plain dict lookup, with an actionable message.
        raise KeyError(f"No checkpoint is registered in CHECKPOINTS for model type {check_type!r}")

    # CUDA is requested unconditionally; there is no CPU fallback.
    device = torch.device("cuda")

    if is_r50:
        # Both R50 variants share one loading recipe and differ only in the
        # checkpoint they read.
        tokenizer = load_tokenizer('tokenizer_path')
        model = build_model(argparse.Namespace()).eval()
        model.load_state_dict(torch.load(CHECKPOINTS[check_type], map_location='cpu')['model'])
        transform = build_transform_R50(normalize_type='imagenet')
    else:
        model_path = CHECKPOINTS[check_type]
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, use_auth_token=HF_TOKEN)
        model = InternVLChatModel.from_pretrained(model_path, torch_dtype=torch.bfloat16).eval()
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(IMAGENET_MEAN, IMAGENET_STD)
        ])

    return model.to(device), tokenizer, transform, device

def process_image(model, tokenizer, transform, device, check_type, image, text):
    """Compute per-token similarity maps between ``text`` and ``image``.

    The text (with one trailing space appended) is tokenized, each token is
    embedded, and its normalized embedding is compared against the normalized
    image embeddings. The resulting maps are rendered by
    ``generate_similiarity_map`` and stored in the module-level
    ``current_vis`` / ``current_bpe`` lists for later navigation.

    Args:
        model: Model returned by ``load_model``.
        tokenizer: Tokenizer returned by ``load_model``.
        transform: Image transform returned by ``load_model``.
        device: Device the tensors are moved to.
        check_type: Model name; selects preprocessing and embedding lookup.
        image: Input image exposing ``.size`` (the UI supplies a PIL image).
        text: Query text.

    Returns:
        Tuple ``(image, current_vis[0], current_bpe[0])``.

    Raises:
        ValueError: If ``image`` is ``None`` (e.g. nothing was uploaded).
    """
    global current_vis, current_bpe
    if image is None:
        raise ValueError("An input image is required.")

    src_size = image.size
    # NOTE(review): this branch tests for 'TokenOCR', while the names in
    # CHECKPOINTS contain 'TokenFD', so those names take the single-image
    # path below. Confirm whether that is intended before changing it.
    if 'TokenOCR' in check_type:
        images, target_ratio = dynamic_preprocess(image, min_num=1, max_num=12,
                                                  image_size=model.config.force_image_size,
                                                  use_thumbnail=model.config.use_thumbnail,
                                                  return_ratio=True)
        pixel_values = torch.stack([transform(img) for img in images]).to(device)
    else:
        pixel_values = torch.stack([transform(image)]).to(device)
        target_ratio = (1, 1)

    # Text processing: append a trailing space and drop the first token id
    # (presumably a BOS/special token — verify against the tokenizer).
    text += ' '
    input_ids = tokenizer(text)['input_ids'][1:]
    input_ids = torch.tensor(input_ids, device=device)

    # Embedding lookup and image encoding.
    with torch.no_grad():
        if 'R50' in check_type:
            text_embeds = model.language_embedding(input_ids)
        else:
            text_embeds = model.tok_embeddings(input_ids)

        vit_embeds, size1 = model.forward_tokenocr(pixel_values.to(torch.bfloat16).to(device))
        # Log only the shape; dumping the full tensor floods the log.
        print("vit_embeds,shape",vit_embeds.shape)
        print("target_ratio",target_ratio)
        print("check_type",check_type)
        vit_embeds, size2 = post_process(vit_embeds, target_ratio, check_type)

        # Cosine similarity: L2-normalize both sides, then take dot products.
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
        vit_embeds = vit_embeds / vit_embeds.norm(dim=-1, keepdim=True)
        similarity = text_embeds @ vit_embeds.T
        # Prefer the size reported by the encoder; otherwise use post_process's.
        resized_size = size1 if size1 is not None else size2

    # Shapes recorded from an earlier debug run: text_embeds [4, 2048],
    # vit_embeds [9728, 2048], similarity [4, 9728].

    # Build the visualizations: one map per token.
    attn_map = similarity.reshape(len(text_embeds), resized_size[0], resized_size[1])
    bpe_strings = [tokenizer.decode([i]) for i in input_ids]
    # Hand the renderer its own copy so the labels kept below stay independent.
    current_vis = generate_similiarity_map([image], attn_map,
                                           list(bpe_strings),
                                           [], target_ratio, src_size)

    current_bpe = bpe_strings
    # The last entry is labelled with the whole input text instead of its token.
    current_bpe[-1] = text

    return image, current_vis[0], current_bpe[0]

# Event handler helpers
def update_index(change):
    """Move the visualization cursor by ``change`` and return the new view.

    Args:
        change: Signed step applied to ``current_index`` (``-1`` for the
            previous entry, ``1`` for the next one).

    Returns:
        Tuple ``(visualization, bpe_html, prev_visible, next_visible)``, where
        the two booleans say whether an earlier / later entry exists. When no
        visualizations are stored the tuple is
        ``(None, format_bpe_display(''), False, False)`` instead of raising
        ``IndexError``.
    """
    global current_index

    if not current_vis:
        # Nothing to navigate: keep the cursor at the start and report that
        # neither direction is available.
        current_index = 0
        return None, format_bpe_display(''), False, False

    # Clamp the cursor to the valid range.
    last = len(current_vis) - 1
    current_index = max(0, min(last, current_index + change))

    # A direction is available only if there is an entry on that side.
    prev_visible = current_index > 0
    next_visible = current_index < last

    return current_vis[current_index], format_bpe_display(current_bpe[current_index]), prev_visible, next_visible


def format_bpe_display(bpe):
    """Return the HTML snippet that shows ``bpe`` as the current BPE label.

    The label is centered, bold, 20px, with the BPE text in red. ``bpe`` is
    HTML-escaped first because it can contain user-typed text; characters such
    as ``<`` or ``&`` must be displayed literally rather than parsed as markup.

    Args:
        bpe: Label text; converted with ``str()`` before escaping.

    Returns:
        An HTML string.
    """
    from html import escape

    # quote=False: the value is element content, so quotes need no escaping.
    safe_bpe = escape(str(bpe), quote=False)
    return f"<div style='text-align:center; font-size:20px;'><strong>Current BPE: <span style='color:red;'>{safe_bpe}</span></strong></div>"





# Gradio UI
with gr.Blocks(title="BPE Visualization Demo") as demo:
    gr.Markdown("## BPE Visualization Demo - TokenFD基座模型能力可视化")

    with gr.Row():
        # Gradio expects integer ``scale`` values; 1:4 keeps the 0.5:2 width ratio.
        with gr.Column(scale=1):
            model_type = gr.Dropdown(
                choices=["TokenFD_4096_English_seg", "TokenFD_2048_Bilingual_seg", "R50", "R50_siglip"],
                label="Select model type",
                value="TokenFD_4096_English_seg"  # default to the first choice
            )
            image_input = gr.Image(label="Upload images", type="pil")
            text_input = gr.Textbox(label="Input text")

            run_btn = gr.Button("RUN")

            gr.Examples(
                examples=[
                    [os.path.join("examples", "examples0.jpg"), "Veterans and Benefits"],
                    [os.path.join("examples", "examples1.jpg"), "Refreshers"],
                    [os.path.join("examples", "examples2.png"), "Vision Transformer"]
                ],
                inputs=[image_input, text_input],
                label="Sample input"
            )

        with gr.Column(scale=4):
            gr.Markdown("<p style='font-size:20px;'><span style='color:red;'>If the input text is not included in the image</span>, the attention map will show a lot of noise (the actual response value is very low), since we normalize the attention map according to the relative value.</p>")

            with gr.Row():
                orig_img = gr.Image(label="Original picture", interactive=False)
                heatmap = gr.Image(label="BPE visualization", interactive=False)

            with gr.Row() as controls:
                prev_btn = gr.Button("⬅ Last", visible=False)
                next_btn = gr.Button("⮕ Next", visible=False)

            bpe_display = gr.Markdown("Current BPE: ", visible=False)

    # Event handlers
    # NOTE(review): navigation reads module-level globals that are written
    # while on_run_clicked runs under @spaces.GPU. Confirm those writes are
    # visible to the prev/next handlers in the deployment environment, and
    # consider gr.State for per-session isolation.
    @spaces.GPU
    def on_run_clicked(model_type, image, text):
        """Run the selected model on the inputs and show the first BPE map."""
        global current_vis, current_bpe, current_index
        current_index = 0  # a new run restarts navigation from the first entry
        image, vis, bpe = process_image(*load_model(model_type), model_type, image, text)

        # Navigation is only useful when there is more than one visualization.
        has_multiple = len(current_vis) > 1
        print("len_current_vis",len(current_vis))
        print("len_current_bpe",len(current_bpe))
        # Visibility can only be changed through gr.update; returning a bare
        # bool to a Button would set its value instead.
        return (
            image,
            vis,
            gr.update(value=format_bpe_display(bpe), visible=True),
            gr.update(visible=has_multiple),
            gr.update(visible=has_multiple),
        )

    def _navigate(change):
        """Step through the stored maps and convert the result into UI updates."""
        vis, bpe_html, prev_visible, next_visible = update_index(change)
        return (
            vis,
            gr.update(value=bpe_html, visible=True),
            gr.update(visible=prev_visible),
            gr.update(visible=next_visible),
        )

    # Without this wiring the RUN button has no effect.
    run_btn.click(
        on_run_clicked,
        inputs=[model_type, image_input, text_input],
        outputs=[orig_img, heatmap, bpe_display, prev_btn, next_btn]
    )

    prev_btn.click(
        lambda: _navigate(-1),
        outputs=[heatmap, bpe_display, prev_btn, next_btn]
    )

    next_btn.click(
        lambda: _navigate(1),
        outputs=[heatmap, bpe_display, prev_btn, next_btn]
    )





if __name__ == "__main__":
    demo.launch()