Commit cf37148 · committed by sneha
1 Parent(s): 9ee651f

first commit
- README.md +6 -6
- app.py +105 -0
- attn_helper.py +113 -0
- imgs/adroit1.jpg +0 -0
- imgs/cheetah.jpg +0 -0
- imgs/ego4d.jpg +0 -0
- imgs/ego4d_2.jpg +0 -0
- imgs/ego4d_3.jpg +0 -0
- imgs/kitchen.jpg +0 -0
- imgs/reacher.jpg +0 -0
- imgs/rearrange.jpg +0 -0
- imgs/trifinger1.jpg +0 -0
- imgs/walker.jpg +0 -0
- requirements.txt +10 -0
README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Visual Cortex Demo
+emoji: 🏢
+colorFrom: red
+colorTo: purple
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.23.0
 app_file: app.py
 pinned: false
-license: cc-by-4.0
+license: cc-by-nc-4.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,105 @@
import numpy as np
import gradio as gr
from huggingface_hub import hf_hub_download
import omegaconf
from hydra import utils
import os
import torch
import matplotlib.pyplot as plt
from attn_helper import VITAttentionGradRollout, overlay_attn
import vc_models
import torchvision


HF_TOKEN = os.environ['HF_ACC_TOKEN']
eai_filepath = vc_models.__file__.split('src')[0]
MODEL_DIR = os.path.join(os.path.dirname(eai_filepath), 'model_ckpts')
if not os.path.isdir(MODEL_DIR):
    os.mkdir(MODEL_DIR)

FILENAME = "config.yaml"
BASE_MODEL_TUPLE = None
LARGE_MODEL_TUPLE = None


def get_model(model_name):
    """Load (and cache) the requested VC-1 model tuple from the Hugging Face Hub."""
    global BASE_MODEL_TUPLE, LARGE_MODEL_TUPLE
    download_bin(model_name)
    model = None
    if BASE_MODEL_TUPLE is None and model_name == 'vc1-base':
        repo_name = "facebook/" + model_name
        model_cfg = omegaconf.OmegaConf.load(
            hf_hub_download(repo_id=repo_name, filename=FILENAME, token=HF_TOKEN)
        )
        BASE_MODEL_TUPLE = utils.instantiate(model_cfg)
        BASE_MODEL_TUPLE[0].eval()
        model = BASE_MODEL_TUPLE
    elif LARGE_MODEL_TUPLE is None and model_name == 'vc1-large':
        repo_name = "facebook/" + model_name
        model_cfg = omegaconf.OmegaConf.load(
            hf_hub_download(repo_id=repo_name, filename=FILENAME, token=HF_TOKEN)
        )
        LARGE_MODEL_TUPLE = utils.instantiate(model_cfg)
        LARGE_MODEL_TUPLE[0].eval()
        model = LARGE_MODEL_TUPLE
    elif model_name == 'vc1-base':
        model = BASE_MODEL_TUPLE
    elif model_name == 'vc1-large':
        model = LARGE_MODEL_TUPLE

    return model


def download_bin(model):
    """Download the model checkpoint into MODEL_DIR if it is not already there."""
    if model == "vc1-large":
        bin_file = 'vc1_vitl.pth'
    elif model == "vc1-base":
        bin_file = 'vc1_vitb.pth'
    else:
        raise NameError("model not found: " + model)

    bin_path = os.path.join(MODEL_DIR, bin_file)
    if not os.path.isfile(bin_path):
        # The original code referenced an undefined REPO_ID here; the
        # "facebook/" + model naming used in get_model() is assumed.
        model_bin = hf_hub_download(repo_id="facebook/" + model, filename='pytorch_model.bin',
                                    local_dir=MODEL_DIR, local_dir_use_symlinks=True, token=HF_TOKEN)
        os.rename(model_bin, bin_path)


def run_attn(input_img, model="vc1-large", fusion="min"):
    """Compute the last-layer attention overlay and the embedding plot for one image."""
    download_bin(model)
    model, embedding_dim, transform, metadata = get_model(model)
    if input_img.shape[0] != 3:
        # HWC -> CHW
        input_img = input_img.transpose(2, 0, 1)
    if len(input_img.shape) == 3:
        # Add a batch dimension
        input_img = torch.tensor(input_img).unsqueeze(0)
    input_img = input_img.float()
    resize_transform = torchvision.transforms.Resize((250, 250))
    input_img = resize_transform(input_img)
    x = transform(input_img)

    attention_rollout = VITAttentionGradRollout(model, head_fusion=fusion)

    y = model(x)
    mask = attention_rollout.get_attn_mask()
    attn_img = overlay_attn(input_img[0].permute(1, 2, 0), mask)

    # Plot the embedding reshaped to a 16 x N grid (16x48 for base, 16x64 for large)
    fig = plt.figure()
    ax = fig.subplots()
    im = ax.matshow(y.detach().numpy().reshape(16, -1))
    plt.colorbar(im)

    return attn_img, fig


model_type = gr.Dropdown(
    ["vc1-base", "vc1-large"], label="Model Size", value="vc1-large")
input_img = gr.Image(shape=(250, 250))
input_button = gr.Radio(["min", "max", "mean"], value="min", label="Attention Head Fusion",
                        info="How to combine the last-layer attention across the transformer's attention heads.")
output_img = gr.Image(shape=(250, 250))
output_plot = gr.Plot()

markdown = "This is a demo for the Visual Cortex models. Given an input image, it displays the attention of the last transformer layer.\n \
            The user can choose how the attention heads are combined. \
            Along with the attention heatmap, it also shows the embedding values reshaped to a 16x48 (base) or 16x64 (large) grid."
demo = gr.Interface(fn=run_attn, title="Visual Cortex Large Model", description=markdown,
                    examples=[[os.path.join('./imgs', x), None, None] for x in os.listdir(os.path.join(os.getcwd(), 'imgs')) if 'jpg' in x],
                    inputs=[input_img, model_type, input_button], outputs=[output_img, output_plot])
demo.launch()
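Usage note (illustrative, not part of the commit): run_attn can also be exercised outside the Gradio UI. The sketch below is a minimal example under a few assumptions: HF_ACC_TOKEN is exported, the vc_models package is installed, an imgs/kitchen.jpg file exists, and demo.launch() has been moved under an if __name__ == "__main__": guard so that importing app does not immediately start the server.

# Illustrative sketch only: call run_attn() directly and save its outputs.
# Assumes HF_ACC_TOKEN is set, vc_models is installed, imgs/kitchen.jpg exists,
# and app.py guards demo.launch() behind `if __name__ == "__main__":`.
import numpy as np
from PIL import Image

from app import run_attn

img = np.array(Image.open("imgs/kitchen.jpg").convert("RGB"))   # H x W x 3 uint8
overlay, embedding_fig = run_attn(img, model="vc1-base", fusion="mean")

overlay.save("attention_overlay.png")        # PIL image with the attention heatmap blended in
embedding_fig.savefig("embedding_grid.png")  # 16x48 (base) or 16x64 (large) embedding plot

As committed, app.py calls demo.launch() at import time, so without that guard the import itself would also build and launch the interface.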
attn_helper.py ADDED
@@ -0,0 +1,113 @@
import cv2
from PIL import Image
import numpy as np
import torch

import PIL


def overlay_attn(original_image, mask):
    # Colormap and alpha for the attention mask
    colormap_attn, alpha_attn = cv2.COLORMAP_JET, 1  # 0.85

    # Resize mask to the original image size
    w, h = original_image.shape[0], original_image.shape[1]
    mask = cv2.resize(mask / mask.max(), (h, w))[..., np.newaxis]

    # Apply colormap to mask
    cmap = cv2.applyColorMap(np.uint8(255 * mask), colormap_attn)

    # Blend mask and original image
    # (alternative: blend the colormap over a grayscale copy of the image)
    alpha_blended = cv2.addWeighted(np.uint8(original_image), 0.1, cmap, 0.9, 0)

    # Return the blended result as a PIL image
    final_im = Image.fromarray(alpha_blended)
    return final_im


class VITAttentionGradRollout:
    '''
    Expects a timm ViT transformer model.
    Adapted from https://github.com/samiraabnar/attention_flow
    '''
    def __init__(self, model, head_fusion='min', discard_ratio=0):
        self.model = model
        self.head_fusion = head_fusion
        self.discard_ratio = discard_ratio

        # Register a forward hook on every attention block to capture its attention map
        self.attentions = {}
        for idx, module in enumerate(list(model.blocks.children())):
            module.attn.register_forward_hook(self.get_attention(f"attn{idx}"))

    def get_attention(self, name):
        def hook(module, input, output):
            with torch.no_grad():
                input = input[0]
                B, N, C = input.shape
                qkv = (
                    module.qkv(input)
                    .detach()
                    .reshape(B, N, 3, module.num_heads, C // module.num_heads)
                    .permute(2, 0, 3, 1, 4)
                )
                q, k, _ = (
                    qkv[0],
                    qkv[1],
                    qkv[2],
                )  # make torchscript happy (cannot use tensor as tuple)
                attn = (q @ k.transpose(-2, -1)) * module.scale
                attn = attn.softmax(dim=-1)
                self.attentions[name] = attn
        return hook

    def get_attn_mask(self, k=0):
        result = torch.eye(self.attentions['attn0'].size(-1)).to(self.attentions['attn0'].device)

        with torch.no_grad():
            # Accumulate the rollout over the later blocks (index 11 onward)
            for k in range(11, len(self.attentions.keys())):
                attention = self.attentions[f'attn{k}']
                if self.head_fusion == "mean":
                    attention_heads_fused = attention.mean(axis=1)
                elif self.head_fusion == "max":
                    attention_heads_fused = attention.max(axis=1)[0]
                elif self.head_fusion == "min":
                    attention_heads_fused = attention.min(axis=1)[0]
                else:
                    raise ValueError("Attention head fusion type not supported")

                # Drop the lowest attentions, but don't drop the class token
                flat = attention_heads_fused.view(attention_heads_fused.size(0), -1)
                _, indices = flat.topk(int(flat.size(-1) * self.discard_ratio), -1, False)
                indices = indices[indices != 0]
                flat[0, indices] = 0
                I = torch.eye(attention_heads_fused.size(-1)).to(attention_heads_fused.device)
                a = (attention_heads_fused + 1.0 * I) / 2
                a = a / a.sum(dim=-1).unsqueeze(-1)

                result = torch.matmul(a, result)

        # Look at the total attention between the class token and the image patches
        mask = result[0, 0, 1:]
        # For a 224x224 image this turns the 196-long patch vector into a 14x14 map
        width = int(mask.size(-1) ** 0.5)
        mask = mask.reshape(width, width).detach().cpu().numpy()
        mask = mask / np.max(mask)
        return mask
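For context, get_attn_mask follows the standard attention-rollout recipe over the later blocks: the per-layer attention maps are fused across heads, averaged with the identity matrix to account for residual connections, row-normalized, and chained together by matrix multiplication; the class-token row of the product then gives a patch-level saliency map. The sketch below (illustrative, not part of the commit) drives the helper with a plain timm ViT instead of the vc_models-instantiated checkpoint; the timm model name and the random dummy input are assumptions for demonstration only.

# Illustrative sketch only: attention rollout on a plain timm ViT.
import timm
import torch

from attn_helper import VITAttentionGradRollout, overlay_attn

model = timm.create_model("vit_base_patch16_224", pretrained=False)  # any timm ViT with .blocks
model.eval()

rollout = VITAttentionGradRollout(model, head_fusion="mean")

img = torch.rand(1, 3, 224, 224)   # dummy input; a real image should be resized and normalized
with torch.no_grad():
    _ = model(img)                 # the forward pass fills rollout.attentions via the hooks

mask = rollout.get_attn_mask()     # 14x14 patch-level attention map, normalized to [0, 1]
overlay = overlay_attn(img[0].permute(1, 2, 0).numpy() * 255, mask)
overlay.save("rollout_demo.png")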
imgs/adroit1.jpg ADDED
imgs/cheetah.jpg ADDED
imgs/ego4d.jpg ADDED
imgs/ego4d_2.jpg ADDED
imgs/ego4d_3.jpg ADDED
imgs/kitchen.jpg ADDED
imgs/reacher.jpg ADDED
imgs/rearrange.jpg ADDED
imgs/trifinger1.jpg ADDED
imgs/walker.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,10 @@
omegaconf
pillow
opencv-python
torch
numpy
hydra-core
gradio
huggingface_hub
matplotlib
git+https://github.com/facebookresearch/eai-vc.git@main#subdirectory=vc_models