paul hilders committed · Commit 2dec07d
Parent(s): c148272

Added first application file

Files changed:
- CLIP_explainability/Transformer-MM-Explainability +1 -0
- app.py +57 -0
CLIP_explainability/Transformer-MM-Explainability ADDED
@@ -0,0 +1 @@
+Subproject commit 6a2c3c9da3fc186878e0c2bcf238c3a4c76d8af8
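This file is a git submodule pointer: app.py (below) appends this directory to sys.path and imports CLIP from inside it, so the checkout has to exist at runtime. A minimal, hypothetical guard, not part of this commit, that would fail fast if the submodule were missing:

# Hypothetical guard, not part of this commit: fail fast if the submodule
# checkout that app.py imports from is missing.
from pathlib import Path

SUBMODULE = Path("CLIP_explainability/Transformer-MM-Explainability")

if not (SUBMODULE / "CLIP").is_dir():
    raise RuntimeError(
        f"Submodule not found at {SUBMODULE}; "
        "run `git submodule update --init` before starting the app."
    )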
app.py ADDED
@@ -0,0 +1,57 @@
+import sys
+import gradio as gr
+
+# sys.path.append("../")
+sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
+
+import torch
+import CLIP.clip as clip
+
+
+from clip_grounding.utils.image import pad_to_square
+from clip_grounding.datasets.png import (
+    overlay_relevance_map_on_image,
+)
+from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
+
+clip.clip._MODELS = {
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+}
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+
+# Gradio Section:
+def run_demo(image, text):
+    orig_image = pad_to_square(image)
+    img = preprocess(orig_image).unsqueeze(0).to(device)
+    text_input = clip.tokenize([text]).to(device)
+
+    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)
+
+    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device, show=False)
+    overlapped = overlay_relevance_map_on_image(image, image_relevance)
+
+    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0], show=False)
+
+    highlighted_text = []
+    for i, token in enumerate(text_tokens_decoded):
+        highlighted_text.append((str(token), float(text_scores[i])))
+
+    return overlapped, highlighted_text
+
+input_img = gr.inputs.Image(type='pil', label="Original Image")
+input_txt = "text"
+inputs = [input_img, input_txt]
+
+outputs = [gr.outputs.Image(type='pil', label="Output Image"), "highlight"]
+
+
+iface = gr.Interface(fn=run_demo,
+                     inputs=inputs,
+                     outputs=outputs,
+                     title="CLIP Grounding Explainability",
+                     description="A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.",
+                     examples=[["harrypotter.png", "Harry"], ["harrypotter.png", "Hermione"], ["harrypotter.png", "Ron"], ["Amsterdam.png", "Amsterdam canal"], ["Amsterdam.png", "Old buildings"], ["Amsterdam.png", "Pink flowers"], ["dogs_on_bed.png", "Two dogs"], ["dogs_on_bed.png", "Book"], ["dogs_on_bed.png", "Cat"], ["Solar_system.png", "Sun"], ["Solar_system.png", "Earth"]])
+iface.launch(debug=True)
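For reference, a hypothetical way to exercise run_demo outside the web UI, assuming the function and the bundled example images are available in the current Python session (importing app.py as-is would also call iface.launch, so a notebook that re-runs the setup lines is the more likely host):

# Hypothetical smoke test, not part of this commit: call run_demo directly
# on one of the bundled example images, bypassing the Gradio UI.
from PIL import Image

image = Image.open("Amsterdam.png").convert("RGB")
overlay, highlighted = run_demo(image, "Amsterdam canal")

# `overlay` is the input image with the relevance map overlaid;
# `highlighted` is a list of (token, relevance score) pairs for the prompt.
print(highlighted)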