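"""Zero-shot pairing evaluation for CLIP or ALBEF.

Each entry in the input JSON names either one image with two candidate captions
("text1", "text2") or two candidate images ("image_filename", "image_filename2")
with one caption; the first candidate is treated as the correct one. The script
reports the fraction of entries for which the model scores the correct pairing
higher than the distractor.
"""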
import os
import clip
import json
import argparse
import ruamel.yaml as yaml
from PIL import Image
import torch
import torchvision.transforms as transforms
from tqdm import tqdm
from albef.utils import *
from executor import AlbefExecutor
parser = argparse.ArgumentParser()
parser.add_argument("--input_path", type=str, help="Path to input JSON file")
parser.add_argument("--image_root", type=str, help="Path to directory containing images")
parser.add_argument("--albef_path", type=str, default=None, help="Path to ALBEF model/config/etc. if the goal is to use ALBEF")
parser.add_argument("--albef_itc", action="store_true", help="Use ITC output of ALBEF")
parser.add_argument("--clip_model", type=str, help="CLIP model to use")
parser.add_argument("--gpu", type=int, default=-1, help="Which gpu to use")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size for running CLIP")
args = parser.parse_args()
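
# Choose the compute device and load ALBEF (when --albef_path is given) or CLIP.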
device = "cpu" if args.gpu < 0 else "cuda:" + str(args.gpu)
if args.albef_path is not None:
    executor = AlbefExecutor(checkpoint_path=os.path.join(args.albef_path, "checkpoint.pth"),
                             config_path=os.path.join(args.albef_path, "config.yaml"),
                             device=device)
    model = executor.models[0]
    preprocess = executor.preprocesses[0]
    model = model.eval()
else:
    model, preprocess = clip.load(args.clip_model, jit=False, device=device)
    # Resize both sides to the model's input resolution rather than only the shorter side.
    preprocess.transforms[0] = transforms.Resize((model.visual.input_resolution, model.visual.input_resolution),
                                                 transforms.InterpolationMode.BICUBIC)
    model = model.eval()

with open(args.input_path) as input_file:
    data = json.load(input_file)

correct = 0
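
# Build batches: each datum contributes either one image with two candidate captions
# ("text2" present) or two candidate images ("image_filename2" present) with one caption.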
for i in tqdm(range(0, len(data), args.batch_size)):
    batch_images = []
    batch_text = []
    for datum in data[i:min(i + args.batch_size, len(data))]:
        img = Image.open(os.path.join(args.image_root, datum["image_filename"])).convert('RGB')
        batch_images.append(preprocess(img))
        if "text2" in datum:
            if args.albef_path is None:
                datum["text1"] = "a photo of " + datum["text1"]
                datum["text2"] = "a photo of " + datum["text2"]
            batch_text.append(datum["text1"])
            batch_text.append(datum["text2"])
        else:
            img2 = Image.open(os.path.join(args.image_root, datum["image_filename2"])).convert('RGB')
            batch_images.append(preprocess(img2))
            batch_text.append(datum["text1"])
    batch_images = torch.stack(batch_images).to(device)
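    # Tokenize captions: CLIP's tokenizer, or ALBEF's tokenizer after pre_caption cleanup.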
    if args.albef_path is None:
        batch_text = clip.tokenize(batch_text).to(device)
    else:
        modified_text = [pre_caption(txt, executor.max_words) for txt in batch_text]
        batch_text = executor.tokenizer(modified_text, padding='longest', return_tensors="pt")
        for key in batch_text:
            batch_text[key] = batch_text[key].to(batch_images.device)
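    # Score every image-text pair without tracking gradients.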
    with torch.no_grad():
        if args.albef_path is None:
            logits_per_image, logits_per_text = model(batch_images, batch_text)
        else:
            if not args.albef_itc:
                # ITM head: build one row per image-text pair by duplicating whichever
                # side of the batch is not already doubled.
                if batch_images.shape[0] * 2 == batch_text.input_ids.shape[0]:
                    batch_images = batch_images.unsqueeze(1).repeat(1, 2, 1, 1, 1).view(
                        batch_images.shape[0] * 2, batch_images.shape[1], batch_images.shape[2], batch_images.shape[3])
                else:
                    assert batch_images.shape[0] == 2 * batch_text.input_ids.shape[0]
                    batch_text.input_ids = batch_text.input_ids.unsqueeze(1).repeat(1, 2, 1).view(batch_images.shape[0], -1)
                    batch_text.attention_mask = batch_text.attention_mask.unsqueeze(1).repeat(1, 2, 1).view(batch_images.shape[0], -1)
                image_embeds = model.visual_encoder(batch_images)
                image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(batch_images.device)
                output = model.text_encoder(
                    batch_text.input_ids,
                    attention_mask=batch_text.attention_mask,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_atts,
                    return_dict=True,
                )
                vl_embeddings = output.last_hidden_state[:, 0, :]
                vl_output = model.itm_head(vl_embeddings)
                # Keep the "match" logit and arrange each datum's two candidates side by side.
                logits_per_image = vl_output[:, 1:2].view(-1, 2)
            else:
                # ITC head: cosine similarity between projected image and text features, scaled by temperature.
                image_embeds = model.visual_encoder(batch_images)
                image_feat = torch.nn.functional.normalize(model.vision_proj(image_embeds[:, 0, :]), dim=-1)
                text_output = model.text_encoder(batch_text.input_ids, attention_mask=batch_text.attention_mask,
                                                 return_dict=True, mode='text')
                text_embeds = text_output.last_hidden_state
                text_feat = torch.nn.functional.normalize(model.text_proj(text_embeds[:, 0, :]), dim=-1)
                logits_per_image = image_feat @ text_feat.t() / model.temp
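    # A datum counts as correct when its true pairing scores higher than the distractor.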
    if args.albef_path is None or args.albef_itc:
        if logits_per_image.shape[0] * 2 == logits_per_image.shape[1]:
            # One image, two texts: columns 2j and 2j+1 hold the two captions for image j.
            for j in range(logits_per_image.shape[0]):
                correct += 1 if logits_per_image[j, 2 * j].item() > logits_per_image[j, 2 * j + 1].item() else 0
        else:
            assert logits_per_image.shape[0] == 2 * logits_per_image.shape[1]
            # Two images, one text: rows 2j and 2j+1 hold the two images for caption j.
            for j in range(logits_per_image.shape[1]):
                correct += 1 if logits_per_image[2 * j, j].item() > logits_per_image[2 * j + 1, j].item() else 0
    else:
        # ALBEF ITM: column 0 is the first (correct) candidate, column 1 the distractor.
        correct += (logits_per_image[:, 0] > logits_per_image[:, 1]).long().sum().item()

print("Accuracy:", correct / len(data))