import os
import json
import argparse

import clip
import ruamel.yaml as yaml
import torch
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm

from albef.utils import *
from executor import AlbefExecutor
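
# Binary forced-choice evaluation: each example pairs one image with two
# candidate captions, or one caption with two candidate images. CLIP (or ALBEF,
# via its ITM or ITC output) scores both pairings, and an example counts as
# correct when the first-listed pairing scores higher.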

parser = argparse.ArgumentParser()
parser.add_argument("--input_path", type=str, help="Path to input JSON file")
parser.add_argument("--image_root", type=str, help="Path to directory containing images")
parser.add_argument("--albef_path", type=str, default=None, help="Path to ALBEF model/config/etc. if the goal is to use ALBEF")
parser.add_argument("--albef_itc", action="store_true", help="Use ITC output of ALBEF")
parser.add_argument("--clip_model", type=str, help="CLIP model to use")
parser.add_argument("--gpu", type=int, default=-1, help="Which gpu to use")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size for running CLIP")
args = parser.parse_args()
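
# Fall back to CPU when no GPU index is given, mirroring the fallback the ALBEF
# branch already uses; the same device is used for both model branches below.
device = "cpu" if args.gpu < 0 else "cuda:" + str(args.gpu)
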
if args.albef_path is not None:
    # ALBEF: load checkpoint and config from the given directory via the executor.
    executor = AlbefExecutor(
        checkpoint_path=os.path.join(args.albef_path, "checkpoint.pth"),
        config_path=os.path.join(args.albef_path, "config.yaml"),
        device=device,
    )
    model = executor.models[0]
    preprocess = executor.preprocesses[0]
    model = model.eval()
else:
    # CLIP: load the requested model and force a square resize to its input resolution.
    model, preprocess = clip.load(args.clip_model, jit=False, device=device)
    preprocess.transforms[0] = transforms.Resize(
        (model.visual.input_resolution, model.visual.input_resolution),
        transforms.InterpolationMode.BICUBIC,
    )
    model = model.eval()

with open(args.input_path) as input_file:
    data = json.load(input_file)

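# Each datum provides "image_filename" and "text1", plus either "text2" (two
# candidate captions for one image) or "image_filename2" (two candidate images
# for one caption); the first-listed candidate is treated as the correct one.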
correct = 0
for i in tqdm(range(0, len(data), args.batch_size)):
    batch_images = []
    batch_text = []
    for datum in data[i:min(i + args.batch_size, len(data))]:
        img = Image.open(os.path.join(args.image_root, datum["image_filename"])).convert('RGB')
        batch_images.append(preprocess(img))
        if "text2" in datum:
            # One image, two candidate captions; prefix CLIP captions with "a photo of".
            if args.albef_path is None:
                datum["text1"] = "a photo of " + datum["text1"]
                datum["text2"] = "a photo of " + datum["text2"]
            batch_text.append(datum["text1"])
            batch_text.append(datum["text2"])
        else:
            # One caption, two candidate images.
            img2 = Image.open(os.path.join(args.image_root, datum["image_filename2"])).convert('RGB')
            batch_images.append(preprocess(img2))
            batch_text.append(datum["text1"])
    batch_images = torch.stack(batch_images).to(device)

    # CLIP uses its own tokenizer; ALBEF cleans captions and uses the executor's tokenizer.
    if args.albef_path is None:
        batch_text = clip.tokenize(batch_text).to(device)
    else:
        modified_text = [pre_caption(txt, executor.max_words) for txt in batch_text]
        batch_text = executor.tokenizer(modified_text, padding='longest', return_tensors="pt")
        for key in batch_text:
            batch_text[key] = batch_text[key].to(batch_images.device)

    with torch.no_grad():
        if args.albef_path is None:
            logits_per_image, logits_per_text = model(batch_images, batch_text)
        else:
            if not args.albef_itc:
                # ITM: score every image-text pair individually, so repeat the
                # smaller side of the batch to align images with texts.
                if batch_images.shape[0] * 2 == batch_text.input_ids.shape[0]:
                    batch_images = batch_images.unsqueeze(1).repeat(1, 2, 1, 1, 1).view(
                        batch_images.shape[0] * 2, batch_images.shape[1], batch_images.shape[2], batch_images.shape[3]
                    )
                else:
                    assert batch_images.shape[0] == 2 * batch_text.input_ids.shape[0]
                    batch_text.input_ids = batch_text.input_ids.unsqueeze(1).repeat(1, 2, 1).view(batch_images.shape[0], -1)
                    batch_text.attention_mask = batch_text.attention_mask.unsqueeze(1).repeat(1, 2, 1).view(batch_images.shape[0], -1)
                image_embeds = model.visual_encoder(batch_images)
                image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(batch_images.device)
                output = model.text_encoder(
                    batch_text.input_ids,
                    attention_mask=batch_text.attention_mask,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_atts,
                    return_dict=True,
                )
                # Keep the "match" logit from the ITM head and group consecutive
                # rows into pairs, one row of two candidates per example.
                vl_embeddings = output.last_hidden_state[:, 0, :]
                vl_output = model.itm_head(vl_embeddings)
                logits_per_image = vl_output[:, 1:2].view(-1, 2)
            else:
                # ITC: cosine similarity between projected image and text features,
                # scaled by the model's temperature parameter.
                image_embeds = model.visual_encoder(batch_images)
                image_feat = torch.nn.functional.normalize(model.vision_proj(image_embeds[:, 0, :]), dim=-1)
                text_output = model.text_encoder(batch_text.input_ids, attention_mask=batch_text.attention_mask,
                                                 return_dict=True, mode='text')
                text_embeds = text_output.last_hidden_state
                text_feat = torch.nn.functional.normalize(model.text_proj(text_embeds[:, 0, :]), dim=-1)
                sim = image_feat @ text_feat.t() / model.temp
                logits_per_image = sim

    if args.albef_path is None or args.albef_itc:
        # CLIP and ALBEF-ITC return an image-by-text similarity matrix; compare
        # each example's true pairing against its distractor.
        if logits_per_image.shape[0] * 2 == logits_per_image.shape[1]:
            for j in range(logits_per_image.shape[0]):
                correct += 1 if logits_per_image[j, 2 * j].item() > logits_per_image[j, 2 * j + 1].item() else 0
        else:
            assert logits_per_image.shape[0] == 2 * logits_per_image.shape[1]
            for j in range(logits_per_image.shape[1]):
                correct += 1 if logits_per_image[2 * j, j].item() > logits_per_image[2 * j + 1, j].item() else 0
    else:
        # ALBEF-ITM returns one row of two match scores per example, with the
        # correct candidate in column 0.
        correct += (logits_per_image[:, 0] > logits_per_image[:, 1]).long().sum().item()

print("Accuracy:", correct / len(data))