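"""Export per-image CLIP embeddings (plus one padding-text embedding) to a .npy file.

Each image in --image-dir is encoded with CLIPVisionModelWithProjection, the
embeddings are L2-normalized, the text embedding of a blank padding prompt is
appended as the last row, and the stacked array is written to
osp.join(--out-dir, --out-file).

Example invocation (the script name and output paths are illustrative, not
part of the original file):

    python build_image_embeddings.py \
        --model ../pretrained_models/open-ai-clip-vit-base-patch32 \
        --image-dir data/samples \
        --out-dir data \
        --out-file category_embeds.npy
"""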
import os
import os.path as osp
import argparse

import numpy as np
import torch
import tqdm
from PIL import Image
from transformers import (AutoTokenizer, AutoProcessor,
                          CLIPVisionModelWithProjection,
                          CLIPTextModelWithProjection)

if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description='Export CLIP embeddings for a directory of category images.')
    parser.add_argument(
        '--model',
        type=str,
        default='../pretrained_models/open-ai-clip-vit-base-patch32',
        help='path or name of the CLIP checkpoint')
    parser.add_argument('--image-dir', type=str, default='data/samples',
                        help='directory of category images')
    parser.add_argument('--out-dir', type=str, default='',
                        help='directory for the output .npy file')
    parser.add_argument('--out-file', type=str, required=True,
                        help='name of the output .npy file')

    args = parser.parse_args()

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    processor = AutoProcessor.from_pretrained(args.model)
    vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model)
    text_model = CLIPTextModelWithProjection.from_pretrained(args.model)
    vision_model.to(device).eval()
    text_model.to(device).eval()

    # embedding of the padding prompt (a single blank text)
    texts = tokenizer(text=[' '], return_tensors='pt', padding=True)
    texts = texts.to(device)
    with torch.no_grad():
        text_outputs = text_model(**texts)
    txt_feats = text_outputs.text_embeds
    # L2-normalize and keep a single (embed_dim,) vector so it stacks with
    # the per-image embeddings below
    txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
    txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])[0].cpu().numpy()

    images = sorted(os.listdir(args.image_dir))
    category_embeds = []

    def _forward_vision_model(image_name):
        """Encode one image and append its L2-normalized embedding."""
        image_path = osp.join(args.image_dir, image_name)
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        with torch.no_grad():
            image_outputs = vision_model(**inputs)
        img_feats = image_outputs.image_embeds
        img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True)
        img_feats = img_feats.reshape(
            -1, img_feats.shape[-1])[0].cpu().numpy()
        category_embeds.append(img_feats)

    for image_ in tqdm.tqdm(images):
        _forward_vision_model(image_)

    # append the padding-text embedding as the last row and save
    category_embeds.append(txt_feats)
    category_embeds = np.stack(category_embeds)
    np.save(osp.join(args.out_dir, args.out_file), category_embeds)
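    # The saved array has shape (num_images + 1, embed_dim); the last row is
    # the padding-text embedding. Illustrative consumer (the file name depends
    # on --out-file):
    #   embeds = np.load('category_embeds.npy')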