File size: 1,411 Bytes
c178b11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
749cd3a
c178b11
 
 
749cd3a
c178b11
 
749cd3a
c178b11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python3

from transformers.utils import logging
logging.set_verbosity_error()

from transformers import BlipForImageTextRetrieval
from transformers import AutoProcessor
from PIL import Image
import math, random, time
# import random
# import time
import torch


# Multi-modal model: it takes text and image content together
# (other multi-modal models may also take audio, etc.).
# The same checkpoint id is used for both the model and its processor.
_CHECKPOINT = "Salesforce/blip-itm-base-coco"
_IMAGE_PATH = './assets/pot-o-gold-my-little-pony-Derpy.jpeg'

print("loading model ...")
model = BlipForImageTextRetrieval.from_pretrained(_CHECKPOINT)
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# The image is converted to RGB mode once, up front, and reused afterwards.
print("loading image ...")
raw_image = Image.open(_IMAGE_PATH).convert('RGB')

print("processing ...")
# Candidate captions; each loop iteration scores one of them against raw_image.
statements = [
    "an image of a horse",
    "a horse and a rainbow",
    "a pony and a rainbow",
    "a unicorn and a rainbow",
    "a pony in a forest",
    # NOTE(review): "rainbox" looks like a typo for "rainbow"; left unchanged
    # because this literal is fed to the model -- confirm intent.
    "a rainbox over a lake",
    "a horse running through the forest",
    "two eyes that do not match",
    "equine joy",
    "a stallion and gold coins",
    "a mare and gold coins",
]

# Endless demo loop: pick a random statement, score it, print the result.
# There is no exit condition; stop the script with Ctrl-C.
while True:
    text = random.choice(statements)

    inputs = processor(images=raw_image,
                       text=text,
                       return_tensors="pt")  # PyTorch tensors

    # Inference only: nothing is back-propagated, so skip autograd tracking
    # to avoid building a computation graph on every iteration.
    with torch.no_grad():
        itm_scores = model(**inputs)[0]

    # Softmax over dim 1 turns the raw scores into probabilities.
    # NOTE(review): column 1 is presumably the "text matches image" class,
    # per the BLIP ITM examples -- confirm against the model documentation.
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)

    print(f"""'{text}' => {itm_score[0][1]:.2f}""")