cantremember's picture
print needs parens
749cd3a
#!/usr/bin/env python3
from transformers.utils import logging
logging.set_verbosity_error()
from transformers import BlipForImageTextRetrieval
from transformers import AutoProcessor
from PIL import Image
import math, random, time
# import random
# import time
import torch
# Multi-modal model demo.
# The model accepts both text and image content (or audio, etc.); here a
# BLIP image-text-retrieval checkpoint scores candidate captions against a
# single image.
print("loading model ...")
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

print("loading image ...")
# Image.open() is lazy and keeps the underlying file open; the context
# manager releases the handle as soon as the RGB copy has been created.
with Image.open('./assets/pot-o-gold-my-little-pony-Derpy.jpeg') as image_file:
    raw_image = image_file.convert('RGB')

print("processing ...")
# Candidate captions; one is picked at random on every loop iteration.
statements = [
    "an image of a horse",
    "a horse and a rainbow",
    "a pony and a rainbow",
    "a unicorn and a rainbow",
    "a pony in a forest",
    # NOTE(review): "rainbox" looks like a typo for "rainbow" -- left as-is
    # because changing it alters the prompt fed to the model; confirm.
    "a rainbox over a lake",
    "a horse running through the forest",
    "two eyes that do not match",
    "equine joy",
    "a stallion and gold coins",
    "a mare and gold coins"
]

# Endless demo loop: there is no exit condition, so it runs until the
# process is interrupted (e.g. Ctrl-C).
while True:
    text = random.choice(statements)
    inputs = processor(images=raw_image,
                       text=text,
                       return_tensors="pt")  # PyTorch tensors
    # Pure inference: skip autograd bookkeeping so no graph is built (and
    # no extra memory held) on each pass through the loop.
    with torch.no_grad():
        itm_scores = model(**inputs)[0]
    # Normalise the raw scores along dim 1 into probabilities.
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
    # Column 1 is presumably the "caption matches image" class -- TODO
    # confirm against the BLIP ITM head documentation.
    print(f"""'{text}' => {itm_score[0][1]:.2f}""")