kokuma committed on
Commit
c1ba109
·
verified ·
1 Parent(s): 353f0f6

Default to English

Browse files
Files changed (1) hide show
  1. app.py +1221 -67
app.py CHANGED
@@ -8,20 +8,1024 @@ import open_clip
8
  from PIL import Image
9
  import requests
10
  import torch
 
11
  # import torch.nn.functional as F
12
  import numpy as np
13
 
14
 
15
  # GLOBAL VARIABLES
16
- openai_en_classes = ["tench", "goldfish", "great white shark", "tiger shark", "hammerhead shark", "electric ray", "stingray", "rooster", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "American robin", "bulbul", "jay", "magpie", "chickadee", "American dipper", "kite (bird of prey)", "bald eagle", "vulture", "great grey owl", "fire salamander", "smooth newt", "newt", "spotted salamander", "axolotl", "American bullfrog", "tree frog", "tailed frog", "loggerhead sea turtle", "leatherback sea turtle", "mud turtle", "terrapin", "box turtle", "banded gecko", "green iguana", "Carolina anole", "desert grassland whiptail lizard", "agama", "frilled-necked lizard", "alligator lizard", "Gila monster", "European green lizard", "chameleon", "Komodo dragon", "Nile crocodile", "American alligator", "triceratops", "worm snake", "ring-necked snake", "eastern hog-nosed snake", "smooth green snake", "kingsnake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "African rock python", "Indian cobra", "green mamba", "sea snake", "Saharan horned viper", "eastern diamondback rattlesnake", "sidewinder rattlesnake", "trilobite", "harvestman", "scorpion", "yellow garden spider", "barn spider", "European garden spider", "southern black widow", "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie grouse", "peafowl", "quail", "partridge", "african grey parrot", "macaw", "sulphur-crested cockatoo", "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "duck", "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "red king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white 
stork", "black stork", "spoonbill", "flamingo", "little blue heron", "great egret", "bittern bird", "crane bird", "limpkin", "common gallinule", "American coot", "bustard", "ruddy turnstone", "dunlin", "common redshank", "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", "Chihuahua", "Japanese Chin", "Maltese", "Pekingese", "Shih Tzu", "King Charles Spaniel", "Papillon", "toy terrier", "Rhodesian Ridgeback", "Afghan Hound", "Basset Hound", "Beagle", "Bloodhound", "Bluetick Coonhound", "Black and Tan Coonhound", "Treeing Walker Coonhound", "English foxhound", "Redbone Coonhound", "borzoi", "Irish Wolfhound", "Italian Greyhound", "Whippet", "Ibizan Hound", "Norwegian Elkhound", "Otterhound", "Saluki", "Scottish Deerhound", "Weimaraner", "Staffordshire Bull Terrier", "American Staffordshire Terrier", "Bedlington Terrier", "Border Terrier", "Kerry Blue Terrier", "Irish Terrier", "Norfolk Terrier", "Norwich Terrier", "Yorkshire Terrier", "Wire Fox Terrier", "Lakeland Terrier", "Sealyham Terrier", "Airedale Terrier", "Cairn Terrier", "Australian Terrier", "Dandie Dinmont Terrier", "Boston Terrier", "Miniature Schnauzer", "Giant Schnauzer", "Standard Schnauzer", "Scottish Terrier", "Tibetan Terrier", "Australian Silky Terrier", "Soft-coated Wheaten Terrier", "West Highland White Terrier", "Lhasa Apso", "Flat-Coated Retriever", "Curly-coated Retriever", "Golden Retriever", "Labrador Retriever", "Chesapeake Bay Retriever", "German Shorthaired Pointer", "Vizsla", "English Setter", "Irish Setter", "Gordon Setter", "Brittany dog", "Clumber Spaniel", "English Springer Spaniel", "Welsh Springer Spaniel", "Cocker Spaniel", "Sussex Spaniel", "Irish Water Spaniel", "Kuvasz", "Schipperke", "Groenendael dog", "Malinois", "Briard", "Australian Kelpie", "Komondor", "Old English Sheepdog", "Shetland Sheepdog", "collie", "Border Collie", "Bouvier des Flandres dog", "Rottweiler", "German Shepherd Dog", "Dobermann", 
"Miniature Pinscher", "Greater Swiss Mountain Dog", "Bernese Mountain Dog", "Appenzeller Sennenhund", "Entlebucher Sennenhund", "Boxer", "Bullmastiff", "Tibetan Mastiff", "French Bulldog", "Great Dane", "St. Bernard", "husky", "Alaskan Malamute", "Siberian Husky", "Dalmatian", "Affenpinscher", "Basenji", "pug", "Leonberger", "Newfoundland dog", "Great Pyrenees dog", "Samoyed", "Pomeranian", "Chow Chow", "Keeshond", "brussels griffon", "Pembroke Welsh Corgi", "Cardigan Welsh Corgi", "Toy Poodle", "Miniature Poodle", "Standard Poodle", "Mexican hairless dog (xoloitzcuintli)", "grey wolf", "Alaskan tundra wolf", "red wolf or maned wolf", "coyote", "dingo", "dhole", "African wild dog", "hyena", "red fox", "kit fox", "Arctic fox", "grey fox", "tabby cat", "tiger cat", "Persian cat", "Siamese cat", "Egyptian Mau", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah", "brown bear", "American black bear", "polar bear", "sloth bear", "mongoose", "meerkat", "tiger beetle", "ladybug", "ground beetle", "longhorn beetle", "leaf beetle", "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", "cricket insect", "stick insect", "cockroach", "praying mantis", "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "red admiral butterfly", "ringlet butterfly", "monarch butterfly", "small white butterfly", "sulphur butterfly", "gossamer-winged butterfly", "starfish", "sea urchin", "sea cucumber", "cottontail rabbit", "hare", "Angora rabbit", "hamster", "porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "common sorrel horse", "zebra", "pig", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", "ram (adult male sheep)", "bighorn sheep", "Alpine ibex", "hartebeest", "impala (antelope)", "gazelle", "arabian camel", "llama", "weasel", "mink", "European polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", 
"siamang", "guenon", "patas monkey", "baboon", "macaque", "langur", "black-and-white colobus", "proboscis monkey", "marmoset", "white-headed capuchin", "howler monkey", "titi monkey", "Geoffroy's spider monkey", "common squirrel monkey", "ring-tailed lemur", "indri", "Asian elephant", "African bush elephant", "red panda", "giant panda", "snoek fish", "eel", "silver salmon", "rock beauty fish", "clownfish", "sturgeon", "gar fish", "lionfish", "pufferfish", "abacus", "abaya", "academic gown", "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", "amphibious vehicle", "analog clock", "apiary", "apron", "trash can", "assault rifle", "backpack", "bakery", "balance beam", "balloon", "ballpoint pen", "Band-Aid", "banjo", "baluster / handrail", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", "wheelbarrow", "baseball", "basketball", "bassinet", "bassoon", "swimming cap", "bath towel", "bathtub", "station wagon", "lighthouse", "beaker", "military hat (bearskin or shako)", "beer bottle", "beer glass", "bell tower", "baby bib", "tandem bicycle", "bikini", "ring binder", "binoculars", "birdhouse", "boathouse", "bobsleigh", "bolo tie", "poke bonnet", "bookcase", "bookstore", "bottle cap", "hunting bow", "bow tie", "brass memorial plaque", "bra", "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", "high-speed train", "butcher shop", "taxicab", "cauldron", "candle", "cannon", "canoe", "can opener", "cardigan", "car mirror", "carousel", "tool kit", "cardboard box / carton", "car wheel", "automated teller machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", "mobile phone", "chain", "chain-link fence", "chain mail", "chainsaw", "storage chest", "chiffonier", "bell or wind chime", "china cabinet", "Christmas stocking", "church", "movie theater", "cleaver", "cliff dwelling", "cloak", "clogs", "cocktail shaker", "coffee mug", "coffeemaker", "spiral or coil", 
"combination lock", "computer keyboard", "candy store", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", "cowboy hat", "cradle", "construction crane", "crash helmet", "crate", "infant bed", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "rotary dial telephone", "diaper", "digital clock", "digital watch", "dining table", "dishcloth", "dishwasher", "disc brake", "dock", "dog sled", "dome", "doormat", "drilling rig", "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center", "envelope", "espresso machine", "face powder", "feather boa", "filing cabinet", "fireboat", "fire truck", "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster bed", "freight car", "French horn", "frying pan", "fur coat", "garbage truck", "gas mask or respirator", "gas pump", "goblet", "go-kart", "golf ball", "golf cart", "gondola", "gong", "gown", "grand piano", "greenhouse", "radiator grille", "grocery store", "guillotine", "hair clip", "hair spray", "half-track", "hammer", "hamper", "hair dryer", "hand-held computer", "handkerchief", "hard disk drive", "harmonica", "harp", "combine harvester", "hatchet", "holster", "home theater", "honeycomb", "hook", "hoop skirt", "gymnastic horizontal bar", "horse-drawn vehicle", "hourglass", "iPod", "clothes iron", "carved pumpkin", "jeans", "jeep", "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", "ladle", "lampshade", "laptop computer", "lawn mower", "lens cap", "letter opener", "library", "lifeboat", "lighter", "limousine", "ocean liner", "lipstick", "slip-on shoe", "lotion", "music speaker", "loupe magnifying glass", "sawmill", "magnetic compass", "messenger bag", "mailbox", "tights", "one-piece bathing suit", "manhole cover", "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine 
cabinet", "megalith", "microphone", "microwave oven", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", "mitten", "mixing bowl", "mobile home", "ford model t", "modem", "monastery", "monitor", "moped", "mortar and pestle", "graduation cap", "mosque", "mosquito net", "vespa", "mountain bike", "tent", "computer mouse", "mousetrap", "moving van", "muzzle", "metal nail", "neck brace", "necklace", "baby pacifier", "notebook computer", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "pipe organ", "oscilloscope", "overskirt", "bullock cart", "oxygen mask", "product packet / packaging", "paddle", "paddle wheel", "padlock", "paintbrush", "pajamas", "palace", "pan flute", "paper towel", "parachute", "parallel bars", "park bench", "parking meter", "railroad car", "patio", "payphone", "pedestal", "pencil case", "pencil sharpener", "perfume", "Petri dish", "photocopier", "plectrum", "Pickelhaube", "picket fence", "pickup truck", "pier", "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate ship", "drink pitcher", "block plane", "planetarium", "plastic bag", "plate rack", "farm plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", "pool table", "soda bottle", "plant pot", "potter's wheel", "power drill", "prayer rug", "printer", "prison", "missile", "projector", "hockey puck", "punching bag", "purse", "quill", "quilt", "race car", "racket", "radiator", "radio", "radio telescope", "rain barrel", "recreational vehicle", "fishing casting reel", "reflex camera", "refrigerator", "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "eraser", "rugby ball", "ruler measuring stick", "sneaker", "safe", "safety pin", "salt shaker", "sandal", "sarong", "saxophone", "scabbard", "weighing scale", "school bus", "schooner", "scoreboard", "CRT monitor", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe store", "shoji screen / room divider", "shopping basket", "shopping 
cart", "shovel", "shower cap", "shower curtain", "ski", "balaclava ski mask", "sleeping bag", "slide rule", "sliding door", "slot machine", "snorkel", "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar thermal collector", "sombrero", "soup bowl", "keyboard space bar", "space heater", "space shuttle", "spatula", "motorboat", "spider web", "spindle", "sports car", "spotlight", "stage", "steam locomotive", "through arch bridge", "steel drum", "stethoscope", "scarf", "stone wall", "stopwatch", "stove", "strainer", "tram", "stretcher", "couch", "stupa", "submarine", "suit", "sundial", "sunglasses", "sunglasses", "sunscreen", "suspension bridge", "mop", "sweatshirt", "swim trunks / shorts", "swing", "electrical switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy bear", "television", "tennis ball", "thatched roof", "front curtain", "thimble", "threshing machine", "throne", "tile roof", "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toy store", "tractor", "semi-trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "hot tub", "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright piano", "vacuum cleaner", "vase", "vaulted or arched ceiling", "velvet fabric", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "military aircraft", "sink", "washing machine", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", "hair wig", "window screen", "window shade", "Windsor tie", "wine bottle", "airplane wing", "wok", "wooden spoon", "wool", "split-rail fence", "shipwreck", "sailboat", "yurt", "website", "comic book", "crossword", "traffic or street sign", "traffic light", "dust jacket", "menu", "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "popsicle", "baguette", "bagel", "pretzel", "cheeseburger", "hot dog", "mashed potatoes", 
"cabbage", "broccoli", "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith apple", "strawberry", "orange", "lemon", "fig", "pineapple", "banana", "jackfruit", "cherimoya (custard apple)", "pomegranate", "hay", "carbonara", "chocolate syrup", "dough", "meatloaf", "pizza", "pot pie", "burrito", "red wine", "espresso", "tea cup", "eggnog", "mountain", "bubble", "cliff", "coral reef", "geyser", "lakeshore", "promontory", "sandbar", "beach", "valley", "volcano", "baseball player", "bridegroom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", "rose hip", "horse chestnut seed", "coral fungus", "agaric", "gyromitra", "stinkhorn mushroom", "earth star fungus", "hen of the woods mushroom", "bolete", "corn cob", "toilet paper"]
17
- # main_languages = ['EN'] + sorted(["WUU", "NV", "CV", "DIQ", "CHR", "CE", "HAK", "NAH", 'NE', 'ID', 'DE', 'NL', 'AF', 'HE', 'SQ', 'UZ', 'KN', 'KU', 'TA', 'LV', 'KO', 'UG', 'BR', 'EL', 'SU', 'KK', 'SK', 'GL', 'OM', 'FA', 'JV', 'CS', 'LO', 'HY', 'XH', 'HR', 'SO', 'GU', 'AM', 'AR', 'SA', 'CA', 'IS', 'IT', 'SV', 'GA', 'BG', 'VI', 'SD', 'UR', 'KM', 'PL', 'HU', 'SR', 'FR', 'HI', 'FY', 'ET', 'BS', 'SW', 'AZ', 'MK', 'ES', 'MN', 'JA', 'TL', 'TR', 'GD', 'RO', 'MG', 'MR', 'SL', 'PT', 'LT', 'NO', 'YI', 'UK', 'KY', 'KA', 'BN', 'OR', 'MY', 'PS', 'FI', 'ZH', 'DA', 'ML', 'BE', 'EO', 'HA', 'EU', 'AS', 'TE', 'TH', 'CY', 'SI', 'RU', 'LA', 'PA', 'MS'])
18
- language_names = json.load(open("data/language_mapping.json", encoding="utf-8"))
19
- main_language_values = sorted([[name, code] for code, name in language_names.items()], key=lambda x: x[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # [[main_language_names[lang], lang] for lang in main_languages+sorted(l for l in main_language_names if l not in main_languages)]
21
 
22
  babel_imagenet = json.load(open("data/babel_imagenet-298.json", encoding="utf-8"))
23
  babelnet_images = json.load(open("data/images.json", encoding="utf-8"))
24
- max_image_choices = 10 # Currently up to 30 images but relevance degrades quickly in my experience. Limiting to 10
25
  no_image_idxs = [i for i, imgs in enumerate(babelnet_images) if len(imgs) == 0]
26
  IMG_HEIGHT, IMG_WIDTH = 512, 512
27
 
@@ -29,17 +1033,22 @@ precomputed_results = None
29
  if os.path.exists("data/precomputed_results.json"):
30
  precomputed_results = json.load(open("data/precomputed_results.json"))
31
 
32
- request_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
 
 
33
  ### Loading model; hard-coded to mSigLIP for now.
34
 
35
  if not precomputed_results:
36
  open_clip_model, open_clip_pretrained = "ViT-B-16-SigLIP-i18n-256", "webli"
37
- model, _, transform = open_clip.create_model_and_transforms(open_clip_model, pretrained=open_clip_pretrained)
 
 
38
  tokenizer = open_clip.get_tokenizer(open_clip_model)
39
 
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
41
  model = model.to(device)
42
 
 
43
  def change_language(lang, randomize_imgs, randomize_labels):
44
  # compute text embeddings
45
  labels = babel_imagenet[lang][1]
@@ -55,11 +1064,23 @@ def change_language(lang, randomize_imgs, randomize_labels):
55
  text_features = text_features.cpu().numpy()
56
  else:
57
  text_features = None
58
- correct_text = gr.Text(f"Correct was: ''. Question 1/{len(babel_imagenet[lang][0])} ", label="Game")
 
 
59
  player_score_text = gr.Text(f"Your choice: (Score: 0) ", label="Player")
60
  clip_score_text = gr.Text(f"mSigLIP chose: '' (Score: 0)", label="Opponent")
61
 
62
- return text_features, -1, class_order, correct_text, player_score_text, clip_score_text, 0, 0
 
 
 
 
 
 
 
 
 
 
63
 
64
  def select(idx, lang, choice, correct, model_choice, player_score, clip_score, choices):
65
  # checks if the answer choice is correct and updates the scores
@@ -73,16 +1094,26 @@ def select(idx, lang, choice, correct, model_choice, player_score, clip_score, c
73
  player_score = player_score + int(player_correct)
74
  clip_score = clip_score + int(model_correct)
75
 
76
- correct_text = gr.Text(f"Correct was: '{correct_name}'. Question {idx+1}/{len(babel_imagenet[lang][0])} ", label="Game")
77
- player_score_text = gr.Text(f"Your choice: {player_choice} {'✅' if player_correct else '❌'} (Score: {player_score}) ", label="Player")
78
- clip_score_text = gr.Text(f"mSigLIP chose: '{model_choice_name}' {'✅' if model_correct else '❌'} (Score: {clip_score})", label="Opponent")
 
 
 
 
 
 
 
 
 
79
 
80
  return correct_text, player_score_text, clip_score_text, player_score, clip_score
81
 
 
82
  def prepare(raw_idx, lang, text_embeddings, class_order, randomize_images):
83
  # prepares the next question, loads the image, and computes the choices
84
 
85
- raw_idx = (raw_idx+1) % len(babel_imagenet[lang][0])
86
  idx = class_order[raw_idx]
87
  lang_class_idxs = babel_imagenet[lang][0]
88
  class_idx = lang_class_idxs[idx]
@@ -96,24 +1127,38 @@ def prepare(raw_idx, lang, text_embeddings, class_order, randomize_images):
96
 
97
  img_idx = 0
98
  if randomize_images:
99
- img_idx = np.random.choice(min(len(babelnet_images[class_idx]), max_image_choices))
 
 
100
  img_url = babelnet_images[class_idx][img_idx]["url"]
101
  class_labels = babel_imagenet[lang][1] if lang != "EN" else openai_en_classes
102
 
103
  if not precomputed_results:
104
  try:
105
- image_input = transform(Image.open(requests.get(img_url, stream=True, headers=request_header).raw).convert("RGB")).unsqueeze(0).to(device)
 
 
 
 
 
 
 
 
106
  with torch.no_grad():
107
  image_features = model.encode_image(image_input).float()
108
  image_features /= image_features.norm(dim=-1, keepdim=True)
109
  except:
110
  gr.Warning("There is a problem with the next class. Skipping it.")
111
- return prepare(raw_idx, lang, text_embeddings, class_order, randomize_images)
 
 
112
 
113
  similarity = (text_embeddings @ image_features.cpu().numpy().T).squeeze()
114
- choices = np.argsort(similarity)[-2:].tolist()
115
  else:
116
- choices = list(reversed(precomputed_results[lang][idx][img_idx])) # precomputing script uses torch.topk which sorts in reverse here
 
 
117
  if idx not in choices:
118
  choices = [idx] + choices[1:]
119
  model_choice_idx = choices[-1]
@@ -121,17 +1166,30 @@ def prepare(raw_idx, lang, text_embeddings, class_order, randomize_images):
121
  numpy.random.shuffle(choices)
122
 
123
  choice_names = [class_labels[idx] for idx in choices]
124
- choice_values = [0, 1]
125
 
126
  model_choice_idx = choices.index(model_choice_idx)
127
  model_choice = [choice_names[model_choice_idx], choice_values[model_choice_idx]]
128
  correct_choice_idx = choices.index(idx)
129
- correct_choice = [choice_names[correct_choice_idx], choice_values[correct_choice_idx]]
 
 
 
130
 
131
  choice_values = list(zip(choice_names, choice_values))
132
 
133
- next_radio = gr.Radio(choices=choice_values, interactive=True, label="Select the correct answer:", value=None)
134
- next_image = gr.Image(value=img_url, width=IMG_WIDTH, height=IMG_WIDTH, label="What class does this image belong to?")
 
 
 
 
 
 
 
 
 
 
135
 
136
  return next_radio, next_image, raw_idx, correct_choice, model_choice, choice_values
137
 
@@ -145,24 +1203,38 @@ def reroll(raw_idx, lang, text_embeddings, class_order, randomize_images):
145
 
146
  img_idx = 0
147
  if randomize_images:
148
- img_idx = np.random.choice(min(len(babelnet_images[class_idx]), max_image_choices))
 
 
149
  img_url = babelnet_images[class_idx][img_idx]["url"]
150
  class_labels = babel_imagenet[lang][1] if lang != "EN" else openai_en_classes
151
 
152
  if not precomputed_results:
153
  try:
154
- image_input = transform(Image.open(requests.get(img_url, stream=True, headers=request_header).raw).convert("RGB")).unsqueeze(0).to(device)
 
 
 
 
 
 
 
 
155
  with torch.no_grad():
156
  image_features = model.encode_image(image_input).float()
157
  image_features /= image_features.norm(dim=-1, keepdim=True)
158
  except:
159
  gr.Warning("There is a problem with the next class. Skipping it.")
160
- return prepare(raw_idx, lang, text_embeddings, class_order, randomize_images)
 
 
161
 
162
  similarity = (text_embeddings @ image_features.cpu().numpy().T).squeeze()
163
- choices = np.argsort(similarity)[-2:].tolist()
164
  else:
165
- choices = list(reversed(precomputed_results[lang][idx][img_idx])) # precomputing script uses torch.topk which sorts in reverse here
 
 
166
  if idx not in choices:
167
  choices = [idx] + choices[1:]
168
  model_choice_idx = choices[-1]
@@ -170,22 +1242,35 @@ def reroll(raw_idx, lang, text_embeddings, class_order, randomize_images):
170
  numpy.random.shuffle(choices)
171
 
172
  choice_names = [class_labels[idx] for idx in choices]
173
- choice_values = [0, 1]
174
 
175
  model_choice_idx = choices.index(model_choice_idx)
176
  model_choice = [choice_names[model_choice_idx], choice_values[model_choice_idx]]
177
  correct_choice_idx = choices.index(idx)
178
- correct_choice = [choice_names[correct_choice_idx], choice_values[correct_choice_idx]]
 
 
 
179
 
180
  choice_values = list(zip(choice_names, choice_values))
181
 
182
- next_radio = gr.Radio(choices=choice_values, interactive=True, label="Select the correct answer:", value=None)
183
- next_image = gr.Image(value=img_url, width=IMG_WIDTH, height=IMG_WIDTH, label="What class does this image belong to?")
 
 
 
 
 
 
 
 
 
 
184
 
185
  return next_radio, next_image, raw_idx, correct_choice, model_choice, choice_values
186
 
187
 
188
- with (gr.Blocks(title="Babel-ImageNet Quiz") as demo):
189
 
190
  # setup state
191
  class_idx = gr.State(-1)
@@ -195,11 +1280,12 @@ with (gr.Blocks(title="Babel-ImageNet Quiz") as demo):
195
  choices = gr.State([])
196
 
197
  text_embeddings = gr.State(None)
198
- correct_choice = gr.State(["nan", 0]) # 0, 1, 2, 3
199
  model_choice = gr.State(["nan", 0])
200
 
201
  # Title Area
202
- gr.Markdown("""
 
203
  # Are you smarter🤓 than CLIP🤖? Take the [ Babel-ImageNet ](https://arxiv.org/abs/2306.08658) Quiz!
204
 
205
  <small>by Gregor Geigle, WüNLP & Computer Vision Lab, University of Würzburg</small>
@@ -217,10 +1303,12 @@ Select your language, click 'Start' and start guessing! We'll keep track of your
217
  <p><b>'Who is my opponent?'</b> Your opponent CLIP model is [mSigLIP](https://huggingface.co/timm/ViT-B-16-SigLIP-i18n-256), a powerful but small multilingual model with only 370M parameters.</p>
218
  <p><b>'My game crashed/ I got an error!'</b> This usually happens because of problems with the image URLs. You can try the button to reroll the image or start a new round by clicking the 'Start' button again.</p>
219
  </details>
220
- """)
 
221
  with gr.Row():
222
  with gr.Column(scale=1):
223
- gr.Markdown("""
 
224
  <details>
225
  <summary> <b>What is CLIP? </b> (click me to read)</summary>
226
  <p>
@@ -231,9 +1319,11 @@ Select your language, click 'Start' and start guessing! We'll keep track of your
231
  Your opponent CLIP model [mSigLIP](https://arxiv.org/abs/2303.15343) in this quiz does 'zero-shot image classification': We encode all possible class labels and the image and we check which class is most similar; this is then the class chosen by CLIP.
232
  </p>
233
  </details>
234
- """)
 
235
  with gr.Column(scale=1):
236
- gr.Markdown("""
 
237
  <details>
238
  <summary> <b>What is ImageNet? </b> (click me to read)</summary>
239
  <p>
@@ -241,9 +1331,11 @@ Select your language, click 'Start' and start guessing! We'll keep track of your
241
  It is a very popular dataset used to benchmark CLIP models because strong results here usually indicates that the image model is overall usefull for many tasks.
242
  </p>
243
  </details>
244
- """)
 
245
  with gr.Column(scale=1):
246
- gr.Markdown("""
 
247
  <details>
248
  <summary> <b>What is Babel-ImageNet? </b> (click me to read)</summary>
249
  <p>
@@ -256,48 +1348,111 @@ Select your language, click 'Start' and start guessing! We'll keep track of your
256
  For more details, please read our <a href='https://arxiv.org/abs/2306.08658'>paper.</a>
257
  </p>
258
  </details>
259
- """)
 
260
  # language select dropdown
261
  with gr.Row():
262
- language_select = gr.Dropdown(choices=main_language_values, value="EN", interactive=True, label="Select your language:")
263
- randomize_classes = gr.Checkbox(label="Randomize class order (or play in canonic order)", value=True)
264
- randomize_images = gr.Checkbox(label="Randomize images (if unchecked, will always show the same image). Other images might be less relevant.", value=True)
 
 
 
 
 
 
 
 
 
 
265
  start_btn = gr.Button(value="Start", variant="primary")
266
 
267
  # quiz area
268
  with gr.Row():
269
  with gr.Column(scale=1):
270
- image = gr.Image(value="data/bin_image.png",
271
- width=IMG_WIDTH, height=IMG_WIDTH)
 
272
 
273
  with gr.Column(scale=1):
274
- options = gr.Radio(choices=["Click", "start", "to", "begin"], interactive=False, label="Please click start to begin.")
 
 
 
 
275
  # with gr.Row():
276
  correct_text = gr.Text("Please click start to begin.")
277
  player_score_text = gr.Text(f"Player score: 0")
278
  clip_score_text = gr.Text(f"mSigLIP score: 0")
279
  reroll_btn = gr.Button(value="Reroll the image (for bad images or errors)")
280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
-
284
- options.select(fn=select,
285
- inputs=[class_idx, language_select, options, correct_choice, model_choice, player_score, clip_score, choices],
286
- outputs=[correct_text, player_score_text, clip_score_text, player_score, clip_score]
287
- ).then(fn=prepare,
288
- inputs=[class_idx, language_select, text_embeddings, class_order, randomize_images],
289
- outputs=[options, image, class_idx, correct_choice, model_choice, choices])
290
-
291
- start_btn.click(fn=change_language,
292
- inputs=[language_select, randomize_images, randomize_classes],
293
- outputs=[text_embeddings, class_idx, class_order, correct_text, player_score_text, clip_score_text, player_score, clip_score]
294
- ).then(fn=prepare,
295
- inputs=[class_idx, language_select, text_embeddings, class_order, randomize_images],
296
- outputs=[options, image, class_idx, correct_choice, model_choice, choices])
297
-
298
- reroll_btn.click(fn=reroll,
299
- inputs=[class_idx, language_select, text_embeddings, class_order, randomize_images],
300
- outputs=[options, image, class_idx, correct_choice, model_choice, choices])
301
 
302
  # initialization
303
  # demo.load(fn=change_language,
@@ -308,5 +1463,4 @@ Select your language, click 'Start' and start guessing! We'll keep track of your
308
  # outputs=[options, image, class_idx, correct_choice, model_choice])
309
 
310
 
311
-
312
- demo.launch()
 
8
  from PIL import Image
9
  import requests
10
  import torch
11
+
12
  # import torch.nn.functional as F
13
  import numpy as np
14
 
15
 
16
  # GLOBAL VARIABLES
17
+ openai_en_classes = [
18
+ "tench",
19
+ "goldfish",
20
+ "great white shark",
21
+ "tiger shark",
22
+ "hammerhead shark",
23
+ "electric ray",
24
+ "stingray",
25
+ "rooster",
26
+ "hen",
27
+ "ostrich",
28
+ "brambling",
29
+ "goldfinch",
30
+ "house finch",
31
+ "junco",
32
+ "indigo bunting",
33
+ "American robin",
34
+ "bulbul",
35
+ "jay",
36
+ "magpie",
37
+ "chickadee",
38
+ "American dipper",
39
+ "kite (bird of prey)",
40
+ "bald eagle",
41
+ "vulture",
42
+ "great grey owl",
43
+ "fire salamander",
44
+ "smooth newt",
45
+ "newt",
46
+ "spotted salamander",
47
+ "axolotl",
48
+ "American bullfrog",
49
+ "tree frog",
50
+ "tailed frog",
51
+ "loggerhead sea turtle",
52
+ "leatherback sea turtle",
53
+ "mud turtle",
54
+ "terrapin",
55
+ "box turtle",
56
+ "banded gecko",
57
+ "green iguana",
58
+ "Carolina anole",
59
+ "desert grassland whiptail lizard",
60
+ "agama",
61
+ "frilled-necked lizard",
62
+ "alligator lizard",
63
+ "Gila monster",
64
+ "European green lizard",
65
+ "chameleon",
66
+ "Komodo dragon",
67
+ "Nile crocodile",
68
+ "American alligator",
69
+ "triceratops",
70
+ "worm snake",
71
+ "ring-necked snake",
72
+ "eastern hog-nosed snake",
73
+ "smooth green snake",
74
+ "kingsnake",
75
+ "garter snake",
76
+ "water snake",
77
+ "vine snake",
78
+ "night snake",
79
+ "boa constrictor",
80
+ "African rock python",
81
+ "Indian cobra",
82
+ "green mamba",
83
+ "sea snake",
84
+ "Saharan horned viper",
85
+ "eastern diamondback rattlesnake",
86
+ "sidewinder rattlesnake",
87
+ "trilobite",
88
+ "harvestman",
89
+ "scorpion",
90
+ "yellow garden spider",
91
+ "barn spider",
92
+ "European garden spider",
93
+ "southern black widow",
94
+ "tarantula",
95
+ "wolf spider",
96
+ "tick",
97
+ "centipede",
98
+ "black grouse",
99
+ "ptarmigan",
100
+ "ruffed grouse",
101
+ "prairie grouse",
102
+ "peafowl",
103
+ "quail",
104
+ "partridge",
105
+ "african grey parrot",
106
+ "macaw",
107
+ "sulphur-crested cockatoo",
108
+ "lorikeet",
109
+ "coucal",
110
+ "bee eater",
111
+ "hornbill",
112
+ "hummingbird",
113
+ "jacamar",
114
+ "toucan",
115
+ "duck",
116
+ "red-breasted merganser",
117
+ "goose",
118
+ "black swan",
119
+ "tusker",
120
+ "echidna",
121
+ "platypus",
122
+ "wallaby",
123
+ "koala",
124
+ "wombat",
125
+ "jellyfish",
126
+ "sea anemone",
127
+ "brain coral",
128
+ "flatworm",
129
+ "nematode",
130
+ "conch",
131
+ "snail",
132
+ "slug",
133
+ "sea slug",
134
+ "chiton",
135
+ "chambered nautilus",
136
+ "Dungeness crab",
137
+ "rock crab",
138
+ "fiddler crab",
139
+ "red king crab",
140
+ "American lobster",
141
+ "spiny lobster",
142
+ "crayfish",
143
+ "hermit crab",
144
+ "isopod",
145
+ "white stork",
146
+ "black stork",
147
+ "spoonbill",
148
+ "flamingo",
149
+ "little blue heron",
150
+ "great egret",
151
+ "bittern bird",
152
+ "crane bird",
153
+ "limpkin",
154
+ "common gallinule",
155
+ "American coot",
156
+ "bustard",
157
+ "ruddy turnstone",
158
+ "dunlin",
159
+ "common redshank",
160
+ "dowitcher",
161
+ "oystercatcher",
162
+ "pelican",
163
+ "king penguin",
164
+ "albatross",
165
+ "grey whale",
166
+ "killer whale",
167
+ "dugong",
168
+ "sea lion",
169
+ "Chihuahua",
170
+ "Japanese Chin",
171
+ "Maltese",
172
+ "Pekingese",
173
+ "Shih Tzu",
174
+ "King Charles Spaniel",
175
+ "Papillon",
176
+ "toy terrier",
177
+ "Rhodesian Ridgeback",
178
+ "Afghan Hound",
179
+ "Basset Hound",
180
+ "Beagle",
181
+ "Bloodhound",
182
+ "Bluetick Coonhound",
183
+ "Black and Tan Coonhound",
184
+ "Treeing Walker Coonhound",
185
+ "English foxhound",
186
+ "Redbone Coonhound",
187
+ "borzoi",
188
+ "Irish Wolfhound",
189
+ "Italian Greyhound",
190
+ "Whippet",
191
+ "Ibizan Hound",
192
+ "Norwegian Elkhound",
193
+ "Otterhound",
194
+ "Saluki",
195
+ "Scottish Deerhound",
196
+ "Weimaraner",
197
+ "Staffordshire Bull Terrier",
198
+ "American Staffordshire Terrier",
199
+ "Bedlington Terrier",
200
+ "Border Terrier",
201
+ "Kerry Blue Terrier",
202
+ "Irish Terrier",
203
+ "Norfolk Terrier",
204
+ "Norwich Terrier",
205
+ "Yorkshire Terrier",
206
+ "Wire Fox Terrier",
207
+ "Lakeland Terrier",
208
+ "Sealyham Terrier",
209
+ "Airedale Terrier",
210
+ "Cairn Terrier",
211
+ "Australian Terrier",
212
+ "Dandie Dinmont Terrier",
213
+ "Boston Terrier",
214
+ "Miniature Schnauzer",
215
+ "Giant Schnauzer",
216
+ "Standard Schnauzer",
217
+ "Scottish Terrier",
218
+ "Tibetan Terrier",
219
+ "Australian Silky Terrier",
220
+ "Soft-coated Wheaten Terrier",
221
+ "West Highland White Terrier",
222
+ "Lhasa Apso",
223
+ "Flat-Coated Retriever",
224
+ "Curly-coated Retriever",
225
+ "Golden Retriever",
226
+ "Labrador Retriever",
227
+ "Chesapeake Bay Retriever",
228
+ "German Shorthaired Pointer",
229
+ "Vizsla",
230
+ "English Setter",
231
+ "Irish Setter",
232
+ "Gordon Setter",
233
+ "Brittany dog",
234
+ "Clumber Spaniel",
235
+ "English Springer Spaniel",
236
+ "Welsh Springer Spaniel",
237
+ "Cocker Spaniel",
238
+ "Sussex Spaniel",
239
+ "Irish Water Spaniel",
240
+ "Kuvasz",
241
+ "Schipperke",
242
+ "Groenendael dog",
243
+ "Malinois",
244
+ "Briard",
245
+ "Australian Kelpie",
246
+ "Komondor",
247
+ "Old English Sheepdog",
248
+ "Shetland Sheepdog",
249
+ "collie",
250
+ "Border Collie",
251
+ "Bouvier des Flandres dog",
252
+ "Rottweiler",
253
+ "German Shepherd Dog",
254
+ "Dobermann",
255
+ "Miniature Pinscher",
256
+ "Greater Swiss Mountain Dog",
257
+ "Bernese Mountain Dog",
258
+ "Appenzeller Sennenhund",
259
+ "Entlebucher Sennenhund",
260
+ "Boxer",
261
+ "Bullmastiff",
262
+ "Tibetan Mastiff",
263
+ "French Bulldog",
264
+ "Great Dane",
265
+ "St. Bernard",
266
+ "husky",
267
+ "Alaskan Malamute",
268
+ "Siberian Husky",
269
+ "Dalmatian",
270
+ "Affenpinscher",
271
+ "Basenji",
272
+ "pug",
273
+ "Leonberger",
274
+ "Newfoundland dog",
275
+ "Great Pyrenees dog",
276
+ "Samoyed",
277
+ "Pomeranian",
278
+ "Chow Chow",
279
+ "Keeshond",
280
+ "brussels griffon",
281
+ "Pembroke Welsh Corgi",
282
+ "Cardigan Welsh Corgi",
283
+ "Toy Poodle",
284
+ "Miniature Poodle",
285
+ "Standard Poodle",
286
+ "Mexican hairless dog (xoloitzcuintli)",
287
+ "grey wolf",
288
+ "Alaskan tundra wolf",
289
+ "red wolf or maned wolf",
290
+ "coyote",
291
+ "dingo",
292
+ "dhole",
293
+ "African wild dog",
294
+ "hyena",
295
+ "red fox",
296
+ "kit fox",
297
+ "Arctic fox",
298
+ "grey fox",
299
+ "tabby cat",
300
+ "tiger cat",
301
+ "Persian cat",
302
+ "Siamese cat",
303
+ "Egyptian Mau",
304
+ "cougar",
305
+ "lynx",
306
+ "leopard",
307
+ "snow leopard",
308
+ "jaguar",
309
+ "lion",
310
+ "tiger",
311
+ "cheetah",
312
+ "brown bear",
313
+ "American black bear",
314
+ "polar bear",
315
+ "sloth bear",
316
+ "mongoose",
317
+ "meerkat",
318
+ "tiger beetle",
319
+ "ladybug",
320
+ "ground beetle",
321
+ "longhorn beetle",
322
+ "leaf beetle",
323
+ "dung beetle",
324
+ "rhinoceros beetle",
325
+ "weevil",
326
+ "fly",
327
+ "bee",
328
+ "ant",
329
+ "grasshopper",
330
+ "cricket insect",
331
+ "stick insect",
332
+ "cockroach",
333
+ "praying mantis",
334
+ "cicada",
335
+ "leafhopper",
336
+ "lacewing",
337
+ "dragonfly",
338
+ "damselfly",
339
+ "red admiral butterfly",
340
+ "ringlet butterfly",
341
+ "monarch butterfly",
342
+ "small white butterfly",
343
+ "sulphur butterfly",
344
+ "gossamer-winged butterfly",
345
+ "starfish",
346
+ "sea urchin",
347
+ "sea cucumber",
348
+ "cottontail rabbit",
349
+ "hare",
350
+ "Angora rabbit",
351
+ "hamster",
352
+ "porcupine",
353
+ "fox squirrel",
354
+ "marmot",
355
+ "beaver",
356
+ "guinea pig",
357
+ "common sorrel horse",
358
+ "zebra",
359
+ "pig",
360
+ "wild boar",
361
+ "warthog",
362
+ "hippopotamus",
363
+ "ox",
364
+ "water buffalo",
365
+ "bison",
366
+ "ram (adult male sheep)",
367
+ "bighorn sheep",
368
+ "Alpine ibex",
369
+ "hartebeest",
370
+ "impala (antelope)",
371
+ "gazelle",
372
+ "arabian camel",
373
+ "llama",
374
+ "weasel",
375
+ "mink",
376
+ "European polecat",
377
+ "black-footed ferret",
378
+ "otter",
379
+ "skunk",
380
+ "badger",
381
+ "armadillo",
382
+ "three-toed sloth",
383
+ "orangutan",
384
+ "gorilla",
385
+ "chimpanzee",
386
+ "gibbon",
387
+ "siamang",
388
+ "guenon",
389
+ "patas monkey",
390
+ "baboon",
391
+ "macaque",
392
+ "langur",
393
+ "black-and-white colobus",
394
+ "proboscis monkey",
395
+ "marmoset",
396
+ "white-headed capuchin",
397
+ "howler monkey",
398
+ "titi monkey",
399
+ "Geoffroy's spider monkey",
400
+ "common squirrel monkey",
401
+ "ring-tailed lemur",
402
+ "indri",
403
+ "Asian elephant",
404
+ "African bush elephant",
405
+ "red panda",
406
+ "giant panda",
407
+ "snoek fish",
408
+ "eel",
409
+ "silver salmon",
410
+ "rock beauty fish",
411
+ "clownfish",
412
+ "sturgeon",
413
+ "gar fish",
414
+ "lionfish",
415
+ "pufferfish",
416
+ "abacus",
417
+ "abaya",
418
+ "academic gown",
419
+ "accordion",
420
+ "acoustic guitar",
421
+ "aircraft carrier",
422
+ "airliner",
423
+ "airship",
424
+ "altar",
425
+ "ambulance",
426
+ "amphibious vehicle",
427
+ "analog clock",
428
+ "apiary",
429
+ "apron",
430
+ "trash can",
431
+ "assault rifle",
432
+ "backpack",
433
+ "bakery",
434
+ "balance beam",
435
+ "balloon",
436
+ "ballpoint pen",
437
+ "Band-Aid",
438
+ "banjo",
439
+ "baluster / handrail",
440
+ "barbell",
441
+ "barber chair",
442
+ "barbershop",
443
+ "barn",
444
+ "barometer",
445
+ "barrel",
446
+ "wheelbarrow",
447
+ "baseball",
448
+ "basketball",
449
+ "bassinet",
450
+ "bassoon",
451
+ "swimming cap",
452
+ "bath towel",
453
+ "bathtub",
454
+ "station wagon",
455
+ "lighthouse",
456
+ "beaker",
457
+ "military hat (bearskin or shako)",
458
+ "beer bottle",
459
+ "beer glass",
460
+ "bell tower",
461
+ "baby bib",
462
+ "tandem bicycle",
463
+ "bikini",
464
+ "ring binder",
465
+ "binoculars",
466
+ "birdhouse",
467
+ "boathouse",
468
+ "bobsleigh",
469
+ "bolo tie",
470
+ "poke bonnet",
471
+ "bookcase",
472
+ "bookstore",
473
+ "bottle cap",
474
+ "hunting bow",
475
+ "bow tie",
476
+ "brass memorial plaque",
477
+ "bra",
478
+ "breakwater",
479
+ "breastplate",
480
+ "broom",
481
+ "bucket",
482
+ "buckle",
483
+ "bulletproof vest",
484
+ "high-speed train",
485
+ "butcher shop",
486
+ "taxicab",
487
+ "cauldron",
488
+ "candle",
489
+ "cannon",
490
+ "canoe",
491
+ "can opener",
492
+ "cardigan",
493
+ "car mirror",
494
+ "carousel",
495
+ "tool kit",
496
+ "cardboard box / carton",
497
+ "car wheel",
498
+ "automated teller machine",
499
+ "cassette",
500
+ "cassette player",
501
+ "castle",
502
+ "catamaran",
503
+ "CD player",
504
+ "cello",
505
+ "mobile phone",
506
+ "chain",
507
+ "chain-link fence",
508
+ "chain mail",
509
+ "chainsaw",
510
+ "storage chest",
511
+ "chiffonier",
512
+ "bell or wind chime",
513
+ "china cabinet",
514
+ "Christmas stocking",
515
+ "church",
516
+ "movie theater",
517
+ "cleaver",
518
+ "cliff dwelling",
519
+ "cloak",
520
+ "clogs",
521
+ "cocktail shaker",
522
+ "coffee mug",
523
+ "coffeemaker",
524
+ "spiral or coil",
525
+ "combination lock",
526
+ "computer keyboard",
527
+ "candy store",
528
+ "container ship",
529
+ "convertible",
530
+ "corkscrew",
531
+ "cornet",
532
+ "cowboy boot",
533
+ "cowboy hat",
534
+ "cradle",
535
+ "construction crane",
536
+ "crash helmet",
537
+ "crate",
538
+ "infant bed",
539
+ "Crock Pot",
540
+ "croquet ball",
541
+ "crutch",
542
+ "cuirass",
543
+ "dam",
544
+ "desk",
545
+ "desktop computer",
546
+ "rotary dial telephone",
547
+ "diaper",
548
+ "digital clock",
549
+ "digital watch",
550
+ "dining table",
551
+ "dishcloth",
552
+ "dishwasher",
553
+ "disc brake",
554
+ "dock",
555
+ "dog sled",
556
+ "dome",
557
+ "doormat",
558
+ "drilling rig",
559
+ "drum",
560
+ "drumstick",
561
+ "dumbbell",
562
+ "Dutch oven",
563
+ "electric fan",
564
+ "electric guitar",
565
+ "electric locomotive",
566
+ "entertainment center",
567
+ "envelope",
568
+ "espresso machine",
569
+ "face powder",
570
+ "feather boa",
571
+ "filing cabinet",
572
+ "fireboat",
573
+ "fire truck",
574
+ "fire screen",
575
+ "flagpole",
576
+ "flute",
577
+ "folding chair",
578
+ "football helmet",
579
+ "forklift",
580
+ "fountain",
581
+ "fountain pen",
582
+ "four-poster bed",
583
+ "freight car",
584
+ "French horn",
585
+ "frying pan",
586
+ "fur coat",
587
+ "garbage truck",
588
+ "gas mask or respirator",
589
+ "gas pump",
590
+ "goblet",
591
+ "go-kart",
592
+ "golf ball",
593
+ "golf cart",
594
+ "gondola",
595
+ "gong",
596
+ "gown",
597
+ "grand piano",
598
+ "greenhouse",
599
+ "radiator grille",
600
+ "grocery store",
601
+ "guillotine",
602
+ "hair clip",
603
+ "hair spray",
604
+ "half-track",
605
+ "hammer",
606
+ "hamper",
607
+ "hair dryer",
608
+ "hand-held computer",
609
+ "handkerchief",
610
+ "hard disk drive",
611
+ "harmonica",
612
+ "harp",
613
+ "combine harvester",
614
+ "hatchet",
615
+ "holster",
616
+ "home theater",
617
+ "honeycomb",
618
+ "hook",
619
+ "hoop skirt",
620
+ "gymnastic horizontal bar",
621
+ "horse-drawn vehicle",
622
+ "hourglass",
623
+ "iPod",
624
+ "clothes iron",
625
+ "carved pumpkin",
626
+ "jeans",
627
+ "jeep",
628
+ "T-shirt",
629
+ "jigsaw puzzle",
630
+ "rickshaw",
631
+ "joystick",
632
+ "kimono",
633
+ "knee pad",
634
+ "knot",
635
+ "lab coat",
636
+ "ladle",
637
+ "lampshade",
638
+ "laptop computer",
639
+ "lawn mower",
640
+ "lens cap",
641
+ "letter opener",
642
+ "library",
643
+ "lifeboat",
644
+ "lighter",
645
+ "limousine",
646
+ "ocean liner",
647
+ "lipstick",
648
+ "slip-on shoe",
649
+ "lotion",
650
+ "music speaker",
651
+ "loupe magnifying glass",
652
+ "sawmill",
653
+ "magnetic compass",
654
+ "messenger bag",
655
+ "mailbox",
656
+ "tights",
657
+ "one-piece bathing suit",
658
+ "manhole cover",
659
+ "maraca",
660
+ "marimba",
661
+ "mask",
662
+ "matchstick",
663
+ "maypole",
664
+ "maze",
665
+ "measuring cup",
666
+ "medicine cabinet",
667
+ "megalith",
668
+ "microphone",
669
+ "microwave oven",
670
+ "military uniform",
671
+ "milk can",
672
+ "minibus",
673
+ "miniskirt",
674
+ "minivan",
675
+ "missile",
676
+ "mitten",
677
+ "mixing bowl",
678
+ "mobile home",
679
+ "ford model t",
680
+ "modem",
681
+ "monastery",
682
+ "monitor",
683
+ "moped",
684
+ "mortar and pestle",
685
+ "graduation cap",
686
+ "mosque",
687
+ "mosquito net",
688
+ "vespa",
689
+ "mountain bike",
690
+ "tent",
691
+ "computer mouse",
692
+ "mousetrap",
693
+ "moving van",
694
+ "muzzle",
695
+ "metal nail",
696
+ "neck brace",
697
+ "necklace",
698
+ "baby pacifier",
699
+ "notebook computer",
700
+ "obelisk",
701
+ "oboe",
702
+ "ocarina",
703
+ "odometer",
704
+ "oil filter",
705
+ "pipe organ",
706
+ "oscilloscope",
707
+ "overskirt",
708
+ "bullock cart",
709
+ "oxygen mask",
710
+ "product packet / packaging",
711
+ "paddle",
712
+ "paddle wheel",
713
+ "padlock",
714
+ "paintbrush",
715
+ "pajamas",
716
+ "palace",
717
+ "pan flute",
718
+ "paper towel",
719
+ "parachute",
720
+ "parallel bars",
721
+ "park bench",
722
+ "parking meter",
723
+ "railroad car",
724
+ "patio",
725
+ "payphone",
726
+ "pedestal",
727
+ "pencil case",
728
+ "pencil sharpener",
729
+ "perfume",
730
+ "Petri dish",
731
+ "photocopier",
732
+ "plectrum",
733
+ "Pickelhaube",
734
+ "picket fence",
735
+ "pickup truck",
736
+ "pier",
737
+ "piggy bank",
738
+ "pill bottle",
739
+ "pillow",
740
+ "ping-pong ball",
741
+ "pinwheel",
742
+ "pirate ship",
743
+ "drink pitcher",
744
+ "block plane",
745
+ "planetarium",
746
+ "plastic bag",
747
+ "plate rack",
748
+ "farm plow",
749
+ "plunger",
750
+ "Polaroid camera",
751
+ "pole",
752
+ "police van",
753
+ "poncho",
754
+ "pool table",
755
+ "soda bottle",
756
+ "plant pot",
757
+ "potter's wheel",
758
+ "power drill",
759
+ "prayer rug",
760
+ "printer",
761
+ "prison",
762
+ "missile",
763
+ "projector",
764
+ "hockey puck",
765
+ "punching bag",
766
+ "purse",
767
+ "quill",
768
+ "quilt",
769
+ "race car",
770
+ "racket",
771
+ "radiator",
772
+ "radio",
773
+ "radio telescope",
774
+ "rain barrel",
775
+ "recreational vehicle",
776
+ "fishing casting reel",
777
+ "reflex camera",
778
+ "refrigerator",
779
+ "remote control",
780
+ "restaurant",
781
+ "revolver",
782
+ "rifle",
783
+ "rocking chair",
784
+ "rotisserie",
785
+ "eraser",
786
+ "rugby ball",
787
+ "ruler measuring stick",
788
+ "sneaker",
789
+ "safe",
790
+ "safety pin",
791
+ "salt shaker",
792
+ "sandal",
793
+ "sarong",
794
+ "saxophone",
795
+ "scabbard",
796
+ "weighing scale",
797
+ "school bus",
798
+ "schooner",
799
+ "scoreboard",
800
+ "CRT monitor",
801
+ "screw",
802
+ "screwdriver",
803
+ "seat belt",
804
+ "sewing machine",
805
+ "shield",
806
+ "shoe store",
807
+ "shoji screen / room divider",
808
+ "shopping basket",
809
+ "shopping cart",
810
+ "shovel",
811
+ "shower cap",
812
+ "shower curtain",
813
+ "ski",
814
+ "balaclava ski mask",
815
+ "sleeping bag",
816
+ "slide rule",
817
+ "sliding door",
818
+ "slot machine",
819
+ "snorkel",
820
+ "snowmobile",
821
+ "snowplow",
822
+ "soap dispenser",
823
+ "soccer ball",
824
+ "sock",
825
+ "solar thermal collector",
826
+ "sombrero",
827
+ "soup bowl",
828
+ "keyboard space bar",
829
+ "space heater",
830
+ "space shuttle",
831
+ "spatula",
832
+ "motorboat",
833
+ "spider web",
834
+ "spindle",
835
+ "sports car",
836
+ "spotlight",
837
+ "stage",
838
+ "steam locomotive",
839
+ "through arch bridge",
840
+ "steel drum",
841
+ "stethoscope",
842
+ "scarf",
843
+ "stone wall",
844
+ "stopwatch",
845
+ "stove",
846
+ "strainer",
847
+ "tram",
848
+ "stretcher",
849
+ "couch",
850
+ "stupa",
851
+ "submarine",
852
+ "suit",
853
+ "sundial",
854
+ "sunglasses",
855
+ "sunglasses",
856
+ "sunscreen",
857
+ "suspension bridge",
858
+ "mop",
859
+ "sweatshirt",
860
+ "swim trunks / shorts",
861
+ "swing",
862
+ "electrical switch",
863
+ "syringe",
864
+ "table lamp",
865
+ "tank",
866
+ "tape player",
867
+ "teapot",
868
+ "teddy bear",
869
+ "television",
870
+ "tennis ball",
871
+ "thatched roof",
872
+ "front curtain",
873
+ "thimble",
874
+ "threshing machine",
875
+ "throne",
876
+ "tile roof",
877
+ "toaster",
878
+ "tobacco shop",
879
+ "toilet seat",
880
+ "torch",
881
+ "totem pole",
882
+ "tow truck",
883
+ "toy store",
884
+ "tractor",
885
+ "semi-trailer truck",
886
+ "tray",
887
+ "trench coat",
888
+ "tricycle",
889
+ "trimaran",
890
+ "tripod",
891
+ "triumphal arch",
892
+ "trolleybus",
893
+ "trombone",
894
+ "hot tub",
895
+ "turnstile",
896
+ "typewriter keyboard",
897
+ "umbrella",
898
+ "unicycle",
899
+ "upright piano",
900
+ "vacuum cleaner",
901
+ "vase",
902
+ "vaulted or arched ceiling",
903
+ "velvet fabric",
904
+ "vending machine",
905
+ "vestment",
906
+ "viaduct",
907
+ "violin",
908
+ "volleyball",
909
+ "waffle iron",
910
+ "wall clock",
911
+ "wallet",
912
+ "wardrobe",
913
+ "military aircraft",
914
+ "sink",
915
+ "washing machine",
916
+ "water bottle",
917
+ "water jug",
918
+ "water tower",
919
+ "whiskey jug",
920
+ "whistle",
921
+ "hair wig",
922
+ "window screen",
923
+ "window shade",
924
+ "Windsor tie",
925
+ "wine bottle",
926
+ "airplane wing",
927
+ "wok",
928
+ "wooden spoon",
929
+ "wool",
930
+ "split-rail fence",
931
+ "shipwreck",
932
+ "sailboat",
933
+ "yurt",
934
+ "website",
935
+ "comic book",
936
+ "crossword",
937
+ "traffic or street sign",
938
+ "traffic light",
939
+ "dust jacket",
940
+ "menu",
941
+ "plate",
942
+ "guacamole",
943
+ "consomme",
944
+ "hot pot",
945
+ "trifle",
946
+ "ice cream",
947
+ "popsicle",
948
+ "baguette",
949
+ "bagel",
950
+ "pretzel",
951
+ "cheeseburger",
952
+ "hot dog",
953
+ "mashed potatoes",
954
+ "cabbage",
955
+ "broccoli",
956
+ "cauliflower",
957
+ "zucchini",
958
+ "spaghetti squash",
959
+ "acorn squash",
960
+ "butternut squash",
961
+ "cucumber",
962
+ "artichoke",
963
+ "bell pepper",
964
+ "cardoon",
965
+ "mushroom",
966
+ "Granny Smith apple",
967
+ "strawberry",
968
+ "orange",
969
+ "lemon",
970
+ "fig",
971
+ "pineapple",
972
+ "banana",
973
+ "jackfruit",
974
+ "cherimoya (custard apple)",
975
+ "pomegranate",
976
+ "hay",
977
+ "carbonara",
978
+ "chocolate syrup",
979
+ "dough",
980
+ "meatloaf",
981
+ "pizza",
982
+ "pot pie",
983
+ "burrito",
984
+ "red wine",
985
+ "espresso",
986
+ "tea cup",
987
+ "eggnog",
988
+ "mountain",
989
+ "bubble",
990
+ "cliff",
991
+ "coral reef",
992
+ "geyser",
993
+ "lakeshore",
994
+ "promontory",
995
+ "sandbar",
996
+ "beach",
997
+ "valley",
998
+ "volcano",
999
+ "baseball player",
1000
+ "bridegroom",
1001
+ "scuba diver",
1002
+ "rapeseed",
1003
+ "daisy",
1004
+ "yellow lady's slipper",
1005
+ "corn",
1006
+ "acorn",
1007
+ "rose hip",
1008
+ "horse chestnut seed",
1009
+ "coral fungus",
1010
+ "agaric",
1011
+ "gyromitra",
1012
+ "stinkhorn mushroom",
1013
+ "earth star fungus",
1014
+ "hen of the woods mushroom",
1015
+ "bolete",
1016
+ "corn cob",
1017
+ "toilet paper",
1018
+ ]
1019
+
1020
+ # language_names = json.load(open("data/language_mapping.json", encoding="utf-8"))
1021
+ # main_language_values = sorted(
1022
+ # [[name, code] for code, name in language_names.items()], key=lambda x: x[0]
1023
+ # )
1024
  # [[main_language_names[lang], lang] for lang in main_languages+sorted(l for l in main_language_names if l not in main_languages)]
1025
 
1026
  babel_imagenet = json.load(open("data/babel_imagenet-298.json", encoding="utf-8"))
1027
  babelnet_images = json.load(open("data/images.json", encoding="utf-8"))
1028
+ max_image_choices = 10 # Currently up to 30 images but relevance degrades quickly in my experience. Limiting to 10
1029
  no_image_idxs = [i for i, imgs in enumerate(babelnet_images) if len(imgs) == 0]
1030
  IMG_HEIGHT, IMG_WIDTH = 512, 512
1031
 
 
1033
  if os.path.exists("data/precomputed_results.json"):
1034
  precomputed_results = json.load(open("data/precomputed_results.json"))
1035
 
1036
+ request_header = {
1037
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
1038
+ }
1039
  ### Loading model; hard-coded to mSigLIP for now.
1040
 
1041
  if not precomputed_results:
1042
  open_clip_model, open_clip_pretrained = "ViT-B-16-SigLIP-i18n-256", "webli"
1043
+ model, _, transform = open_clip.create_model_and_transforms(
1044
+ open_clip_model, pretrained=open_clip_pretrained
1045
+ )
1046
  tokenizer = open_clip.get_tokenizer(open_clip_model)
1047
 
1048
  device = "cuda" if torch.cuda.is_available() else "cpu"
1049
  model = model.to(device)
1050
 
1051
+
1052
  def change_language(lang, randomize_imgs, randomize_labels):
1053
  # compute text embeddings
1054
  labels = babel_imagenet[lang][1]
 
1064
  text_features = text_features.cpu().numpy()
1065
  else:
1066
  text_features = None
1067
+ correct_text = gr.Text(
1068
+ f"Correct was: ''. Question 1/{len(babel_imagenet[lang][0])} ", label="Game"
1069
+ )
1070
  player_score_text = gr.Text(f"Your choice: (Score: 0) ", label="Player")
1071
  clip_score_text = gr.Text(f"mSigLIP chose: '' (Score: 0)", label="Opponent")
1072
 
1073
+ return (
1074
+ text_features,
1075
+ -1,
1076
+ class_order,
1077
+ correct_text,
1078
+ player_score_text,
1079
+ clip_score_text,
1080
+ 0,
1081
+ 0,
1082
+ )
1083
+
1084
 
1085
  def select(idx, lang, choice, correct, model_choice, player_score, clip_score, choices):
1086
  # checks whether the answer choice is correct and updates the scores
 
1094
  player_score = player_score + int(player_correct)
1095
  clip_score = clip_score + int(model_correct)
1096
 
1097
+ correct_text = gr.Text(
1098
+ f"Correct was: '{correct_name}'. Question {idx+1}/{len(babel_imagenet[lang][0])} ",
1099
+ label="Game",
1100
+ )
1101
+ player_score_text = gr.Text(
1102
+ f"Your choice: {player_choice} {'✅' if player_correct else '❌'} (Score: {player_score}) ",
1103
+ label="Player",
1104
+ )
1105
+ clip_score_text = gr.Text(
1106
+ f"mSigLIP chose: '{model_choice_name}' {'✅' if model_correct else '❌'} (Score: {clip_score})",
1107
+ label="Opponent",
1108
+ )
1109
 
1110
  return correct_text, player_score_text, clip_score_text, player_score, clip_score
1111
 
1112
+
1113
  def prepare(raw_idx, lang, text_embeddings, class_order, randomize_images):
1114
  # prepares the next question, loads the image, and computes the choices
1115
 
1116
+ raw_idx = (raw_idx + 1) % len(babel_imagenet[lang][0])
1117
  idx = class_order[raw_idx]
1118
  lang_class_idxs = babel_imagenet[lang][0]
1119
  class_idx = lang_class_idxs[idx]
 
1127
 
1128
  img_idx = 0
1129
  if randomize_images:
1130
+ img_idx = np.random.choice(
1131
+ min(len(babelnet_images[class_idx]), max_image_choices)
1132
+ )
1133
  img_url = babelnet_images[class_idx][img_idx]["url"]
1134
  class_labels = babel_imagenet[lang][1] if lang != "EN" else openai_en_classes
1135
 
1136
  if not precomputed_results:
1137
  try:
1138
+ image_input = (
1139
+ transform(
1140
+ Image.open(
1141
+ requests.get(img_url, stream=True, headers=request_header).raw
1142
+ ).convert("RGB")
1143
+ )
1144
+ .unsqueeze(0)
1145
+ .to(device)
1146
+ )
1147
  with torch.no_grad():
1148
  image_features = model.encode_image(image_input).float()
1149
  image_features /= image_features.norm(dim=-1, keepdim=True)
1150
  except:
1151
  gr.Warning("There is a problem with the next class. Skipping it.")
1152
+ return prepare(
1153
+ raw_idx, lang, text_embeddings, class_order, randomize_images
1154
+ )
1155
 
1156
  similarity = (text_embeddings @ image_features.cpu().numpy().T).squeeze()
1157
+ choices = np.argsort(similarity)[-4:].tolist()
1158
  else:
1159
+ choices = list(
1160
+ reversed(precomputed_results[lang][idx][img_idx])
1161
+ ) # precomputing script uses torch.topk which sorts in reverse here
1162
  if idx not in choices:
1163
  choices = [idx] + choices[1:]
1164
  model_choice_idx = choices[-1]
 
1166
  numpy.random.shuffle(choices)
1167
 
1168
  choice_names = [class_labels[idx] for idx in choices]
1169
+ choice_values = [0, 1, 2, 3]
1170
 
1171
  model_choice_idx = choices.index(model_choice_idx)
1172
  model_choice = [choice_names[model_choice_idx], choice_values[model_choice_idx]]
1173
  correct_choice_idx = choices.index(idx)
1174
+ correct_choice = [
1175
+ choice_names[correct_choice_idx],
1176
+ choice_values[correct_choice_idx],
1177
+ ]
1178
 
1179
  choice_values = list(zip(choice_names, choice_values))
1180
 
1181
+ next_radio = gr.Radio(
1182
+ choices=choice_values,
1183
+ interactive=True,
1184
+ label="Select the correct answer:",
1185
+ value=None,
1186
+ )
1187
+ next_image = gr.Image(
1188
+ value=img_url,
1189
+ width=IMG_WIDTH,
1190
+ height=IMG_WIDTH,
1191
+ label="What class does this image belong to?",
1192
+ )
1193
 
1194
  return next_radio, next_image, raw_idx, correct_choice, model_choice, choice_values
1195
 
 
1203
 
1204
  img_idx = 0
1205
  if randomize_images:
1206
+ img_idx = np.random.choice(
1207
+ min(len(babelnet_images[class_idx]), max_image_choices)
1208
+ )
1209
  img_url = babelnet_images[class_idx][img_idx]["url"]
1210
  class_labels = babel_imagenet[lang][1] if lang != "EN" else openai_en_classes
1211
 
1212
  if not precomputed_results:
1213
  try:
1214
+ image_input = (
1215
+ transform(
1216
+ Image.open(
1217
+ requests.get(img_url, stream=True, headers=request_header).raw
1218
+ ).convert("RGB")
1219
+ )
1220
+ .unsqueeze(0)
1221
+ .to(device)
1222
+ )
1223
  with torch.no_grad():
1224
  image_features = model.encode_image(image_input).float()
1225
  image_features /= image_features.norm(dim=-1, keepdim=True)
1226
  except:
1227
  gr.Warning("There is a problem with the next class. Skipping it.")
1228
+ return prepare(
1229
+ raw_idx, lang, text_embeddings, class_order, randomize_images
1230
+ )
1231
 
1232
  similarity = (text_embeddings @ image_features.cpu().numpy().T).squeeze()
1233
+ choices = np.argsort(similarity)[-4:].tolist()
1234
  else:
1235
+ choices = list(
1236
+ reversed(precomputed_results[lang][idx][img_idx])
1237
+ ) # precomputing script uses torch.topk which sorts in reverse here
1238
  if idx not in choices:
1239
  choices = [idx] + choices[1:]
1240
  model_choice_idx = choices[-1]
 
1242
  numpy.random.shuffle(choices)
1243
 
1244
  choice_names = [class_labels[idx] for idx in choices]
1245
+ choice_values = [0, 1, 2, 3]
1246
 
1247
  model_choice_idx = choices.index(model_choice_idx)
1248
  model_choice = [choice_names[model_choice_idx], choice_values[model_choice_idx]]
1249
  correct_choice_idx = choices.index(idx)
1250
+ correct_choice = [
1251
+ choice_names[correct_choice_idx],
1252
+ choice_values[correct_choice_idx],
1253
+ ]
1254
 
1255
  choice_values = list(zip(choice_names, choice_values))
1256
 
1257
+ next_radio = gr.Radio(
1258
+ choices=choice_values,
1259
+ interactive=True,
1260
+ label="Select the correct answer:",
1261
+ value=None,
1262
+ )
1263
+ next_image = gr.Image(
1264
+ value=img_url,
1265
+ width=IMG_WIDTH,
1266
+ height=IMG_WIDTH,
1267
+ label="What class does this image belong to?",
1268
+ )
1269
 
1270
  return next_radio, next_image, raw_idx, correct_choice, model_choice, choice_values
1271
 
1272
 
1273
+ with gr.Blocks(title="Babel-ImageNet Quiz") as demo:
1274
 
1275
  # setup state
1276
  class_idx = gr.State(-1)
 
1280
  choices = gr.State([])
1281
 
1282
  text_embeddings = gr.State(None)
1283
+ correct_choice = gr.State(["nan", 0]) # 0, 1, 2, 3
1284
  model_choice = gr.State(["nan", 0])
1285
 
1286
  # Title Area
1287
+ gr.Markdown(
1288
+ """
1289
  # Are you smarter🤓 than CLIP🤖? Take the [ Babel-ImageNet ](https://arxiv.org/abs/2306.08658) Quiz!
1290
 
1291
  <small>by Gregor Geigle, WüNLP & Computer Vision Lab, University of Würzburg</small>
 
1303
  <p><b>'Who is my opponent?'</b> Your opponent CLIP model is [mSigLIP](https://huggingface.co/timm/ViT-B-16-SigLIP-i18n-256), a powerful but small multilingual model with only 370M parameters.</p>
1304
  <p><b>'My game crashed/ I got an error!'</b> This usually happens because of problems with the image URLs. You can try the button to reroll the image or start a new round by clicking the 'Start' button again.</p>
1305
  </details>
1306
+ """
1307
+ )
1308
  with gr.Row():
1309
  with gr.Column(scale=1):
1310
+ gr.Markdown(
1311
+ """
1312
  <details>
1313
  <summary> <b>What is CLIP? </b> (click me to read)</summary>
1314
  <p>
 
1319
  Your opponent CLIP model [mSigLIP](https://arxiv.org/abs/2303.15343) in this quiz does 'zero-shot image classification': We encode all possible class labels and the image and we check which class is most similar; this is then the class chosen by CLIP.
1320
  </p>
1321
  </details>
1322
+ """
1323
+ )
1324
  with gr.Column(scale=1):
1325
+ gr.Markdown(
1326
+ """
1327
  <details>
1328
  <summary> <b>What is ImageNet? </b> (click me to read)</summary>
1329
  <p>
 
1331
  It is a very popular dataset used to benchmark CLIP models because strong results here usually indicate that the image model is overall useful for many tasks.
1332
  </p>
1333
  </details>
1334
+ """
1335
+ )
1336
  with gr.Column(scale=1):
1337
+ gr.Markdown(
1338
+ """
1339
  <details>
1340
  <summary> <b>What is Babel-ImageNet? </b> (click me to read)</summary>
1341
  <p>
 
1348
  For more details, please read our <a href='https://arxiv.org/abs/2306.08658'>paper.</a>
1349
  </p>
1350
  </details>
1351
+ """
1352
+ )
1353
  # language select dropdown
1354
  with gr.Row():
1355
+ # language_select = gr.Dropdown(
1356
+ # choices=main_language_values,
1357
+ # value="EN",
1358
+ # interactive=True,
1359
+ # label="Select your language:",
1360
+ # )
1361
+ randomize_classes = gr.Checkbox(
1362
+ label="Randomize class order (or play in canonic order)", value=True
1363
+ )
1364
+ randomize_images = gr.Checkbox(
1365
+ label="Randomize images (if unchecked, will always show the same image). Other images might be less relevant.",
1366
+ value=True,
1367
+ )
1368
  start_btn = gr.Button(value="Start", variant="primary")
1369
 
1370
  # quiz area
1371
  with gr.Row():
1372
  with gr.Column(scale=1):
1373
+ image = gr.Image(
1374
+ value="data/bin_image.png", width=IMG_WIDTH, height=IMG_WIDTH
1375
+ )
1376
 
1377
  with gr.Column(scale=1):
1378
+ options = gr.Radio(
1379
+ choices=["Click", "start", "to", "begin"],
1380
+ interactive=False,
1381
+ label="Please click start to begin.",
1382
+ )
1383
  # with gr.Row():
1384
  correct_text = gr.Text("Please click start to begin.")
1385
  player_score_text = gr.Text(f"Player score: 0")
1386
  clip_score_text = gr.Text(f"mSigLIP score: 0")
1387
  reroll_btn = gr.Button(value="Reroll the image (for bad images or errors)")
1388
 
1389
+ options.select(
1390
+ fn=select,
1391
+ inputs=[
1392
+ class_idx,
1393
+ "EN",
1394
+ options,
1395
+ correct_choice,
1396
+ model_choice,
1397
+ player_score,
1398
+ clip_score,
1399
+ choices,
1400
+ ],
1401
+ outputs=[
1402
+ correct_text,
1403
+ player_score_text,
1404
+ clip_score_text,
1405
+ player_score,
1406
+ clip_score,
1407
+ ],
1408
+ ).then(
1409
+ fn=prepare,
1410
+ inputs=[
1411
+ class_idx,
1412
+ "EN",
1413
+ text_embeddings,
1414
+ class_order,
1415
+ randomize_images,
1416
+ ],
1417
+ outputs=[options, image, class_idx, correct_choice, model_choice, choices],
1418
+ )
1419
 
1420
+ start_btn.click(
1421
+ fn=change_language,
1422
+ inputs=[randomize_images, randomize_classes],
1423
+ outputs=[
1424
+ text_embeddings,
1425
+ class_idx,
1426
+ class_order,
1427
+ correct_text,
1428
+ player_score_text,
1429
+ clip_score_text,
1430
+ player_score,
1431
+ clip_score,
1432
+ ],
1433
+ ).then(
1434
+ fn=prepare,
1435
+ inputs=[
1436
+ class_idx,
1437
+ "EN",
1438
+ text_embeddings,
1439
+ class_order,
1440
+ randomize_images,
1441
+ ],
1442
+ outputs=[options, image, class_idx, correct_choice, model_choice, choices],
1443
+ )
1444
 
1445
+ reroll_btn.click(
1446
+ fn=reroll,
1447
+ inputs=[
1448
+ class_idx,
1449
+ "EN",
1450
+ text_embeddings,
1451
+ class_order,
1452
+ randomize_images,
1453
+ ],
1454
+ outputs=[options, image, class_idx, correct_choice, model_choice, choices],
1455
+ )
 
 
 
 
 
 
 
1456
 
1457
  # initialization
1458
  # demo.load(fn=change_language,
 
1463
  # outputs=[options, image, class_idx, correct_choice, model_choice])
1464
 
1465
 
1466
+ demo.launch()