{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "5", "1": "cloudy", "2": "park", "3": "cross", "4": "blue and white", "5": "2013", "6": "snowboarding", "7": "outside", "8": "camera", "9": "giraffe", "10": "station", "11": "watching", "12": "out", "13": "shade", "14": "gray", "15": "dirt", "16": "stripes", "17": "8:35", "18": "not sure", "19": "9:35", "20": "unknown", "21": "no", "22": "doughnut", "23": "orange", "24": "on road", "25": "plain", "26": "curtains", "27": "man", "28": "girl", "29": "shadows", "30": "2010", "31": "bike rack", "32": "white and black", "33": "windows", "34": "purple", "35": "wedding", "36": "talking", "37": "snowboard", "38": "natural", "39": "wine tasting", "40": "snowboarder", "41": "king", "42": "tired", "43": "crossing", "44": "picnic table", "45": "bedroom", "46": "wine", "47": "pink", "48": "fence", "49": "brown", "50": "resting", "51": "bikes", "52": "bicycle", "53": "1", "54": "air", "55": "lg", "56": "backpack", "57": "skateboarding", "58": "chopsticks", "59": "little girl", "60": "train", "61": "sun", "62": "person", "63": "woman", "64": "screen", "65": "tent", "66": "hawaii", "67": "shelter", "68": "6", "69": "shrimp", "70": "skateboard", "71": "clock tower", "72": "car", "73": "giraffes", "74": "lady", "75": "many", "76": "2000", "77": "exit", "78": "solid", "79": "skier", "80": "full", "81": "tower", "82": "africa", "83": "yellow", "84": "soccer ball", "85": "leather", "86": "clear", "87": "blue", "88": "2", "89": "double", "90": "skiing", "91": "gray and black", "92": "dog", "93": "3", "94": "brick", "95": "10", "96": "lying down", "97": "red and blue", "98": "roof", "99": "beige", "100": "women", "101": "ice cream", "102": "green", "103": "queen", "104": "stand", "105": "walking", "106": "suv", "107": "bicycles", "108": "birthday", "109": "small", "110": "french", "111": "door", "112": "security", "113": "trees", "114": "7:45", "115": "zoo", "116": "cat", "117": "blonde", "118": "not there", "119": "protection", "120": "at table", "121": "happy", "122": "smile", "123": "hat", "124": "yes", "125": "8", "126": "sleeping", "127": "church", "128": "woods", "129": "0", "130": "cup", "131": "table", "132": "cage", "133": "ball", "134": "wall", "135": "street", "136": "low", "137": "on street", "138": "jeep", "139": "smiling", "140": "sidewalk", "141": "plastic", "142": "nothing", "143": "neon", "144": "tabby", "145": "rack", "146": "hair", "147": "donut", "148": "7:35", "149": "black and white", "150": "name tag", "151": "7", "152": "style", "153": "photographer", "154": "big ben", "155": "plate", "156": "monitor", "157": "bus", "158": "tv", "159": "shadow", "160": "platform", "161": "chair", "162": "red", "163": "calico", "164": "don't know", "165": "black", "166": "ground", "167": "down", "168": "arrow", "169": "laying down", "170": "in car", "171": "canopy", "172": "beagle", "173": "right", "174": "window", "175": "talking on phone", "176": "net", "177": "they aren't", "178": "white", "179": "sky", "180": "4", "181": "fashion", "182": "necklace", "183": "crown", "184": "soccer", "185": "white and blue", "186": "snow", "187": "large", "188": "boy", "189": "tan", "190": "human", "191": "curtain", "192": "forest", "193": "clock", "194": "bricks", "195": "lanyard", "196": "can't tell", "197": "desert", "198": "red and yellow" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 129, "1": 53, "10": 95, "2": 88, "2000": 76, "2010": 30, "2013": 5, "3": 93, "4": 180, "5": 0, "6": 68, "7": 151, "7:35": 148, "7:45": 114, "8": 125, "8:35": 17, "9:35": 19, "africa": 82, "air": 54, "arrow": 168, "at table": 120, "backpack": 56, "ball": 133, "beagle": 172, "bedroom": 45, "beige": 99, "bicycle": 52, "bicycles": 107, "big ben": 154, "bike rack": 31, "bikes": 51, "birthday": 108, "black": 165, "black and white": 149, "blonde": 117, "blue": 87, "blue and white": 4, "boy": 188, "brick": 94, "bricks": 194, "brown": 49, "bus": 157, "cage": 132, "calico": 163, "camera": 8, "can't tell": 196, "canopy": 171, "car": 72, "cat": 116, "chair": 161, "chopsticks": 58, "church": 127, "clear": 86, "clock": 193, "clock tower": 71, "cloudy": 1, "cross": 3, "crossing": 43, "crown": 183, "cup": 130, "curtain": 191, "curtains": 26, "desert": 197, "dirt": 15, "dog": 92, "don't know": 164, "donut": 147, "door": 111, "double": 89, "doughnut": 22, "down": 167, "exit": 77, "fashion": 181, "fence": 48, "forest": 192, "french": 110, "full": 80, "giraffe": 9, "giraffes": 73, "girl": 28, "gray": 14, "gray and black": 91, "green": 102, "ground": 166, "hair": 146, "happy": 121, "hat": 123, "hawaii": 66, "human": 190, "ice cream": 101, "in car": 170, "jeep": 138, "king": 41, "lady": 74, "lanyard": 195, "large": 187, "laying down": 169, "leather": 85, "lg": 55, "little girl": 59, "low": 136, "lying down": 96, "man": 27, "many": 75, "monitor": 156, "name tag": 150, "natural": 38, "necklace": 182, "neon": 143, "net": 176, "no": 21, "not sure": 18, "not there": 118, "nothing": 142, "on road": 24, "on street": 137, "orange": 23, "out": 12, "outside": 7, "park": 2, "person": 62, "photographer": 153, "picnic table": 44, "pink": 47, "plain": 25, "plastic": 141, "plate": 155, "platform": 160, "protection": 119, "purple": 34, "queen": 103, "rack": 145, "red": 162, "red and blue": 97, "red and yellow": 198, "resting": 50, "right": 173, "roof": 98, "screen": 64, "security": 112, "shade": 13, "shadow": 159, "shadows": 29, "shelter": 67, "shrimp": 69, "sidewalk": 140, "skateboard": 70, "skateboarding": 57, "skier": 79, "skiing": 90, "sky": 179, "sleeping": 126, "small": 109, "smile": 122, "smiling": 139, "snow": 186, "snowboard": 37, "snowboarder": 40, "snowboarding": 6, "soccer": 184, "soccer ball": 84, "solid": 78, "stand": 104, "station": 10, "street": 135, "stripes": 16, "style": 152, "sun": 61, "suv": 106, "tabby": 144, "table": 131, "talking": 36, "talking on phone": 175, "tan": 189, "tent": 65, "they aren't": 177, "tired": 42, "tower": 81, "train": 60, "trees": 113, "tv": 158, "unknown": 20, "walking": 105, "wall": 134, "watching": 11, "wedding": 35, "white": 178, "white and black": 32, "white and blue": 185, "window": 174, "windows": 33, "wine": 46, "wine tasting": 39, "woman": 63, "women": 100, "woods": 128, "yellow": 83, "yes": 124, "zoo": 115 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.33.1", "type_vocab_size": 2, "vocab_size": 30522 }