{ "_name_or_path": "Salesforce/blip-vqa-base", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "birthday", "1": "walking", "2": "talking on phone", "3": "woman", "4": "white and blue", "5": "net", "6": "window", "7": "women", "8": "out", "9": "blue and white", "10": "wine", "11": "small", "12": "dirt", "13": "king", "14": "girl", "15": "9:35", "16": "snowboarding", "17": "full", "18": "shrimp", "19": "rack", "20": "red", "21": "red and yellow", "22": "not there", "23": "necklace", "24": "unknown", "25": "crown", "26": "clear", "27": "bikes", "28": "right", "29": "ground", "30": "4", "31": "skier", "32": "skiing", "33": "7:35", "34": "down", "35": "hat", "36": "skateboard", "37": "tired", "38": "screen", "39": "plain", "40": "name tag", "41": "8", "42": "snowboarder", "43": "cloudy", "44": "zoo", "45": "boy", "46": "style", "47": "0", "48": "wine tasting", "49": "7", "50": "ice cream", "51": "smiling", "52": "blue", "53": "watching", "54": "neon", "55": "beige", "56": "can't tell", "57": "bicycle", "58": "doughnut", "59": "calico", "60": "giraffes", "61": "10", "62": "jeep", "63": "6", "64": "cage", "65": "outside", "66": "tower", "67": "gray", "68": "happy", "69": "chair", "70": "5", "71": "sky", "72": "station", "73": "lanyard", "74": "cat", "75": "solid", "76": "suv", "77": "picnic table", "78": "little girl", "79": "lying down", "80": "plastic", "81": "snow", "82": "queen", "83": "hair", "84": "big ben", "85": "windows", "86": "human", "87": "smile", "88": "park", "89": "in car", "90": "canopy", "91": "on street", "92": "curtain", "93": "green", "94": "low", "95": "white", "96": "person", "97": "orange", "98": "bike rack", "99": "french", "100": "lg", "101": "resting", "102": "backpack", "103": "bus", "104": "roof", "105": "church", "106": "fence", "107": "cup", "108": "cross", "109": "wall", "110": "forest", "111": "ball", "112": "snowboard", "113": "chopsticks", "114": "arrow", "115": "plate", "116": "bricks", "117": "clock", "118": "wedding", "119": "3", "120": "black and white", "121": "don't know", "122": "soccer", "123": "lady", "124": "platform", "125": "train", "126": "brick", "127": "leather", "128": "pink", "129": "giraffe", "130": "camera", "131": "yellow", "132": "purple", "133": "hawaii", "134": "tan", "135": "man", "136": "no", "137": "crossing", "138": "blonde", "139": "gray and black", "140": "africa", "141": "door", "142": "at table", "143": "laying down", "144": "2", "145": "8:35", "146": "table", "147": "curtains", "148": "photographer", "149": "beagle", "150": "talking", "151": "2010", "152": "nothing", "153": "1", "154": "double", "155": "yes", "156": "bedroom", "157": "2013", "158": "street", "159": "sidewalk", "160": "fashion", "161": "tent", "162": "dog", "163": "shade", "164": "shadows", "165": "black", "166": "exit", "167": "red and blue", "168": "donut", "169": "car", "170": "7:45", "171": "many", "172": "soccer ball", "173": "tv", "174": "desert", "175": "large", "176": "monitor", "177": "brown", "178": "stripes", "179": "tabby", "180": "sun", "181": "not sure", "182": "protection", "183": "sleeping", "184": "clock tower", "185": "bicycles", "186": "they aren't", "187": "woods", "188": "2000", "189": "on road", "190": "stand", "191": "shadow", "192": "white and black", "193": "trees", "194": "security", "195": "air", "196": "natural", "197": "shelter", "198": "skateboarding" }, "image_size": 384, "image_text_hidden_size": 256, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 47, "1": 153, "10": 61, "2": 144, "2000": 188, "2010": 151, "2013": 157, "3": 119, "4": 30, "5": 70, "6": 63, "7": 49, "7:35": 33, "7:45": 170, "8": 41, "8:35": 145, "9:35": 15, "africa": 140, "air": 195, "arrow": 114, "at table": 142, "backpack": 102, "ball": 111, "beagle": 149, "bedroom": 156, "beige": 55, "bicycle": 57, "bicycles": 185, "big ben": 84, "bike rack": 98, "bikes": 27, "birthday": 0, "black": 165, "black and white": 120, "blonde": 138, "blue": 52, "blue and white": 9, "boy": 45, "brick": 126, "bricks": 116, "brown": 177, "bus": 103, "cage": 64, "calico": 59, "camera": 130, "can't tell": 56, "canopy": 90, "car": 169, "cat": 74, "chair": 69, "chopsticks": 113, "church": 105, "clear": 26, "clock": 117, "clock tower": 184, "cloudy": 43, "cross": 108, "crossing": 137, "crown": 25, "cup": 107, "curtain": 92, "curtains": 147, "desert": 174, "dirt": 12, "dog": 162, "don't know": 121, "donut": 168, "door": 141, "double": 154, "doughnut": 58, "down": 34, "exit": 166, "fashion": 160, "fence": 106, "forest": 110, "french": 99, "full": 17, "giraffe": 129, "giraffes": 60, "girl": 14, "gray": 67, "gray and black": 139, "green": 93, "ground": 29, "hair": 83, "happy": 68, "hat": 35, "hawaii": 133, "human": 86, "ice cream": 50, "in car": 89, "jeep": 62, "king": 13, "lady": 123, "lanyard": 73, "large": 175, "laying down": 143, "leather": 127, "lg": 100, "little girl": 78, "low": 94, "lying down": 79, "man": 135, "many": 171, "monitor": 176, "name tag": 40, "natural": 196, "necklace": 23, "neon": 54, "net": 5, "no": 136, "not sure": 181, "not there": 22, "nothing": 152, "on road": 189, "on street": 91, "orange": 97, "out": 8, "outside": 65, "park": 88, "person": 96, "photographer": 148, "picnic table": 77, "pink": 128, "plain": 39, "plastic": 80, "plate": 115, "platform": 124, "protection": 182, "purple": 132, "queen": 82, "rack": 19, "red": 20, "red and blue": 167, "red and yellow": 21, "resting": 101, "right": 28, "roof": 104, "screen": 38, "security": 194, "shade": 163, "shadow": 191, "shadows": 164, "shelter": 197, "shrimp": 18, "sidewalk": 159, "skateboard": 36, "skateboarding": 198, "skier": 31, "skiing": 32, "sky": 71, "sleeping": 183, "small": 11, "smile": 87, "smiling": 51, "snow": 81, "snowboard": 112, "snowboarder": 42, "snowboarding": 16, "soccer": 122, "soccer ball": 172, "solid": 75, "stand": 190, "station": 72, "street": 158, "stripes": 178, "style": 46, "sun": 180, "suv": 76, "tabby": 179, "table": 146, "talking": 150, "talking on phone": 2, "tan": 134, "tent": 161, "they aren't": 186, "tired": 37, "tower": 66, "train": 125, "trees": 193, "tv": 173, "unknown": 24, "walking": 1, "wall": 109, "watching": 53, "wedding": 118, "white": 95, "white and black": 192, "white and blue": 4, "window": 6, "windows": 85, "wine": 10, "wine tasting": 48, "woman": 3, "women": 7, "woods": 187, "yellow": 131, "yes": 155, "zoo": 44 }, "layer_norm_eps": 1e-12, "logit_scale_init_value": 2.6592, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "projection_dim": 512, "qkv_bias": true, "text_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_probs_dropout_prob": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": 30522, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": 2, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "is_decoder": true, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-12, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 512, "min_length": 0, "model_type": "blip_text_model", "no_repeat_ngram_size": 0, "num_attention_heads": 12, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 12, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": 0, "prefix": null, "problem_type": null, "projection_dim": 768, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": 102, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "transformers_version": "4.26.0.dev0", "typical_p": 1.0, "use_bfloat16": false, "use_cache": true, "vocab_size": 30524 }, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.42.4", "type_vocab_size": 2, "vision_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.0, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_size": 768, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 384, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-05, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "blip_vision_model", "no_repeat_ngram_size": 0, "num_attention_heads": 12, "num_beam_groups": 1, "num_beams": 1, "num_channels": 3, "num_hidden_layers": 12, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 16, "prefix": null, "problem_type": null, "projection_dim": 512, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "transformers_version": "4.26.0.dev0", "typical_p": 1.0, "use_bfloat16": false }, "vocab_size": 30522 }