JUNJIE99 commited on
Commit
e136115
·
verified ·
1 Parent(s): 860832d

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "logit_scale_init_value": 2.6592,
7
+ "model_type": "clip",
8
+ "projection_dim": 512,
9
+ "auto_map": {
10
+ "AutoModel": "modeling_MMRet_CLIP.CLIPModel"
11
+ },
12
+ "text_config": {
13
+ "_name_or_path": "",
14
+ "add_cross_attention": false,
15
+ "architectures": null,
16
+ "attention_dropout": 0.0,
17
+ "bad_words_ids": null,
18
+ "bos_token_id": 0,
19
+ "chunk_size_feed_forward": 0,
20
+ "decoder_start_token_id": null,
21
+ "diversity_penalty": 0.0,
22
+ "do_sample": false,
23
+ "dropout": 0.0,
24
+ "early_stopping": false,
25
+ "encoder_no_repeat_ngram_size": 0,
26
+ "eos_token_id": 2,
27
+ "finetuning_task": null,
28
+ "forced_bos_token_id": null,
29
+ "forced_eos_token_id": null,
30
+ "hidden_act": "quick_gelu",
31
+ "hidden_size": 512,
32
+ "id2label": {
33
+ "0": "LABEL_0",
34
+ "1": "LABEL_1"
35
+ },
36
+ "initializer_factor": 1.0,
37
+ "initializer_range": 0.02,
38
+ "intermediate_size": 2048,
39
+ "is_decoder": false,
40
+ "is_encoder_decoder": false,
41
+ "label2id": {
42
+ "LABEL_0": 0,
43
+ "LABEL_1": 1
44
+ },
45
+ "layer_norm_eps": 1e-05,
46
+ "length_penalty": 1.0,
47
+ "max_length": 20,
48
+ "max_position_embeddings": 77,
49
+ "min_length": 0,
50
+ "model_type": "clip_text_model",
51
+ "no_repeat_ngram_size": 0,
52
+ "num_attention_heads": 8,
53
+ "num_beam_groups": 1,
54
+ "num_beams": 1,
55
+ "num_hidden_layers": 12,
56
+ "num_return_sequences": 1,
57
+ "output_attentions": false,
58
+ "output_hidden_states": false,
59
+ "output_scores": false,
60
+ "pad_token_id": 1,
61
+ "prefix": null,
62
+ "problem_type": null,
63
+ "projection_dim" : 512,
64
+ "pruned_heads": {},
65
+ "remove_invalid_values": false,
66
+ "repetition_penalty": 1.0,
67
+ "return_dict": true,
68
+ "return_dict_in_generate": false,
69
+ "sep_token_id": null,
70
+ "task_specific_params": null,
71
+ "temperature": 1.0,
72
+ "tie_encoder_decoder": false,
73
+ "tie_word_embeddings": true,
74
+ "tokenizer_class": null,
75
+ "top_k": 50,
76
+ "top_p": 1.0,
77
+ "torch_dtype": null,
78
+ "torchscript": false,
79
+ "transformers_version": "4.12.0.dev0",
80
+ "use_bfloat16": false,
81
+ "vocab_size": 49408
82
+ },
83
+ "text_config_dict": null,
84
+ "torch_dtype": "bfloat16",
85
+ "transformers_version": null,
86
+ "vision_config": {
87
+ "_name_or_path": "",
88
+ "add_cross_attention": false,
89
+ "architectures": null,
90
+ "attention_dropout": 0.0,
91
+ "bad_words_ids": null,
92
+ "bos_token_id": null,
93
+ "chunk_size_feed_forward": 0,
94
+ "decoder_start_token_id": null,
95
+ "diversity_penalty": 0.0,
96
+ "do_sample": false,
97
+ "dropout": 0.0,
98
+ "early_stopping": false,
99
+ "encoder_no_repeat_ngram_size": 0,
100
+ "eos_token_id": null,
101
+ "finetuning_task": null,
102
+ "forced_bos_token_id": null,
103
+ "forced_eos_token_id": null,
104
+ "hidden_act": "quick_gelu",
105
+ "hidden_size": 768,
106
+ "id2label": {
107
+ "0": "LABEL_0",
108
+ "1": "LABEL_1"
109
+ },
110
+ "image_size": 224,
111
+ "initializer_factor": 1.0,
112
+ "initializer_range": 0.02,
113
+ "intermediate_size": 3072,
114
+ "is_decoder": false,
115
+ "is_encoder_decoder": false,
116
+ "label2id": {
117
+ "LABEL_0": 0,
118
+ "LABEL_1": 1
119
+ },
120
+ "layer_norm_eps": 1e-05,
121
+ "length_penalty": 1.0,
122
+ "max_length": 20,
123
+ "min_length": 0,
124
+ "model_type": "clip_vision_model",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 12,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 12,
130
+ "num_return_sequences": 1,
131
+ "output_attentions": false,
132
+ "output_hidden_states": false,
133
+ "output_scores": false,
134
+ "pad_token_id": null,
135
+ "patch_size": 16,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "projection_dim" : 512,
139
+ "pruned_heads": {},
140
+ "remove_invalid_values": false,
141
+ "repetition_penalty": 1.0,
142
+ "return_dict": true,
143
+ "return_dict_in_generate": false,
144
+ "sep_token_id": null,
145
+ "task_specific_params": null,
146
+ "temperature": 1.0,
147
+ "tie_encoder_decoder": false,
148
+ "tie_word_embeddings": true,
149
+ "tokenizer_class": null,
150
+ "top_k": 50,
151
+ "top_p": 1.0,
152
+ "torch_dtype": null,
153
+ "torchscript": false,
154
+ "transformers_version": "4.12.0.dev0",
155
+ "use_bfloat16": false
156
+ },
157
+ "vision_config_dict": {
158
+ "patch_size": 16
159
+ }
160
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:300ab945304bfa6d6e26046db1867815d326d7156c019fb39ba725472bc6c846
3
+ size 299289098
modeling_MMRet_CLIP.py ADDED
@@ -0,0 +1,1676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch CLIP model."""
16
+
17
+ from dataclasses import dataclass
18
+ from typing import Any, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.utils.checkpoint
22
+ from torch import nn
23
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
24
+ from PIL import Image
25
+ from ...activations import ACT2FN
26
+ from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
27
+ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
28
+ from ...modeling_utils import PreTrainedModel
29
+ from ...pytorch_utils import is_torch_greater_or_equal_than_2_2
30
+ from ...utils import (
31
+ ModelOutput,
32
+ add_code_sample_docstrings,
33
+ add_start_docstrings,
34
+ add_start_docstrings_to_model_forward,
35
+ is_flash_attn_2_available,
36
+ is_flash_attn_greater_or_equal_2_10,
37
+ logging,
38
+ replace_return_docstrings,
39
+ )
40
+ from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
41
+
42
+
43
+ if is_flash_attn_2_available():
44
+ from ...modeling_flash_attention_utils import _flash_attention_forward
45
+
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+ # General docstring
50
+ _CONFIG_FOR_DOC = "CLIPConfig"
51
+ _CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"
52
+
53
+ # Image classification docstring
54
+ _IMAGE_CLASS_CHECKPOINT = "openai/clip-vit-base-patch32"
55
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_0"
56
+
57
+
58
+ # contrastive loss function, adapted from
59
+ # https://sachinruk.github.io/blog/2021-03-07-clip.html
60
+ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
61
+ return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
62
+
63
+
64
+ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
65
+ caption_loss = contrastive_loss(similarity)
66
+ image_loss = contrastive_loss(similarity.t())
67
+ return (caption_loss + image_loss) / 2.0
68
+
69
+
70
+ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
71
+ """
72
+ This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
73
+ model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
74
+ """
75
+ square_tensor = torch.pow(tensor, 2)
76
+ sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
77
+ normed_tensor = torch.pow(sum_tensor, 0.5)
78
+ return normed_tensor
79
+
80
+
81
+ @dataclass
82
+ class CLIPVisionModelOutput(ModelOutput):
83
+ """
84
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
85
+
86
+ Args:
87
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
88
+ The image embeddings obtained by applying the projection layer to the pooler_output.
89
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
90
+ Sequence of hidden-states at the output of the last layer of the model.
91
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
92
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
93
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
94
+
95
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
96
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
97
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
98
+ sequence_length)`.
99
+
100
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
101
+ heads.
102
+ """
103
+
104
+ image_embeds: Optional[torch.FloatTensor] = None
105
+ last_hidden_state: torch.FloatTensor = None
106
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
107
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
108
+
109
+
110
+ @dataclass
111
+ class CLIPTextModelOutput(ModelOutput):
112
+ """
113
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
114
+
115
+ Args:
116
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
117
+ The text embeddings obtained by applying the projection layer to the pooler_output.
118
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
119
+ Sequence of hidden-states at the output of the last layer of the model.
120
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
121
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
122
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
123
+
124
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
125
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
126
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
127
+ sequence_length)`.
128
+
129
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
130
+ heads.
131
+ """
132
+
133
+ text_embeds: Optional[torch.FloatTensor] = None
134
+ last_hidden_state: torch.FloatTensor = None
135
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
136
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
137
+
138
+
139
+ @dataclass
140
+ class CLIPOutput(ModelOutput):
141
+ """
142
+ Args:
143
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
144
+ Contrastive loss for image-text similarity.
145
+ logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
146
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
147
+ similarity scores.
148
+ logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
149
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
150
+ similarity scores.
151
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
152
+ The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
153
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
154
+ The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
155
+ text_model_output (`BaseModelOutputWithPooling`):
156
+ The output of the [`CLIPTextModel`].
157
+ vision_model_output (`BaseModelOutputWithPooling`):
158
+ The output of the [`CLIPVisionModel`].
159
+ """
160
+
161
+ loss: Optional[torch.FloatTensor] = None
162
+ logits_per_image: torch.FloatTensor = None
163
+ logits_per_text: torch.FloatTensor = None
164
+ text_embeds: torch.FloatTensor = None
165
+ image_embeds: torch.FloatTensor = None
166
+ text_model_output: BaseModelOutputWithPooling = None
167
+ vision_model_output: BaseModelOutputWithPooling = None
168
+
169
+ def to_tuple(self) -> Tuple[Any]:
170
+ return tuple(
171
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
172
+ for k in self.keys()
173
+ )
174
+
175
+
176
+ class CLIPVisionEmbeddings(nn.Module):
177
+ def __init__(self, config: CLIPVisionConfig):
178
+ super().__init__()
179
+ self.config = config
180
+ self.embed_dim = config.hidden_size
181
+ self.image_size = config.image_size
182
+ self.patch_size = config.patch_size
183
+
184
+ self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
185
+
186
+ self.patch_embedding = nn.Conv2d(
187
+ in_channels=config.num_channels,
188
+ out_channels=self.embed_dim,
189
+ kernel_size=self.patch_size,
190
+ stride=self.patch_size,
191
+ bias=False,
192
+ )
193
+
194
+ self.num_patches = (self.image_size // self.patch_size) ** 2
195
+ self.num_positions = self.num_patches + 1
196
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
197
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
198
+
199
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
200
+ batch_size = pixel_values.shape[0]
201
+ target_dtype = self.patch_embedding.weight.dtype
202
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
203
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
204
+
205
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
206
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
207
+ embeddings = embeddings + self.position_embedding(self.position_ids)
208
+ return embeddings
209
+
210
+
211
+ class CLIPTextEmbeddings(nn.Module):
212
+ def __init__(self, config: CLIPTextConfig):
213
+ super().__init__()
214
+ embed_dim = config.hidden_size
215
+
216
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
217
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
218
+
219
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
220
+ self.register_buffer(
221
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
222
+ )
223
+
224
+ def forward(
225
+ self,
226
+ input_ids: Optional[torch.LongTensor] = None,
227
+ position_ids: Optional[torch.LongTensor] = None,
228
+ inputs_embeds: Optional[torch.FloatTensor] = None,
229
+ ) -> torch.Tensor:
230
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
231
+
232
+ if position_ids is None:
233
+ position_ids = self.position_ids[:, :seq_length]
234
+
235
+ if inputs_embeds is None:
236
+ inputs_embeds = self.token_embedding(input_ids)
237
+
238
+ position_embeddings = self.position_embedding(position_ids)
239
+ embeddings = inputs_embeds + position_embeddings
240
+
241
+ return embeddings
242
+
243
+
244
+ class CLIPAttention(nn.Module):
245
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
246
+
247
+ def __init__(self, config):
248
+ super().__init__()
249
+ self.config = config
250
+ self.embed_dim = config.hidden_size
251
+ self.num_heads = config.num_attention_heads
252
+ self.head_dim = self.embed_dim // self.num_heads
253
+ if self.head_dim * self.num_heads != self.embed_dim:
254
+ raise ValueError(
255
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
256
+ f" {self.num_heads})."
257
+ )
258
+ self.scale = self.head_dim**-0.5
259
+ self.dropout = config.attention_dropout
260
+
261
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
262
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
263
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
264
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
265
+
266
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
267
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
268
+
269
+ def forward(
270
+ self,
271
+ hidden_states: torch.Tensor,
272
+ attention_mask: Optional[torch.Tensor] = None,
273
+ causal_attention_mask: Optional[torch.Tensor] = None,
274
+ output_attentions: Optional[bool] = False,
275
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
276
+ """Input shape: Batch x Time x Channel"""
277
+
278
+ bsz, tgt_len, embed_dim = hidden_states.size()
279
+
280
+ # get query proj
281
+ query_states = self.q_proj(hidden_states) * self.scale
282
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
283
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
284
+
285
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
286
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
287
+ key_states = key_states.view(*proj_shape)
288
+ value_states = value_states.view(*proj_shape)
289
+
290
+ src_len = key_states.size(1)
291
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
292
+
293
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
294
+ raise ValueError(
295
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
296
+ f" {attn_weights.size()}"
297
+ )
298
+
299
+ # apply the causal_attention_mask first
300
+ if causal_attention_mask is not None:
301
+ if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
302
+ raise ValueError(
303
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
304
+ f" {causal_attention_mask.size()}"
305
+ )
306
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
307
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
308
+
309
+ if attention_mask is not None:
310
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
311
+ raise ValueError(
312
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
313
+ )
314
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
315
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
316
+
317
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
318
+
319
+ if output_attentions:
320
+ # this operation is a bit akward, but it's required to
321
+ # make sure that attn_weights keeps its gradient.
322
+ # In order to do so, attn_weights have to reshaped
323
+ # twice and have to be reused in the following
324
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
325
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
326
+ else:
327
+ attn_weights_reshaped = None
328
+
329
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
330
+
331
+ attn_output = torch.bmm(attn_probs, value_states)
332
+
333
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
334
+ raise ValueError(
335
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
336
+ f" {attn_output.size()}"
337
+ )
338
+
339
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
340
+ attn_output = attn_output.transpose(1, 2)
341
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
342
+
343
+ attn_output = self.out_proj(attn_output)
344
+
345
+ return attn_output, attn_weights_reshaped
346
+
347
+
348
+ class CLIPFlashAttention2(CLIPAttention):
349
+ """
350
+ CLIPAttention flash attention module. This module inherits from `CLIPAttention` as the weights of the module stays
351
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
352
+ flash attention and deal with padding tokens in case the input contains any of them.
353
+ """
354
+
355
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
356
+ def __init__(self, *args, **kwargs):
357
+ super().__init__(*args, **kwargs)
358
+
359
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
360
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
361
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
362
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
363
+
364
+ # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
365
+ def forward(
366
+ self,
367
+ hidden_states: torch.Tensor,
368
+ attention_mask: Optional[torch.Tensor] = None,
369
+ causal_attention_mask: Optional[torch.Tensor] = None,
370
+ output_attentions: Optional[bool] = False,
371
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
372
+ output_attentions = False
373
+
374
+ batch_size, q_len, _ = hidden_states.size()
375
+
376
+ query_states = self.q_proj(hidden_states)
377
+ key_states = self.k_proj(hidden_states)
378
+ value_states = self.v_proj(hidden_states)
379
+
380
+ # Flash attention requires the input to have the shape
381
+ # batch_size x seq_length x head_dim x hidden_dim
382
+ # therefore we just need to keep the original shape
383
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim)
384
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim)
385
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim)
386
+
387
+ dropout_rate = self.dropout if self.training else 0.0
388
+
389
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
390
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
391
+ # cast them back in the correct dtype just to be sure everything works as expected.
392
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
393
+ # in fp32.
394
+
395
+ input_dtype = query_states.dtype
396
+ if input_dtype == torch.float32:
397
+ if torch.is_autocast_enabled():
398
+ target_dtype = torch.get_autocast_gpu_dtype()
399
+ # Handle the case where the model is quantized
400
+ elif hasattr(self.config, "_pre_quantization_dtype"):
401
+ target_dtype = self.config._pre_quantization_dtype
402
+ else:
403
+ target_dtype = self.q_proj.weight.dtype
404
+
405
+ logger.warning_once(
406
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
407
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
408
+ f" {target_dtype}."
409
+ )
410
+
411
+ query_states = query_states.to(target_dtype)
412
+ key_states = key_states.to(target_dtype)
413
+ value_states = value_states.to(target_dtype)
414
+
415
+ attn_output = _flash_attention_forward(
416
+ query_states,
417
+ key_states,
418
+ value_states,
419
+ attention_mask,
420
+ q_len,
421
+ dropout=dropout_rate,
422
+ is_causal=causal_attention_mask is not None,
423
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
424
+ )
425
+
426
+ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous()
427
+ attn_output = self.out_proj(attn_output)
428
+
429
+ if not output_attentions:
430
+ attn_weights = None
431
+
432
+ return attn_output, attn_weights
433
+
434
+
435
+ class CLIPSdpaAttention(CLIPAttention):
436
+ """
437
+ SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
438
+ `CLIPAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
439
+ SDPA API.
440
+ """
441
+
442
+ # Adapted from CLIPAttention.forward
443
+ def forward(
444
+ self,
445
+ hidden_states: torch.Tensor,
446
+ attention_mask: Optional[torch.Tensor] = None,
447
+ causal_attention_mask: Optional[torch.Tensor] = None,
448
+ output_attentions: Optional[bool] = False,
449
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
450
+ if output_attentions:
451
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
452
+ logger.warning_once(
453
+ "CLIPModel is using CLIPSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not "
454
+ "support `output_attentions=True`. Falling back to the manual attention implementation, but specifying "
455
+ "the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can "
456
+ 'be removed using the argument `attn_implementation="eager"` when loading the model.'
457
+ )
458
+ return super().forward(
459
+ hidden_states=hidden_states,
460
+ attention_mask=attention_mask,
461
+ causal_attention_mask=causal_attention_mask,
462
+ output_attentions=output_attentions,
463
+ )
464
+
465
+ # CLIP text model uses both `causal_attention_mask` and `attention_mask`
466
+ if attention_mask is not None and causal_attention_mask is not None:
467
+ attn_mask = attention_mask + causal_attention_mask
468
+ elif causal_attention_mask is not None:
469
+ attn_mask = causal_attention_mask
470
+ else:
471
+ attn_mask = attention_mask
472
+
473
+ bsz, tgt_len, embed_dim = hidden_states.size()
474
+
475
+ query_states = self.q_proj(hidden_states)
476
+ key_states = self.k_proj(hidden_states)
477
+ value_states = self.v_proj(hidden_states)
478
+
479
+ query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
480
+ key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
481
+ value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
482
+
483
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
484
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
485
+ if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None:
486
+ query_states = query_states.contiguous()
487
+ key_states = key_states.contiguous()
488
+ value_states = value_states.contiguous()
489
+
490
+ # CLIP text model uses both `causal_attention_mask` and `attention_mask` sequentially.
491
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
492
+ query_states,
493
+ key_states,
494
+ value_states,
495
+ attn_mask=attn_mask,
496
+ dropout_p=self.dropout if self.training else 0.0,
497
+ scale=self.scale,
498
+ )
499
+
500
+ attn_output = attn_output.transpose(1, 2)
501
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
502
+
503
+ attn_output = self.out_proj(attn_output)
504
+
505
+ return attn_output, None
506
+
507
+
508
+ CLIP_ATTENTION_CLASSES = {
509
+ "eager": CLIPAttention,
510
+ "sdpa": CLIPSdpaAttention,
511
+ "flash_attention_2": CLIPFlashAttention2,
512
+ }
513
+
514
+
515
+ class CLIPMLP(nn.Module):
516
+ def __init__(self, config):
517
+ super().__init__()
518
+ self.config = config
519
+ self.activation_fn = ACT2FN[config.hidden_act]
520
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
521
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
522
+
523
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
524
+ hidden_states = self.fc1(hidden_states)
525
+ hidden_states = self.activation_fn(hidden_states)
526
+ hidden_states = self.fc2(hidden_states)
527
+ return hidden_states
528
+
529
+
530
+ class CLIPEncoderLayer(nn.Module):
531
+ def __init__(self, config: CLIPConfig):
532
+ super().__init__()
533
+ self.embed_dim = config.hidden_size
534
+ self.self_attn = CLIP_ATTENTION_CLASSES[config._attn_implementation](config)
535
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
536
+ self.mlp = CLIPMLP(config)
537
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
538
+
539
+ def forward(
540
+ self,
541
+ hidden_states: torch.Tensor,
542
+ attention_mask: torch.Tensor,
543
+ causal_attention_mask: torch.Tensor,
544
+ output_attentions: Optional[bool] = False,
545
+ ) -> Tuple[torch.FloatTensor]:
546
+ """
547
+ Args:
548
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
549
+ attention_mask (`torch.FloatTensor`): attention mask of size
550
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
551
+ `(config.encoder_attention_heads,)`.
552
+ output_attentions (`bool`, *optional*):
553
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
554
+ returned tensors for more detail.
555
+ """
556
+ residual = hidden_states
557
+
558
+ hidden_states = self.layer_norm1(hidden_states)
559
+ hidden_states, attn_weights = self.self_attn(
560
+ hidden_states=hidden_states,
561
+ attention_mask=attention_mask,
562
+ causal_attention_mask=causal_attention_mask,
563
+ output_attentions=output_attentions,
564
+ )
565
+ hidden_states = residual + hidden_states
566
+
567
+ residual = hidden_states
568
+ hidden_states = self.layer_norm2(hidden_states)
569
+ hidden_states = self.mlp(hidden_states)
570
+ hidden_states = residual + hidden_states
571
+
572
+ outputs = (hidden_states,)
573
+
574
+ if output_attentions:
575
+ outputs += (attn_weights,)
576
+
577
+ return outputs
578
+
579
+
580
+ class CLIPPreTrainedModel(PreTrainedModel):
581
+ """
582
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
583
+ models.
584
+ """
585
+
586
+ config_class = CLIPConfig
587
+ base_model_prefix = "clip"
588
+ supports_gradient_checkpointing = True
589
+ _supports_sdpa = True
590
+ _supports_flash_attn_2 = True
591
+
592
+ def _init_weights(self, module):
593
+ """Initialize the weights"""
594
+ factor = self.config.initializer_factor
595
+ if isinstance(module, CLIPTextEmbeddings):
596
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
597
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
598
+ elif isinstance(module, CLIPVisionEmbeddings):
599
+ factor = self.config.initializer_factor
600
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
601
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
602
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
603
+ elif isinstance(module, CLIPAttention):
604
+ factor = self.config.initializer_factor
605
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
606
+ out_proj_std = (module.embed_dim**-0.5) * factor
607
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
608
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
609
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
610
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
611
+ elif isinstance(module, CLIPMLP):
612
+ factor = self.config.initializer_factor
613
+ in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
614
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
615
+ nn.init.normal_(module.fc1.weight, std=fc_std)
616
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
617
+ elif isinstance(module, CLIPModel):
618
+ nn.init.normal_(
619
+ module.text_projection.weight,
620
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
621
+ )
622
+ nn.init.normal_(
623
+ module.visual_projection.weight,
624
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
625
+ )
626
+ elif isinstance(module, CLIPVisionModelWithProjection):
627
+ nn.init.normal_(
628
+ module.visual_projection.weight,
629
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
630
+ )
631
+ elif isinstance(module, CLIPTextModelWithProjection):
632
+ nn.init.normal_(
633
+ module.text_projection.weight,
634
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
635
+ )
636
+ elif isinstance(module, CLIPForImageClassification):
637
+ nn.init.normal_(
638
+ module.classifier.weight,
639
+ std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
640
+ )
641
+
642
+ if isinstance(module, nn.LayerNorm):
643
+ module.bias.data.zero_()
644
+ module.weight.data.fill_(1.0)
645
+ if isinstance(module, nn.Linear) and module.bias is not None:
646
+ module.bias.data.zero_()
647
+
648
+
649
+ CLIP_START_DOCSTRING = r"""
650
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
651
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
652
+ etc.)
653
+
654
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
655
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
656
+ and behavior.
657
+
658
+ Parameters:
659
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
660
+ Initializing with a config file does not load the weights associated with the model, only the
661
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
662
+ """
663
+
664
+ CLIP_TEXT_INPUTS_DOCSTRING = r"""
665
+ Args:
666
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
667
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
668
+ it.
669
+
670
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
671
+ [`PreTrainedTokenizer.__call__`] for details.
672
+
673
+ [What are input IDs?](../glossary#input-ids)
674
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
675
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
676
+
677
+ - 1 for tokens that are **not masked**,
678
+ - 0 for tokens that are **masked**.
679
+
680
+ [What are attention masks?](../glossary#attention-mask)
681
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
682
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
683
+ config.max_position_embeddings - 1]`.
684
+
685
+ [What are position IDs?](../glossary#position-ids)
686
+ output_attentions (`bool`, *optional*):
687
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
688
+ tensors for more detail.
689
+ output_hidden_states (`bool`, *optional*):
690
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
691
+ more detail.
692
+ return_dict (`bool`, *optional*):
693
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
694
+ """
695
+
696
+ CLIP_VISION_INPUTS_DOCSTRING = r"""
697
+ Args:
698
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
699
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
700
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
701
+ output_attentions (`bool`, *optional*):
702
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
703
+ tensors for more detail.
704
+ output_hidden_states (`bool`, *optional*):
705
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
706
+ more detail.
707
+ return_dict (`bool`, *optional*):
708
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
709
+ """
710
+
711
+ CLIP_INPUTS_DOCSTRING = r"""
712
+ Args:
713
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
714
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
715
+ it.
716
+
717
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
718
+ [`PreTrainedTokenizer.__call__`] for details.
719
+
720
+ [What are input IDs?](../glossary#input-ids)
721
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
722
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
723
+
724
+ - 1 for tokens that are **not masked**,
725
+ - 0 for tokens that are **masked**.
726
+
727
+ [What are attention masks?](../glossary#attention-mask)
728
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
729
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
730
+ config.max_position_embeddings - 1]`.
731
+
732
+ [What are position IDs?](../glossary#position-ids)
733
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
734
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
735
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
736
+ return_loss (`bool`, *optional*):
737
+ Whether or not to return the contrastive loss.
738
+ output_attentions (`bool`, *optional*):
739
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
740
+ tensors for more detail.
741
+ output_hidden_states (`bool`, *optional*):
742
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
743
+ more detail.
744
+ return_dict (`bool`, *optional*):
745
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
746
+ """
747
+
748
+
749
+ class CLIPEncoder(nn.Module):
750
+ """
751
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
752
+ [`CLIPEncoderLayer`].
753
+
754
+ Args:
755
+ config: CLIPConfig
756
+ """
757
+
758
+ def __init__(self, config: CLIPConfig):
759
+ super().__init__()
760
+ self.config = config
761
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
762
+ self.gradient_checkpointing = False
763
+
764
+ def forward(
765
+ self,
766
+ inputs_embeds,
767
+ attention_mask: Optional[torch.Tensor] = None,
768
+ causal_attention_mask: Optional[torch.Tensor] = None,
769
+ output_attentions: Optional[bool] = None,
770
+ output_hidden_states: Optional[bool] = None,
771
+ return_dict: Optional[bool] = None,
772
+ ) -> Union[Tuple, BaseModelOutput]:
773
+ r"""
774
+ Args:
775
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
776
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
777
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
778
+ than the model's internal embedding lookup matrix.
779
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
780
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
781
+
782
+ - 1 for tokens that are **not masked**,
783
+ - 0 for tokens that are **masked**.
784
+
785
+ [What are attention masks?](../glossary#attention-mask)
786
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
787
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
788
+
789
+ - 1 for tokens that are **not masked**,
790
+ - 0 for tokens that are **masked**.
791
+
792
+ [What are attention masks?](../glossary#attention-mask)
793
+ output_attentions (`bool`, *optional*):
794
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
795
+ returned tensors for more detail.
796
+ output_hidden_states (`bool`, *optional*):
797
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
798
+ for more detail.
799
+ return_dict (`bool`, *optional*):
800
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
801
+ """
802
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
803
+ output_hidden_states = (
804
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
805
+ )
806
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
807
+
808
+ encoder_states = () if output_hidden_states else None
809
+ all_attentions = () if output_attentions else None
810
+
811
+ hidden_states = inputs_embeds
812
+ for idx, encoder_layer in enumerate(self.layers):
813
+ if output_hidden_states:
814
+ encoder_states = encoder_states + (hidden_states,)
815
+ if self.gradient_checkpointing and self.training:
816
+ layer_outputs = self._gradient_checkpointing_func(
817
+ encoder_layer.__call__,
818
+ hidden_states,
819
+ attention_mask,
820
+ causal_attention_mask,
821
+ output_attentions,
822
+ )
823
+ else:
824
+ layer_outputs = encoder_layer(
825
+ hidden_states,
826
+ attention_mask,
827
+ causal_attention_mask,
828
+ output_attentions=output_attentions,
829
+ )
830
+
831
+ hidden_states = layer_outputs[0]
832
+
833
+ if output_attentions:
834
+ all_attentions = all_attentions + (layer_outputs[1],)
835
+
836
+ if output_hidden_states:
837
+ encoder_states = encoder_states + (hidden_states,)
838
+
839
+ if not return_dict:
840
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
841
+ return BaseModelOutput(
842
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
843
+ )
844
+
845
+
846
+ class CLIPTextTransformer(nn.Module):
847
+ def __init__(self, config: CLIPTextConfig):
848
+ super().__init__()
849
+ self.config = config
850
+ embed_dim = config.hidden_size
851
+ self.embeddings = CLIPTextEmbeddings(config)
852
+ self.encoder = CLIPEncoder(config)
853
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
854
+
855
+ # For `pooled_output` computation
856
+ self.eos_token_id = config.eos_token_id
857
+
858
+ # For attention mask, it differs between `flash_attention_2` and other attention implementations
859
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
860
+
861
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
862
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
863
+ def forward(
864
+ self,
865
+ input_ids: Optional[torch.Tensor] = None,
866
+ attention_mask: Optional[torch.Tensor] = None,
867
+ position_ids: Optional[torch.Tensor] = None,
868
+ output_attentions: Optional[bool] = None,
869
+ output_hidden_states: Optional[bool] = None,
870
+ return_dict: Optional[bool] = None,
871
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
872
+ r"""
873
+ Returns:
874
+
875
+ """
876
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
877
+ output_hidden_states = (
878
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
879
+ )
880
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
881
+
882
+ if input_ids is None:
883
+ raise ValueError("You have to specify input_ids")
884
+
885
+ input_shape = input_ids.size()
886
+ input_ids = input_ids.view(-1, input_shape[-1])
887
+
888
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
889
+
890
+ # CLIP's text model uses causal mask, prepare it here.
891
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
892
+ causal_attention_mask = _create_4d_causal_attention_mask(
893
+ input_shape, hidden_states.dtype, device=hidden_states.device
894
+ )
895
+
896
+ # expand attention_mask
897
+ if attention_mask is not None and not self._use_flash_attention_2:
898
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
899
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
900
+
901
+ encoder_outputs = self.encoder(
902
+ inputs_embeds=hidden_states,
903
+ attention_mask=attention_mask,
904
+ causal_attention_mask=causal_attention_mask,
905
+ output_attentions=output_attentions,
906
+ output_hidden_states=output_hidden_states,
907
+ return_dict=return_dict,
908
+ )
909
+
910
+ last_hidden_state = encoder_outputs[0]
911
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
912
+
913
+ if self.eos_token_id == 2:
914
+ # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here.
915
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
916
+ # ------------------------------------------------------------
917
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
918
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
919
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
920
+ pooled_output = last_hidden_state[
921
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
922
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
923
+ ]
924
+ else:
925
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
926
+ pooled_output = last_hidden_state[
927
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
928
+ # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
929
+ # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer)
930
+ (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
931
+ .int()
932
+ .argmax(dim=-1),
933
+ ]
934
+
935
+ if not return_dict:
936
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
937
+
938
+ return BaseModelOutputWithPooling(
939
+ last_hidden_state=last_hidden_state,
940
+ pooler_output=pooled_output,
941
+ hidden_states=encoder_outputs.hidden_states,
942
+ attentions=encoder_outputs.attentions,
943
+ )
944
+
945
+
946
+ @add_start_docstrings(
947
+ """The text model from CLIP without any head or projection on top.""",
948
+ CLIP_START_DOCSTRING,
949
+ )
950
+ class CLIPTextModel(CLIPPreTrainedModel):
951
+ config_class = CLIPTextConfig
952
+
953
+ _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
954
+
955
+ def __init__(self, config: CLIPTextConfig):
956
+ super().__init__(config)
957
+ self.text_model = CLIPTextTransformer(config)
958
+ # Initialize weights and apply final processing
959
+ self.post_init()
960
+
961
+ def get_input_embeddings(self) -> nn.Module:
962
+ return self.text_model.embeddings.token_embedding
963
+
964
+ def set_input_embeddings(self, value):
965
+ self.text_model.embeddings.token_embedding = value
966
+
967
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
968
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
969
+ def forward(
970
+ self,
971
+ input_ids: Optional[torch.Tensor] = None,
972
+ attention_mask: Optional[torch.Tensor] = None,
973
+ position_ids: Optional[torch.Tensor] = None,
974
+ output_attentions: Optional[bool] = None,
975
+ output_hidden_states: Optional[bool] = None,
976
+ return_dict: Optional[bool] = None,
977
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
978
+ r"""
979
+ Returns:
980
+
981
+ Examples:
982
+
983
+ ```python
984
+ >>> from transformers import AutoTokenizer, CLIPTextModel
985
+
986
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
987
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
988
+
989
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
990
+
991
+ >>> outputs = model(**inputs)
992
+ >>> last_hidden_state = outputs.last_hidden_state
993
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
994
+ ```"""
995
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
996
+
997
+ return self.text_model(
998
+ input_ids=input_ids,
999
+ attention_mask=attention_mask,
1000
+ position_ids=position_ids,
1001
+ output_attentions=output_attentions,
1002
+ output_hidden_states=output_hidden_states,
1003
+ return_dict=return_dict,
1004
+ )
1005
+
1006
+
1007
+ class CLIPVisionTransformer(nn.Module):
1008
+ def __init__(self, config: CLIPVisionConfig):
1009
+ super().__init__()
1010
+ self.config = config
1011
+ embed_dim = config.hidden_size
1012
+
1013
+ self.embeddings = CLIPVisionEmbeddings(config)
1014
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1015
+ self.encoder = CLIPEncoder(config)
1016
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1017
+
1018
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1019
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
1020
+ def forward(
1021
+ self,
1022
+ pixel_values: Optional[torch.FloatTensor] = None,
1023
+ output_attentions: Optional[bool] = None,
1024
+ output_hidden_states: Optional[bool] = None,
1025
+ return_dict: Optional[bool] = None,
1026
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1027
+ r"""
1028
+ Returns:
1029
+
1030
+ """
1031
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1032
+ output_hidden_states = (
1033
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1034
+ )
1035
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1036
+
1037
+ if pixel_values is None:
1038
+ raise ValueError("You have to specify pixel_values")
1039
+
1040
+ hidden_states = self.embeddings(pixel_values)
1041
+ hidden_states = self.pre_layrnorm(hidden_states)
1042
+
1043
+ encoder_outputs = self.encoder(
1044
+ inputs_embeds=hidden_states,
1045
+ output_attentions=output_attentions,
1046
+ output_hidden_states=output_hidden_states,
1047
+ return_dict=return_dict,
1048
+ )
1049
+
1050
+ last_hidden_state = encoder_outputs[0]
1051
+ pooled_output = last_hidden_state[:, 0, :]
1052
+ pooled_output = self.post_layernorm(pooled_output)
1053
+
1054
+ if not return_dict:
1055
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
1056
+
1057
+ return BaseModelOutputWithPooling(
1058
+ last_hidden_state=last_hidden_state,
1059
+ pooler_output=pooled_output,
1060
+ hidden_states=encoder_outputs.hidden_states,
1061
+ attentions=encoder_outputs.attentions,
1062
+ )
1063
+
1064
+
1065
+ @add_start_docstrings(
1066
+ """The vision model from CLIP without any head or projection on top.""",
1067
+ CLIP_START_DOCSTRING,
1068
+ )
1069
+ class CLIPVisionModel(CLIPPreTrainedModel):
1070
+ config_class = CLIPVisionConfig
1071
+ main_input_name = "pixel_values"
1072
+ _no_split_modules = ["CLIPEncoderLayer"]
1073
+
1074
+ def __init__(self, config: CLIPVisionConfig):
1075
+ super().__init__(config)
1076
+ self.vision_model = CLIPVisionTransformer(config)
1077
+ # Initialize weights and apply final processing
1078
+ self.post_init()
1079
+
1080
+ def get_input_embeddings(self) -> nn.Module:
1081
+ return self.vision_model.embeddings.patch_embedding
1082
+
1083
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1084
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
1085
+ def forward(
1086
+ self,
1087
+ pixel_values: Optional[torch.FloatTensor] = None,
1088
+ output_attentions: Optional[bool] = None,
1089
+ output_hidden_states: Optional[bool] = None,
1090
+ return_dict: Optional[bool] = None,
1091
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1092
+ r"""
1093
+ Returns:
1094
+
1095
+ Examples:
1096
+
1097
+ ```python
1098
+ >>> from PIL import Image
1099
+ >>> import requests
1100
+ >>> from transformers import AutoProcessor, CLIPVisionModel
1101
+
1102
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
1103
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1104
+
1105
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1106
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1107
+
1108
+ >>> inputs = processor(images=image, return_tensors="pt")
1109
+
1110
+ >>> outputs = model(**inputs)
1111
+ >>> last_hidden_state = outputs.last_hidden_state
1112
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
1113
+ ```"""
1114
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1115
+
1116
+ return self.vision_model(
1117
+ pixel_values=pixel_values,
1118
+ output_attentions=output_attentions,
1119
+ output_hidden_states=output_hidden_states,
1120
+ return_dict=return_dict,
1121
+ )
1122
+
1123
+
1124
+ @add_start_docstrings(CLIP_START_DOCSTRING)
1125
+ class CLIPModel(CLIPPreTrainedModel):
1126
+ config_class = CLIPConfig
1127
+ _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
1128
+
1129
+ def __init__(self, config: CLIPConfig):
1130
+ super().__init__(config)
1131
+
1132
+ if not isinstance(config.text_config, CLIPTextConfig):
1133
+ raise TypeError(
1134
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
1135
+ f" {type(config.text_config)}."
1136
+ )
1137
+
1138
+ if not isinstance(config.vision_config, CLIPVisionConfig):
1139
+ raise TypeError(
1140
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
1141
+ f" {type(config.vision_config)}."
1142
+ )
1143
+
1144
+ text_config = config.text_config
1145
+ vision_config = config.vision_config
1146
+
1147
+ self.projection_dim = config.projection_dim
1148
+ self.text_embed_dim = text_config.hidden_size
1149
+ self.vision_embed_dim = vision_config.hidden_size
1150
+
1151
+ text_model = CLIPTextModel._from_config(text_config, attn_implementation=config._attn_implementation)
1152
+ self.text_model = text_model.text_model
1153
+
1154
+ vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
1155
+ self.vision_model = vision_model.vision_model
1156
+
1157
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
1158
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
1159
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
1160
+
1161
+ # Initialize weights and apply final processing
1162
+ self.post_init()
1163
+
1164
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
1165
+ def get_text_features(
1166
+ self,
1167
+ input_ids: Optional[torch.Tensor] = None,
1168
+ attention_mask: Optional[torch.Tensor] = None,
1169
+ position_ids: Optional[torch.Tensor] = None,
1170
+ output_attentions: Optional[bool] = None,
1171
+ output_hidden_states: Optional[bool] = None,
1172
+ return_dict: Optional[bool] = None,
1173
+ ) -> torch.FloatTensor:
1174
+ r"""
1175
+ Returns:
1176
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1177
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
1178
+
1179
+ Examples:
1180
+
1181
+ ```python
1182
+ >>> from transformers import AutoTokenizer, CLIPModel
1183
+
1184
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1185
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
1186
+
1187
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
1188
+ >>> text_features = model.get_text_features(**inputs)
1189
+ ```"""
1190
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1191
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1192
+ output_hidden_states = (
1193
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1194
+ )
1195
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1196
+
1197
+ text_outputs = self.text_model(
1198
+ input_ids=input_ids,
1199
+ attention_mask=attention_mask,
1200
+ position_ids=position_ids,
1201
+ output_attentions=output_attentions,
1202
+ output_hidden_states=output_hidden_states,
1203
+ return_dict=return_dict,
1204
+ )
1205
+
1206
+ pooled_output = text_outputs[1]
1207
+ text_features = self.text_projection(pooled_output)
1208
+
1209
+ return text_features
1210
+
1211
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1212
+ def get_image_features(
1213
+ self,
1214
+ pixel_values: Optional[torch.FloatTensor] = None,
1215
+ output_attentions: Optional[bool] = None,
1216
+ output_hidden_states: Optional[bool] = None,
1217
+ return_dict: Optional[bool] = None,
1218
+ ) -> torch.FloatTensor:
1219
+ r"""
1220
+ Returns:
1221
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1222
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
1223
+
1224
+ Examples:
1225
+
1226
+ ```python
1227
+ >>> from PIL import Image
1228
+ >>> import requests
1229
+ >>> from transformers import AutoProcessor, CLIPModel
1230
+
1231
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1232
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1233
+
1234
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1235
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1236
+
1237
+ >>> inputs = processor(images=image, return_tensors="pt")
1238
+
1239
+ >>> image_features = model.get_image_features(**inputs)
1240
+ ```"""
1241
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1242
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1243
+ output_hidden_states = (
1244
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1245
+ )
1246
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1247
+
1248
+ vision_outputs = self.vision_model(
1249
+ pixel_values=pixel_values,
1250
+ output_attentions=output_attentions,
1251
+ output_hidden_states=output_hidden_states,
1252
+ return_dict=return_dict,
1253
+ )
1254
+
1255
+ pooled_output = vision_outputs[1] # pooled_output
1256
+ image_features = self.visual_projection(pooled_output)
1257
+
1258
+ return image_features
1259
+
1260
+
1261
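+ # NOTE: the helper methods below (encode_image / encode_text / encode_multimodal /
+ # data_process / encode) reference `self.model` and `self.processor`, neither of which
+ # is assigned in `__init__` above; they are assumed to be attached to the instance
+ # before these helpers are called.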
+ def encode_image(self, images):
1262
+ embeddings = self.model.get_image_features(images)
1263
+ embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
1264
+ return embeddings
1265
+
1266
+ def encode_text(self, text):
1267
+ embeddings = self.model.get_text_features(**text)
1268
+ embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
1269
+ return embeddings
1270
+
1271
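+ # Multimodal fusion: element-wise sum of the projected text and image features, followed by L2 normalization.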
+ def encode_multimodal(self, images, text):
1272
+ text_embeddings = self.model.get_text_features(**text)
1273
+ image_embeddings = self.model.get_image_features(images)
1274
+
1275
+ embeddings = text_embeddings + image_embeddings
1276
+ embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
1277
+
1278
+ return embeddings.contiguous()
1279
+
1280
+ def data_process(self, images=None, text=None):
1281
+ if images is None and text is not None:
1282
+ text = self.processor(text=text, return_tensors="pt", padding=True).to(self.model.device)
1283
+
1284
+ return images, text, "text"
1285
+ elif images is not None and text is None:
1286
+ if isinstance(images, str):
1287
+ images = Image.open(images).convert("RGB")
1288
+ elif isinstance(images, list):
1289
+ images = [Image.open(image).convert("RGB") for image in images]
1290
+ images = self.processor(images=images, return_tensors="pt").to(self.model.device)
1291
+ images = images["pixel_values"]
1292
+ return images, text, "images"
1293
+ elif images is not None and text is not None:
1294
+ assert type(images) == type(text), "images and text must be the same type: list or str"
1295
+ if isinstance(images, str):
1296
+ images = Image.open(images).convert("RGB")
1297
+ elif isinstance(images, list):
1298
+ assert len(images) == len(text), "images and text must be lists of the same length when lists are used"
1299
+ images = [Image.open(image).convert("RGB") for image in images]
1300
+ images = self.processor(images=images, return_tensors="pt").to(self.model.device)
1301
+ images = images["pixel_values"]
1302
+ text = self.processor(text=text, return_tensors="pt", padding=True).to(self.model.device)
1303
+ return images, text, "multimodal"
1304
+ else:
1305
+ raise ValueError("images and text cannot both be None")
1306
+
1307
+ def encode(self, images=None, text=None):
1308
+ images, text, data_type = self.data_process(images, text)
1309
+ if data_type == "images":
1310
+ return self.encode_image(images)
1311
+ elif data_type == "text":
1312
+ return self.encode_text(text)
1313
+ elif data_type == "multimodal":
1314
+ return self.encode_multimodal(images, text)
1315
+
1316
+
1317
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
1318
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=CLIPConfig)
1319
+ def forward(
1320
+ self,
1321
+ input_ids: Optional[torch.LongTensor] = None,
1322
+ pixel_values: Optional[torch.FloatTensor] = None,
1323
+ attention_mask: Optional[torch.Tensor] = None,
1324
+ position_ids: Optional[torch.LongTensor] = None,
1325
+ return_loss: Optional[bool] = None,
1326
+ output_attentions: Optional[bool] = None,
1327
+ output_hidden_states: Optional[bool] = None,
1328
+ return_dict: Optional[bool] = None,
1329
+ ) -> Union[Tuple, CLIPOutput]:
1330
+ r"""
1331
+ Returns:
1332
+
1333
+ Examples:
1334
+
1335
+ ```python
1336
+ >>> from PIL import Image
1337
+ >>> import requests
1338
+ >>> from transformers import AutoProcessor, CLIPModel
1339
+
1340
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1341
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1342
+
1343
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1344
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1345
+
1346
+ >>> inputs = processor(
1347
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
1348
+ ... )
1349
+
1350
+ >>> outputs = model(**inputs)
1351
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
1352
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
1353
+ ```"""
1354
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1355
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1356
+ output_hidden_states = (
1357
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1358
+ )
1359
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1360
+
1361
+ vision_outputs = self.vision_model(
1362
+ pixel_values=pixel_values,
1363
+ output_attentions=output_attentions,
1364
+ output_hidden_states=output_hidden_states,
1365
+ return_dict=return_dict,
1366
+ )
1367
+
1368
+ text_outputs = self.text_model(
1369
+ input_ids=input_ids,
1370
+ attention_mask=attention_mask,
1371
+ position_ids=position_ids,
1372
+ output_attentions=output_attentions,
1373
+ output_hidden_states=output_hidden_states,
1374
+ return_dict=return_dict,
1375
+ )
1376
+
1377
+ image_embeds = vision_outputs[1]
1378
+ image_embeds = self.visual_projection(image_embeds)
1379
+
1380
+ text_embeds = text_outputs[1]
1381
+ text_embeds = self.text_projection(text_embeds)
1382
+
1383
+ # normalized features
1384
+ image_embeds = image_embeds / _get_vector_norm(image_embeds)
1385
+ text_embeds = text_embeds / _get_vector_norm(text_embeds)
1386
+
1387
+ # cosine similarity as logits
1388
+ logit_scale = self.logit_scale.exp()
1389
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) * logit_scale.to(
1390
+ text_embeds.device
1391
+ )
1392
+ logits_per_image = logits_per_text.t()
1393
+
1394
+ loss = None
1395
+ if return_loss:
1396
+ loss = clip_loss(logits_per_text)
1397
+
1398
+ if not return_dict:
1399
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
1400
+ return ((loss,) + output) if loss is not None else output
1401
+
1402
+ return CLIPOutput(
1403
+ loss=loss,
1404
+ logits_per_image=logits_per_image,
1405
+ logits_per_text=logits_per_text,
1406
+ text_embeds=text_embeds,
1407
+ image_embeds=image_embeds,
1408
+ text_model_output=text_outputs,
1409
+ vision_model_output=vision_outputs,
1410
+ )
1411
+
1412
+
1413
+ @add_start_docstrings(
1414
+ """
1415
+ CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output).
1416
+ """,
1417
+ CLIP_START_DOCSTRING,
1418
+ )
1419
+ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
1420
+ config_class = CLIPTextConfig
1421
+
1422
+ _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
1423
+
1424
+ def __init__(self, config: CLIPTextConfig):
1425
+ super().__init__(config)
1426
+
1427
+ text_model = CLIPTextModel._from_config(config, attn_implementation=config._attn_implementation)
1428
+ self.text_model = text_model.text_model
1429
+
1430
+ self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
1431
+
1432
+ # Initialize weights and apply final processing
1433
+ self.post_init()
1434
+
1435
+ def get_input_embeddings(self) -> nn.Module:
1436
+ return self.text_model.embeddings.token_embedding
1437
+
1438
+ def set_input_embeddings(self, value):
1439
+ self.text_model.embeddings.token_embedding = value
1440
+
1441
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
1442
+ @replace_return_docstrings(output_type=CLIPTextModelOutput, config_class=CLIPTextConfig)
1443
+ def forward(
1444
+ self,
1445
+ input_ids: Optional[torch.Tensor] = None,
1446
+ attention_mask: Optional[torch.Tensor] = None,
1447
+ position_ids: Optional[torch.Tensor] = None,
1448
+ output_attentions: Optional[bool] = None,
1449
+ output_hidden_states: Optional[bool] = None,
1450
+ return_dict: Optional[bool] = None,
1451
+ ) -> Union[Tuple, CLIPTextModelOutput]:
1452
+ r"""
1453
+ Returns:
1454
+
1455
+ Examples:
1456
+
1457
+ ```python
1458
+ >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection
1459
+
1460
+ >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
1461
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
1462
+
1463
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
1464
+
1465
+ >>> outputs = model(**inputs)
1466
+ >>> text_embeds = outputs.text_embeds
1467
+ ```"""
1468
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1469
+
1470
+ text_outputs = self.text_model(
1471
+ input_ids=input_ids,
1472
+ attention_mask=attention_mask,
1473
+ position_ids=position_ids,
1474
+ output_attentions=output_attentions,
1475
+ output_hidden_states=output_hidden_states,
1476
+ return_dict=return_dict,
1477
+ )
1478
+
1479
+ pooled_output = text_outputs[1]
1480
+
1481
+ text_embeds = self.text_projection(pooled_output)
1482
+
1483
+ if not return_dict:
1484
+ outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
1485
+ return tuple(output for output in outputs if output is not None)
1486
+
1487
+ return CLIPTextModelOutput(
1488
+ text_embeds=text_embeds,
1489
+ last_hidden_state=text_outputs.last_hidden_state,
1490
+ hidden_states=text_outputs.hidden_states,
1491
+ attentions=text_outputs.attentions,
1492
+ )
1493
+
1494
+
1495
+ @add_start_docstrings(
1496
+ """
1497
+ CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
1498
+ """,
1499
+ CLIP_START_DOCSTRING,
1500
+ )
1501
+ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
1502
+ config_class = CLIPVisionConfig
1503
+ main_input_name = "pixel_values"
1504
+
1505
+ def __init__(self, config: CLIPVisionConfig):
1506
+ super().__init__(config)
1507
+
1508
+ vision_model = CLIPVisionModel._from_config(config, attn_implementation=config._attn_implementation)
1509
+ self.vision_model = vision_model.vision_model
1510
+
1511
+ self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
1512
+
1513
+ # Initialize weights and apply final processing
1514
+ self.post_init()
1515
+
1516
+ def get_input_embeddings(self) -> nn.Module:
1517
+ return self.vision_model.embeddings.patch_embedding
1518
+
1519
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1520
+ @replace_return_docstrings(output_type=CLIPVisionModelOutput, config_class=CLIPVisionConfig)
1521
+ def forward(
1522
+ self,
1523
+ pixel_values: Optional[torch.FloatTensor] = None,
1524
+ output_attentions: Optional[bool] = None,
1525
+ output_hidden_states: Optional[bool] = None,
1526
+ return_dict: Optional[bool] = None,
1527
+ ) -> Union[Tuple, CLIPVisionModelOutput]:
1528
+ r"""
1529
+ Returns:
1530
+
1531
+ Examples:
1532
+
1533
+ ```python
1534
+ >>> from PIL import Image
1535
+ >>> import requests
1536
+ >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
1537
+
1538
+ >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
1539
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1540
+
1541
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1542
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1543
+
1544
+ >>> inputs = processor(images=image, return_tensors="pt")
1545
+
1546
+ >>> outputs = model(**inputs)
1547
+ >>> image_embeds = outputs.image_embeds
1548
+ ```"""
1549
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1550
+
1551
+ vision_outputs = self.vision_model(
1552
+ pixel_values=pixel_values,
1553
+ output_attentions=output_attentions,
1554
+ output_hidden_states=output_hidden_states,
1555
+ return_dict=return_dict,
1556
+ )
1557
+
1558
+ pooled_output = vision_outputs[1] # pooled_output
1559
+
1560
+ image_embeds = self.visual_projection(pooled_output)
1561
+
1562
+ if not return_dict:
1563
+ outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
1564
+ return tuple(output for output in outputs if output is not None)
1565
+
1566
+ return CLIPVisionModelOutput(
1567
+ image_embeds=image_embeds,
1568
+ last_hidden_state=vision_outputs.last_hidden_state,
1569
+ hidden_states=vision_outputs.hidden_states,
1570
+ attentions=vision_outputs.attentions,
1571
+ )
1572
+
1573
+
1574
+ @add_start_docstrings(
1575
+ """
1576
+ CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
1577
+ the patch tokens) e.g. for ImageNet.
1578
+ """,
1579
+ CLIP_START_DOCSTRING,
1580
+ )
1581
+ class CLIPForImageClassification(CLIPPreTrainedModel):
1582
+ main_input_name = "pixel_values"
1583
+
1584
+ def __init__(self, config: CLIPConfig) -> None:
1585
+ super().__init__(config)
1586
+
1587
+ self.num_labels = config.num_labels
1588
+ vision_model = CLIPVisionModel._from_config(
1589
+ config.vision_config, attn_implementation=config._attn_implementation
1590
+ )
1591
+ self.vision_model = vision_model.vision_model
1592
+
1593
+ # Classifier head
1594
+ self.classifier = (
1595
+ nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
1596
+ )
1597
+
1598
+ # Initialize weights and apply final processing
1599
+ self.post_init()
1600
+
1601
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
1602
+ @add_code_sample_docstrings(
1603
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
1604
+ output_type=ImageClassifierOutput,
1605
+ config_class=_CONFIG_FOR_DOC,
1606
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
1607
+ )
1608
+ def forward(
1609
+ self,
1610
+ pixel_values: Optional[torch.Tensor] = None,
1611
+ labels: Optional[torch.Tensor] = None,
1612
+ output_attentions: Optional[bool] = None,
1613
+ output_hidden_states: Optional[bool] = None,
1614
+ return_dict: Optional[bool] = None,
1615
+ ) -> Union[tuple, ImageClassifierOutput]:
1616
+ r"""
1617
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1618
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
1619
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
1620
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1621
+ """
1622
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1623
+ output_hidden_states = (
1624
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1625
+ )
1626
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1627
+
1628
+ outputs = self.vision_model(
1629
+ pixel_values,
1630
+ output_attentions=output_attentions,
1631
+ output_hidden_states=output_hidden_states,
1632
+ return_dict=return_dict,
1633
+ )
1634
+
1635
+ sequence_output = outputs[0]
1636
+
1637
+ # average pool the patch tokens
1638
+ sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
1639
+ # apply classifier
1640
+ logits = self.classifier(sequence_output)
1641
+
1642
+ loss = None
1643
+ if labels is not None:
1644
+ # move labels to correct device to enable model parallelism
1645
+ labels = labels.to(logits.device)
1646
+ if self.config.problem_type is None:
1647
+ if self.num_labels == 1:
1648
+ self.config.problem_type = "regression"
1649
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1650
+ self.config.problem_type = "single_label_classification"
1651
+ else:
1652
+ self.config.problem_type = "multi_label_classification"
1653
+
1654
+ if self.config.problem_type == "regression":
1655
+ loss_fct = MSELoss()
1656
+ if self.num_labels == 1:
1657
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1658
+ else:
1659
+ loss = loss_fct(logits, labels)
1660
+ elif self.config.problem_type == "single_label_classification":
1661
+ loss_fct = CrossEntropyLoss()
1662
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1663
+ elif self.config.problem_type == "multi_label_classification":
1664
+ loss_fct = BCEWithLogitsLoss()
1665
+ loss = loss_fct(logits, labels)
1666
+
1667
+ if not return_dict:
1668
+ output = (logits,) + outputs[2:]
1669
+ return ((loss,) + output) if loss is not None else output
1670
+
1671
+ return ImageClassifierOutput(
1672
+ loss=loss,
1673
+ logits=logits,
1674
+ hidden_states=outputs.hidden_states,
1675
+ attentions=outputs.attentions,
1676
+ )
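
Since this checkpoint ships its own modeling code, it is meant to be loaded through `AutoModel` with `trust_remote_code=True`. The sketch below is a minimal, hypothetical usage example based only on the methods shown in this file: the repository id is a placeholder, `AutoProcessor` is assumed to resolve to a `CLIPProcessor` from the preprocessor and tokenizer configs uploaded alongside, and it uses the standard `get_text_features` / `get_image_features` API rather than the `encode` helpers (which additionally expect `self.model` and `self.processor` to be attached to the instance first).

```python
import torch
import requests
from PIL import Image
from transformers import AutoModel, AutoProcessor

REPO_ID = "JUNJIE99/MMRet-CLIP"  # placeholder repository id

# trust_remote_code=True lets AutoModel load the custom CLIPModel defined in this file.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True).eval()
processor = AutoProcessor.from_pretrained(REPO_ID)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["a photo of two cats", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    # Projected embeddings of size projection_dim.
    text_embeds = model.get_text_features(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )
    image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])

# L2-normalize and score by cosine similarity, as CLIPModel.forward does.
text_embeds = torch.nn.functional.normalize(text_embeds, dim=-1)
image_embeds = torch.nn.functional.normalize(image_embeds, dim=-1)
print(image_embeds @ text_embeds.t())  # shape: (num_images, num_texts)
```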
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "crop_size": 224,
3
+ "do_center_crop": true,
4
+ "do_normalize": true,
5
+ "do_resize": true,
6
+ "feature_extractor_type": "CLIPFeatureExtractor",
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "resample": 3,
18
+ "size": 224
19
+ }
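
These are the standard OpenAI CLIP preprocessing settings: resize with bicubic resampling (`"resample": 3` is PIL's BICUBIC), center crop to 224×224, and normalization with the CLIP mean/std above. As a rough equivalence, here is a sketch of the same transform written with torchvision; this is an illustration only, the actual pipeline used at load time is the `CLIPFeatureExtractor` declared in the config.

```python
from PIL import Image
from torchvision import transforms

# Mirrors preprocessor_config.json: bicubic resize -> 224 center crop -> normalize.
clip_transform = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.48145466, 0.4578275, 0.40821073],
        std=[0.26862954, 0.26130258, 0.27577711],
    ),
])

# "example.jpg" is a placeholder input image.
pixel_values = clip_transform(Image.open("example.jpg").convert("RGB")).unsqueeze(0)
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```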
special_tokens_map.json ADDED
@@ -0,0 +1 @@
1
+ {"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "unk_token": {
3
+ "content": "<|endoftext|>",
4
+ "single_word": false,
5
+ "lstrip": false,
6
+ "rstrip": false,
7
+ "normalized": true,
8
+ "__type": "AddedToken"
9
+ },
10
+ "bos_token": {
11
+ "content": "<|startoftext|>",
12
+ "single_word": false,
13
+ "lstrip": false,
14
+ "rstrip": false,
15
+ "normalized": true,
16
+ "__type": "AddedToken"
17
+ },
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "single_word": false,
21
+ "lstrip": false,
22
+ "rstrip": false,
23
+ "normalized": true,
24
+ "__type": "AddedToken"
25
+ },
26
+ "pad_token": "<|endoftext|>",
27
+ "errors": "replace",
28
+ "add_prefix_space": false,
29
+ "do_lower_case": true,
30
+ "name_or_path": "openai/clip-vit-base-patch16",
31
+ "model_max_length": 77,
32
+ "special_tokens_map_file": "./special_tokens_map.json",
33
+ "tokenizer_class": "CLIPTokenizer"
34
+ }
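
The tokenizer is a standard `CLIPTokenizer` with lower-casing enabled, a `model_max_length` of 77, and `<|endoftext|>` serving as EOS, UNK, and padding token. A minimal loading sketch, again with a placeholder repository id:

```python
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("JUNJIE99/MMRet-CLIP")  # placeholder repo id

batch = tokenizer(
    ["a photo of a cat", "a photo of a dog"],
    padding=True,        # pads with <|endoftext|>, the declared pad_token
    truncation=True,     # truncates to model_max_length = 77
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # (2, sequence_length)
```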
vocab.json ADDED
The diff for this file is too large to render. See raw diff