Mitsua committed on
Commit f195a50
1 Parent(s): 45c524d

Upload 15 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ artic_2024_attribution.csv filter=lfs diff=lfs merge=lfs -text
+ localized_narratives_attributon.csv filter=lfs diff=lfs merge=lfs -text
+ MET_2024_attribution.csv filter=lfs diff=lfs merge=lfs -text
+ Smithsonian_2024_attribution.csv filter=lfs diff=lfs merge=lfs -text
CMA_2024_attribution.csv ADDED
The diff for this file is too large to render. See raw diff
 
MET_2024_attribution.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:996459d4752d9766104e0647a56910f930246c944c4a7714f827cc3659359932
+ size 48452462
Smithsonian_2024_attribution.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63b16517b224aba2e37513077dd3d898f78a45e5fc253fd43316a31afd31e461
+ size 29680177
artic_2024_attribution.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:099c0ab6bc7de4ded56a24ccc371fc1b2dafc3fad1f4977658ca6cc8a74859e2
+ size 13484393
commons_ccpd_attribution_likes_CLIP.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:becfdfb89f2ba166154cb4816480fca7a775fed5dcbc44a44ed12d594a3abd4e
+ size 1240171963
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "Mitsua/mitsua-japanese-clip-vit-b-16",
+   "architectures": [
+     "MitsuaJapaneseCLIPModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_mitsua_japanese_clip.MitsuaJapaneseCLIPConfig",
+     "AutoModel": "modeling_mitsua_japanese_clip.MitsuaJapaneseCLIPModel"
+   },
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "mitsua_japanese_clip",
+   "projection_dim": 768,
+   "text_config": {
+     "bos_token_id": -1,
+     "eos_token_id": 1,
+     "hidden_act": "gelu",
+     "layer_norm_eps": 1e-05,
+     "model_type": "siglip_text_model",
+     "vocab_size": 64000
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "vision_config": {
+     "hidden_act": "gelu",
+     "layer_norm_eps": 1e-06,
+     "model_type": "clip_vision_model",
+     "patch_size": 16,
+     "projection_dim": 768
+   }
+ }
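
The "auto_map" entries above point AutoConfig and AutoModel at the custom classes added in this commit, so the model loads with trust_remote_code=True. A minimal loading sketch, assuming these files are published under the repo id given in "_name_or_path":

```python
from transformers import AutoConfig, AutoModel

repo_id = "Mitsua/mitsua-japanese-clip-vit-b-16"  # from "_name_or_path" above

# trust_remote_code is required because the config/model classes come from
# configuration_mitsua_japanese_clip.py / modeling_mitsua_japanese_clip.py in this repo.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.projection_dim)  # mitsua_japanese_clip 768

model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)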
configuration_mitsua_japanese_clip.py ADDED
@@ -0,0 +1,56 @@
+ # coding=utf-8
+ # Copyright 2024 The HuggingFace Inc. team. + Abstract Engine. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from transformers import PretrainedConfig, CLIPVisionConfig, SiglipTextConfig
+
+ class MitsuaJapaneseCLIPConfig(PretrainedConfig):
+     model_type = "mitsua_japanese_clip"
+
+     def __init__(
+         self,
+         text_config=None, vision_config=None,
+         projection_dim=512,
+         logit_scale_init_value=2.6592,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         if text_config is None:
+             text_config = {}
+         if vision_config is None:
+             vision_config = {}
+
+         self.vision_config = CLIPVisionConfig(**vision_config)
+         self.text_config = SiglipTextConfig(**text_config)
+
+         self.projection_dim = projection_dim
+         self.logit_scale_init_value = logit_scale_init_value
+         self.initializer_factor = 1.0
+
+
+     @classmethod
+     def from_vision_text_configs(
+         cls, vision_config: PretrainedConfig, text_config: PretrainedConfig, **kwargs
+     ):
+         r"""
+         Instantiate a [`MitsuaJapaneseCLIPConfig`] (or a derived class) from a text model configuration and a vision
+         model configuration.
+         Returns:
+             [`MitsuaJapaneseCLIPConfig`]: An instance of a configuration object
+         """
+
+         return cls(
+             vision_config=vision_config.to_dict(),
+             text_config=text_config.to_dict(),
+             **kwargs,
+         )
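
For illustration, a sketch of how from_vision_text_configs composes the two sub-configs. The hyperparameter values passed here are placeholders, and the import assumes the script runs next to configuration_mitsua_japanese_clip.py:

```python
from transformers import CLIPVisionConfig, SiglipTextConfig
from configuration_mitsua_japanese_clip import MitsuaJapaneseCLIPConfig

vision_config = CLIPVisionConfig(patch_size=16, hidden_act="gelu")   # illustrative values
text_config = SiglipTextConfig(vocab_size=64000, hidden_act="gelu")  # illustrative values

# Each sub-config is serialized to a dict and re-instantiated inside __init__
config = MitsuaJapaneseCLIPConfig.from_vision_text_configs(
    vision_config, text_config, projection_dim=768
)
print(config.projection_dim)          # 768
print(config.text_config.vocab_size)  # 64000
```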
localized_narratives_attributon.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e05a0e2ac79f2ae82cfd6626b096cedac1c4fbc52a971929be074c3ae30a4868
+ size 78171033
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d77b26e0b9fe5972301abeeaf8d18872304d1308599d5aa72cd50315d55645ac
+ size 884995484
modeling_mitsua_japanese_clip.py ADDED
@@ -0,0 +1,217 @@
+ # coding=utf-8
+ # Copyright 2024 The HuggingFace Inc. team. + Abstract Engine. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from torch import nn
+ from transformers import CLIPPreTrainedModel, CLIPVisionConfig, CLIPVisionModel, SiglipTextConfig, SiglipTextModel
+ from transformers.models.clip.modeling_clip import CLIPOutput, clip_loss
+ from .configuration_mitsua_japanese_clip import MitsuaJapaneseCLIPConfig
+
+ class MitsuaJapaneseCLIPModel(CLIPPreTrainedModel):
+     config_class = MitsuaJapaneseCLIPConfig
+     def __init__(self, config: MitsuaJapaneseCLIPConfig):
+         CLIPPreTrainedModel.__init__(self, config)
+
+         if not isinstance(config.text_config, SiglipTextConfig):
+             raise TypeError(
+                 "config.text_config is expected to be of type SiglipTextConfig but is of type"
+                 f" {type(config.text_config)}."
+             )
+
+         if not isinstance(config.vision_config, CLIPVisionConfig):
+             raise TypeError(
+                 "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                 f" {type(config.vision_config)}."
+             )
+
+         text_config = config.text_config
+         vision_config = config.vision_config
+
+         self.projection_dim = config.projection_dim
+         self.text_embed_dim = text_config.hidden_size
+         self.vision_embed_dim = vision_config.hidden_size
+
+         text_model = SiglipTextModel._from_config(text_config, attn_implementation=config._attn_implementation)
+         self.text_model = text_model.text_model
+
+         vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
+         self.vision_model = vision_model.vision_model
+
+         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+         self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_text_features(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> torch.FloatTensor:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         text_outputs = self.text_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = text_outputs[1]
+         return pooled_output
+
+     def get_image_features(
+         self,
+         pixel_values: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> torch.FloatTensor:
+         r"""
+         Returns:
+             image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+             applying the projection layer to the pooled output of [`CLIPVisionModel`].
+         Examples:
+         ```python
+         >>> from PIL import Image
+         >>> import requests
+         >>> from transformers import AutoProcessor, CLIPModel
+         >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+         >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+         >>> inputs = processor(images=image, return_tensors="pt")
+         >>> image_features = model.get_image_features(**inputs)
+         ```"""
+         # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = (
+             output_hidden_states
+             if output_hidden_states is not None
+             else self.config.output_hidden_states
+         )
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         vision_outputs = self.vision_model(
+             pixel_values=pixel_values,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = vision_outputs[1]
+         image_features = self.visual_projection(pooled_output)
+
+         return image_features
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         pixel_values: Optional[torch.FloatTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         return_loss: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, CLIPOutput]:
+         # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = (
+             output_hidden_states
+             if output_hidden_states is not None
+             else self.config.output_hidden_states
+         )
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         vision_outputs = self.vision_model(
+             pixel_values=pixel_values,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         text_outputs = self.text_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         image_embeds = vision_outputs[1]
+         image_embeds = self.visual_projection(image_embeds)
+
+         text_embeds = text_outputs[1]
+
+         # normalized features
+         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+         # cosine similarity as logits
+         logit_scale = self.logit_scale.exp()
+         logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) * logit_scale.to(
+             text_embeds.device
+         )
+         logits_per_image = logits_per_text.t()
+
+         loss = None
+         if return_loss:
+             loss = clip_loss(logits_per_text)
+
+         if not return_dict:
+             output = (
+                 logits_per_image,
+                 logits_per_text,
+                 text_embeds,
+                 image_embeds,
+                 text_outputs,
+                 vision_outputs,
+             )
+             return ((loss,) + output) if loss is not None else output
+
+         return CLIPOutput(
+             loss=loss,
+             logits_per_image=logits_per_image,
+             logits_per_text=logits_per_text,
+             text_embeds=text_embeds,
+             image_embeds=image_embeds,
+             text_model_output=text_outputs,
+             vision_model_output=vision_outputs,
+         )
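
Putting the pieces together, a minimal inference sketch. It assumes this commit is published as Mitsua/mitsua-japanese-clip-vit-b-16; the image URL is the one from the docstring above, the Japanese captions are placeholders, and the image processor and tokenizer are loaded separately rather than through a combined processor, since this commit does not show one:

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

repo_id = "Mitsua/mitsua-japanese-clip-vit-b-16"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(repo_id)  # SiglipImageProcessor per preprocessor_config.json
tokenizer = AutoTokenizer.from_pretrained(repo_id)             # SiglipTokenizer per tokenizer_config.json

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = ["猫の写真", "犬の写真"]  # placeholder captions: "a photo of cats" / "a photo of a dog"

with torch.no_grad():
    image_embeds = model.get_image_features(**image_processor(images=image, return_tensors="pt"))
    text_embeds = model.get_text_features(
        **tokenizer(texts, padding="max_length", max_length=64, return_tensors="pt")
    )

# L2-normalize and take cosine similarity, mirroring forward() above
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
print(text_embeds @ image_embeds.T)  # one similarity score per caption
```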
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "SiglipImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "SiglipProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 224,
+     "width": 224
+   }
+ }
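
In other words, images are resized to 224x224 with bicubic resampling, rescaled by 1/255, and normalized with mean and std 0.5, mapping pixels to roughly [-1, 1]. A rough torchvision equivalent, as a sketch only since the repo itself uses SiglipImageProcessor (minor resizing differences between backends are possible):

```python
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC),  # "resample": 3 is PIL bicubic
    transforms.ToTensor(),                                            # rescales by 1/255 ("rescale_factor")
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # "image_mean" / "image_std"
])
```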
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": true,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": true,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": true,
+     "single_word": false
+   }
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2815e9a0834e5791c9b6e91c41e39d21d2823041da5c5896da9cae98eff4320b
+ size 1495058
stair_captions_attribution.csv ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "added_tokens_decoder": {
+     "1": {
+       "content": "</s>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<unk>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": "</s>",
+   "model_input_names": [
+     "input_ids"
+   ],
+   "model_max_length": 64,
+   "pad_token": "</s>",
+   "processor_class": "SiglipProcessor",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "SiglipTokenizer",
+   "unk_token": "<unk>"
+ }
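
A small tokenization check based on this config (same repo id assumption as in the sketches above); note that "model_input_names" is limited to input_ids and that </s> doubles as the pad token:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Mitsua/mitsua-japanese-clip-vit-b-16")
enc = tokenizer("浮世絵の山の風景", padding="max_length", max_length=64, return_tensors="pt")
print(list(enc.keys()))        # ['input_ids'] only, per "model_input_names"
print(tokenizer.eos_token)     # "</s>", also used for padding
print(enc["input_ids"].shape)  # (1, 64), padded to "model_max_length"
```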