VictorSanh committed on
Commit
b54299a
1 Parent(s): fb9884b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +94 -0
README.md CHANGED
@@ -1,3 +1,97 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ Same as https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2 with two changes:
6
+ - increase max resolution to 980 x 980 (instead of 384 x 384) by interpolating the position embeddings
7
+ - implement the strategy in [NaViT](https://arxiv.org/abs/2307.06304) to allow (a) variable-resolution images and (b) aspect-ratio-preserved images
8
+
9
+ These changes only apply to the vision tower. No changes to the text tower.
10
+ The implementation is fully backward compatible with `https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2`: to get the original behavior, simply don't specify the `patch_attention_mask` argument.
11
+
12
+
13
+ Usage:
14
+ ```python
15
+ import torch
16
+ from modeling_siglip import SiglipVisionModel
17
+
18
+ DEVICE = torch.device("cuda:0")
19
+ PATCH_SIZE = 14
20
+
21
+ pixel_values = torch.randn(2, 3, 28, 42, dtype=torch.bfloat16, device=DEVICE)
22
+ pixel_attention_mask = [
23
+ [
24
+ [1] * 14 + [1] * 14 + [1] * 14,
25
+ [1] * 14 + [1] * 14 + [1] * 14,
26
+ [1] * 14 + [1] * 14 + [1] * 14,
27
+ [1] * 14 + [1] * 14 + [1] * 14,
28
+ [1] * 14 + [1] * 14 + [1] * 14,
29
+ [1] * 14 + [1] * 14 + [1] * 14,
30
+ [1] * 14 + [1] * 14 + [1] * 14,
31
+ [1] * 14 + [1] * 14 + [1] * 14,
32
+ [1] * 14 + [1] * 14 + [1] * 14,
33
+ [1] * 14 + [1] * 14 + [1] * 14,
34
+ [1] * 14 + [1] * 14 + [1] * 14,
35
+ [1] * 14 + [1] * 14 + [1] * 14,
36
+ [1] * 14 + [1] * 14 + [1] * 14,
37
+ [1] * 14 + [1] * 14 + [1] * 14,
38
+
39
+ [0] * 14 + [0] * 14 + [0] * 14,
40
+ [0] * 14 + [0] * 14 + [0] * 14,
41
+ [0] * 14 + [0] * 14 + [0] * 14,
42
+ [0] * 14 + [0] * 14 + [0] * 14,
43
+ [0] * 14 + [0] * 14 + [0] * 14,
44
+ [0] * 14 + [0] * 14 + [0] * 14,
45
+ [0] * 14 + [0] * 14 + [0] * 14,
46
+ [0] * 14 + [0] * 14 + [0] * 14,
47
+ [0] * 14 + [0] * 14 + [0] * 14,
48
+ [0] * 14 + [0] * 14 + [0] * 14,
49
+ [0] * 14 + [0] * 14 + [0] * 14,
50
+ [0] * 14 + [0] * 14 + [0] * 14,
51
+ [0] * 14 + [0] * 14 + [0] * 14,
52
+ [0] * 14 + [0] * 14 + [0] * 14,
53
+ ],
54
+ [
55
+ [1] * 14 + [1] * 14 + [0] * 14,
56
+ [1] * 14 + [1] * 14 + [0] * 14,
57
+ [1] * 14 + [1] * 14 + [0] * 14,
58
+ [1] * 14 + [1] * 14 + [0] * 14,
59
+ [1] * 14 + [1] * 14 + [0] * 14,
60
+ [1] * 14 + [1] * 14 + [0] * 14,
61
+ [1] * 14 + [1] * 14 + [0] * 14,
62
+ [1] * 14 + [1] * 14 + [0] * 14,
63
+ [1] * 14 + [1] * 14 + [0] * 14,
64
+ [1] * 14 + [1] * 14 + [0] * 14,
65
+ [1] * 14 + [1] * 14 + [0] * 14,
66
+ [1] * 14 + [1] * 14 + [0] * 14,
67
+ [1] * 14 + [1] * 14 + [0] * 14,
68
+ [1] * 14 + [1] * 14 + [0] * 14,
69
+
70
+ [1] * 14 + [1] * 14 + [0] * 14,
71
+ [1] * 14 + [1] * 14 + [0] * 14,
72
+ [1] * 14 + [1] * 14 + [0] * 14,
73
+ [1] * 14 + [1] * 14 + [0] * 14,
74
+ [1] * 14 + [1] * 14 + [0] * 14,
75
+ [1] * 14 + [1] * 14 + [0] * 14,
76
+ [1] * 14 + [1] * 14 + [0] * 14,
77
+ [1] * 14 + [1] * 14 + [0] * 14,
78
+ [1] * 14 + [1] * 14 + [0] * 14,
79
+ [1] * 14 + [1] * 14 + [0] * 14,
80
+ [1] * 14 + [1] * 14 + [0] * 14,
81
+ [1] * 14 + [1] * 14 + [0] * 14,
82
+ [1] * 14 + [1] * 14 + [0] * 14,
83
+ [1] * 14 + [1] * 14 + [0] * 14,
84
+ ],
85
+ ]
86
+ pixel_attention_mask = torch.tensor(pixel_attention_mask, dtype=torch.bool, device=DEVICE)
87
+ patches_subgrid = pixel_attention_mask.unfold(
88
+ dimension=1, size=PATCH_SIZE, step=PATCH_SIZE
89
+ ).unfold(dimension=2, size=PATCH_SIZE, step=PATCH_SIZE)
90
+ patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
91
+
92
+ model = SiglipVisionModel.from_pretrained("HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit", _flash_attn_2_enabled=True)
93
+ model.train()
94
+ model.vision_model.to(DEVICE, dtype=torch.bfloat16)
95
+
96
+ output = model.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
97
+ ```