Upload README.md
Browse files
README.md
CHANGED
@@ -1,3 +1,204 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
+
language:
|
4 |
+
- en
|
5 |
+
- it
|
6 |
+
- fr
|
7 |
+
- de
|
8 |
+
- es
|
9 |
+
base_model:
|
10 |
+
- MrLight/dse-qwen2-2b-mrl-v1
|
11 |
---
|
12 |
+
|
13 |
+
# mcdse-2b-v1
|
14 |
+
|
15 |
+
![](cover.png)
|
16 |
+
|
17 |
+
mcdse-2b-v1 is an experimental model designed for multilingual visual document retrieval.
|
18 |
+
|
19 |
+
This model allows you to embed page/slide screenshots and query them using natural language. Whether it's tables, graphs, charts, schemas, images, or text, mcdse-2b-v1 encodes everything into a single embedding vector, eliminating the need for traditional OCR, document layout analysis, reading order detection, chunking, table/formula extraction...
|
20 |
+
|
21 |
+
- **Understands 🇮🇹 Italian, 🇪🇸 Spanish, 🇬🇧 English, 🇫🇷 French and 🇩🇪 German**
|
22 |
+
|
23 |
+
- **Matryoshka Representation Learning:** shrink embeddings from 1536 to 256 dimensions while maintaining 95% of the quality. A 6x reduction with negligible impact on performance!
|
24 |
+
|
25 |
+
- **Top-tier Binarization**: 768-dimensional binary vectors retain 99% of the retrieval quality of the original 1536-dimensional float vectors. With binary vectors, you can encode **100 million multilingual pages in just 10GB**.
|
26 |
+
|
27 |
+
- **Fast vLLM inference:** run inference on vLLM and efficiently serve embeddings at scale, production ready.
|
28 |
+
|
29 |
+
For more information about this model or how it was trained, visit the [announcement blogpost](https://huggingface.co/blog/marco/announcing-mcdse-2b-v1).
|
30 |
+
|
31 |
+
## Usage
|
32 |
+
|
33 |
+
**Initialize model and processor**
|
34 |
+
```python
|
35 |
+
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
|
36 |
+
from PIL import Image
|
37 |
+
import torch
|
38 |
+
import math
|
39 |
+
|
40 |
+
# Load the checkpoint in bfloat16 with FlashAttention-2 on the first CUDA
# device and switch it to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "marco/mcdse-2b-v1",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
).eval()

# Pixel budget handed to the processor, written as multiples of a 28x28 area.
# The same two values are read by smart_resize() in the document snippet.
min_pixels = 1 * 28 * 28
max_pixels = 960 * 28 * 28

processor = AutoProcessor.from_pretrained(
    "marco/mcdse-2b-v1",
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)

# Pad on the left. NOTE(review): presumably so that position -1, which the
# encode functions read the embedding from, is never a pad token - confirm.
model.padding_side = "left"
processor.tokenizer.padding_side = "left"

# Prompt wrapped around every document image.
document_prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n"
    "<|endoftext|>"
)

# Prompt wrapped around every query; ``%s`` is replaced by the query text.
query_prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n"
    "<|endoftext|>"
)
|
62 |
+
```
|
63 |
+
|
64 |
+
**Encode queries**
|
65 |
+
```python
|
66 |
+
def encode_queries(queries: list[str], dimension: int):
    """Embed text queries into L2-normalized vectors of size ``dimension``.

    Each query is substituted into ``query_prompt`` and paired with a blank
    56x56 RGB placeholder image, since the prompt template contains an image
    slot.

    Args:
        queries: Query strings to embed.
        dimension: Number of leading embedding components to keep.

    Returns:
        The last layer's hidden state at the final sequence position,
        truncated to the first ``dimension`` components and L2-normalized
        along the last axis.
    """
    dummy_image = Image.new('RGB', (56, 56))
    inputs = processor(
        text=[query_prompt % x for x in queries],
        images=[dummy_image for _ in queries],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to('cuda:0')  # same device the model is loaded on in the setup snippet

    cache_position = torch.arange(0, len(queries))
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)

    with torch.no_grad():
        # This is a plain function, not a method: call the module-level
        # ``model`` directly (``self`` is not defined here).
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )

    # Last layer, last position; left padding is configured in the setup
    # snippet.
    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
|
89 |
+
```
|
90 |
+
|
91 |
+
**Encode documents**
|
92 |
+
```python
|
93 |
+
def round_by_factor(number: float, factor: int) -> int:
    """Return the multiple of ``factor`` nearest to ``number``.

    Uses the built-in ``round``, so exact halves go to the even multiple
    (e.g. 1.5 factors -> 2 factors, 2.5 factors -> 2 factors).
    """
    return round(number / factor) * factor
|
95 |
+
|
96 |
+
def ceil_by_factor(number: float, factor: int) -> int:
    """Return the smallest multiple of ``factor`` that is >= ``number``."""
    return math.ceil(number / factor) * factor
|
98 |
+
|
99 |
+
def floor_by_factor(number: float, factor: int) -> int:
    """Return the largest multiple of ``factor`` that is <= ``number``."""
    return math.floor(number / factor) * factor
|
101 |
+
|
102 |
+
def smart_resize(height: int, width: int) -> tuple[int, int]:
    """Compute a target size whose sides are multiples of 28.

    Both sides are first rounded to the nearest multiple of 28 (at least 28).
    If the resulting area exceeds ``max_pixels`` the sides are scaled down
    by a common factor; if it is below ``min_pixels`` they are scaled up.

    Args:
        height: Source image height in pixels.
        width: Source image width in pixels.

    Returns:
        ``(height, width)`` of the target size - note the order, which is
        the reverse of the ``(width, height)`` order PIL uses.
    """
    h_bar = max(28, round_by_factor(height, 28))
    w_bar = max(28, round_by_factor(width, 28))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Clamp to one 28-pixel unit: for extreme aspect ratios the short
        # side divided by beta can fall below 28, and flooring would then
        # yield a zero-sized dimension.
        h_bar = max(28, floor_by_factor(height / beta, 28))
        w_bar = max(28, floor_by_factor(width / beta, 28))
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, 28)
        w_bar = ceil_by_factor(width * beta, 28)
    return h_bar, w_bar
|
114 |
+
|
115 |
+
def resize(image: Image.Image):
    """Return ``image`` resized to the dimensions chosen by ``smart_resize``.

    ``smart_resize`` returns ``(height, width)`` while ``Image.resize``
    expects ``(width, height)``, so the pair is unpacked and reordered;
    passing it through unchanged would swap the sides of non-square pages.
    """
    new_height, new_width = smart_resize(image.height, image.width)
    return image.resize((new_width, new_height))
|
118 |
+
|
119 |
+
def encode_documents(documents: list[Image.Image], dimension: int):
    """Embed page images into L2-normalized vectors of size ``dimension``.

    Every image is passed through ``resize`` and paired with the fixed
    ``document_prompt``.

    Args:
        documents: Page/slide images to embed.
        dimension: Number of leading embedding components to keep.

    Returns:
        The last layer's hidden state at the final sequence position,
        truncated to the first ``dimension`` components and L2-normalized
        along the last axis.
    """
    inputs = processor(
        text=[document_prompt] * len(documents),
        images=[resize(x) for x in documents],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to('cuda:0')  # same device the model is loaded on in the setup snippet

    # Sized from this function's own argument; ``queries`` is not defined
    # in this scope.
    cache_position = torch.arange(0, len(documents))
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)

    with torch.no_grad():
        # This is a plain function, not a method: call the module-level
        # ``model`` directly (``self`` is not defined here).
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )

    # Last layer, last position; left padding is configured in the setup
    # snippet.
    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
|
141 |
+
```
|
142 |
+
|
143 |
+
### vLLM
|
144 |
+
This model supports vLLM; see the [announcement blogpost](https://huggingface.co/blog/marco/announcing-mcdse-2b-v1#deployment) to learn more.
|
145 |
+
|
146 |
+
## Results
|
147 |
+
Given the scarcity of publicly available datasets for multilingual document image retrieval, the model has been evaluated using a custom-built dataset. This eval dataset was specifically designed to benchmark the model's performance across various languages.
|
148 |
+
|
149 |
+
### NDCG@5 (float)
|
150 |
+
| | Average | English | Italian | Spanish | French | German |
|
151 |
+
|---------------------|------------|------------|------------|------------|------------|------------|
|
152 |
+
| **1536 dimensions** | | | | | | |
|
153 |
+
| dse-qwen2-2b-mrl-v1 | 79.5 | 79.2 | 80.2 | 77.9 | 80.6 | 79.6 |
|
154 |
+
| mcdse-2b-v1 | **82.2** | **80.8** | **81.2** | **80.7** | **84.5** | **83.8** |
|
155 |
+
| | **+3.28%** | **+1.98%** | **+1.23%** | **+3.47%** | **+4.62%** | **+5.01%** |
|
156 |
+
| **1024 dimensions** | | | | | | |
|
157 |
+
| dse-qwen2-2b-mrl-v1 | 78.3 | 78.8 | 78.5 | 76.5 | 80 | 77.5 |
|
158 |
+
| mcdse-2b-v1 | **81.7** | **80** | **80.2** | **80.1** | **84** | **84.3** |
|
159 |
+
| | **+4.23%** | **+1.75%** | **+2.12%** | **+4.49%** | **+4.76%** | **+8.07%** |
|
160 |
+
| **768 dimensions** | | | | | | |
|
161 |
+
| dse-qwen2-2b-mrl-v1 | 77.8 | 78.4 | 78.3 | 75.6 | 80.8 | 75.9 |
|
162 |
+
| mcdse-2b-v1 | **81.1** | **79.6** | **79.9** | **79.2** | **83.3** | **83.3** |
|
163 |
+
| | **+4.02%** | **+1.51%** | **+2.00%** | **+4.55%** | **+3.00%** | **+8.88%** |
|
164 |
+
| **512 dimensions** | | | | | | |
|
165 |
+
| dse-qwen2-2b-mrl-v1 | 76.2 | 77.6 | 75.9 | 73.1 | 79.2 | 75.2 |
|
166 |
+
| mcdse-2b-v1 | **79.3** | **78.5** | **79.1** | **75.8** | **81.4** | **81.7** |
|
167 |
+
| | **+3.91%** | **+1.15%** | **+4.05%** | **+3.56%** | **+2.70%** | **+7.96%** |
|
168 |
+
| **384 dimensions** | | | | | | |
|
169 |
+
| dse-qwen2-2b-mrl-v1 | 75.7 | 76.2 | 75.5 | 74.6 | 78.4 | 74 |
|
170 |
+
| mcdse-2b-v1 | **78.8** | **77.5** | **78.5** | **76.1** | **80.4** | **81.4** |
|
171 |
+
| | **+3.86%** | **+1.68%** | **+3.82%** | **+1.97%** | **+2.49%** | **+9.09%** |
|
172 |
+
| **256 dimensions** | | | | | | |
|
173 |
+
| dse-qwen2-2b-mrl-v1 | 73.5 | 74.5 | 73.6 | 70.6 | 74.8 | 73.8 |
|
174 |
+
| mcdse-2b-v1 | **78.1** | **78.5** | **77.6** | **76.2** | **80.1** | **77.9** |
|
175 |
+
| | **+5.89%** | **+5.10%** | **+5.15%** | **+7.35%** | **+6.62%** | **+5.26%** |
|
176 |
+
|
177 |
+
### NDCG@5 (binary)
|
178 |
+
| | Average | English | Italian | Spanish | French | German |
|
179 |
+
|---------------------|-------------|-------------|-------------|-------------|-------------|-------------|
|
180 |
+
| **1536 dimensions** | | | | | | |
|
181 |
+
| dse-qwen2-2b-mrl-v1 | 75.0 | 75.8 | 75.4 | 72.4 | 78.1 | 73.2 |
|
182 |
+
| mcdse-2b-v1 | **80.6** | **79.5** | **76.9** | **81.9** | **83.7** | **80.8** |
|
183 |
+
| | **+6.93%** | **+4.65%** | **+1.95%** | **+11.60%** | **+6.69%** | **+9.41%** |
|
184 |
+
| **1024 dimensions** | | | | | | |
|
185 |
+
| dse-qwen2-2b-mrl-v1 | 72.2 | 74.8 | 71 | 70.8 | 74.6 | 69.6 |
|
186 |
+
| mcdse-2b-v1 | **79.3** | **78.4** | **75.4** | **80.8** | **82.6** | **79.5** |
|
187 |
+
| | **+9.05%** | **+4.59%** | **+5.84%** | **+12.38%** | **+9.69%** | **+12.45%** |
|
188 |
+
| **768 dimensions** | | | | | | |
|
189 |
+
| dse-qwen2-2b-mrl-v1 | 70.1 | 71.7 | 69.3 | 69.8 | 73.7 | 65.9 |
|
190 |
+
| mcdse-2b-v1 | **78.8** | **77.1** | **75.4** | **80** | **83** | **78.5** |
|
191 |
+
| | **+11.07%** | **+7.00%** | **+8.09%** | **+12.75%** | **+11.20%** | **+16.05%** |
|
192 |
+
| **512 dimensions** | | | | | | |
|
193 |
+
| dse-qwen2-2b-mrl-v1 | 66.5 | 70 | 65.4 | 63.7 | 70.2 | 63 |
|
194 |
+
| mcdse-2b-v1 | **76.6** | **74.8** | **74.2** | **77.7** | **80.9** | **75.3** |
|
195 |
+
| | **+13.21%** | **+6.42%** | **+11.86%** | **+18.02%** | **+13.23%** | **+16.33%** |
|
196 |
+
| **384 dimensions** | | | | | | |
|
197 |
+
| dse-qwen2-2b-mrl-v1 | 61.1 | 62.7 | 58.5 | 58.6 | 65.1 | 60.8 |
|
198 |
+
| mcdse-2b-v1 | **74.3** | **74.5** | **71.4** | **77.2** | **75.2** | **73** |
|
199 |
+
| | **+17.67%** | **+15.84%** | **+18.07%** | **+24.09%** | **+13.43%** | **+16.71%** |
|
200 |
+
| **256 dimensions** | | | | | | |
|
201 |
+
| dse-qwen2-2b-mrl-v1 | 54.3 | 59 | 56.5 | 53.6 | 53 | 49.6 |
|
202 |
+
| mcdse-2b-v1 | **70.9** | **72.6** | **66.4** | **73.5** | **72.6** | **69.2** |
|
203 |
+
| | **+23.31%** | **+18.73%** | **+14.91%** | **+27.07%** | **+27.00%** | **+28.32%** |
|
204 |
+
|