Update README.md
README.md CHANGED
@@ -29,20 +29,7 @@ For example, DSE-QWen2-2b-MRL-V1 achieves **85.8** nDCG@5 on [ViDoRE](https://hu


## Note:
-
-1. clone latest transformers, `git clone https://github.com/huggingface/transformers.git`
-2. Fix a bug in `transformers/models/qwen2_vl/modeling_qwen2_vl.py` around line 1774
-```
-position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
-# change the if statement below to if cache_position is not None and cache_position[0] != 0:
-if cache_position[0] != 0:
-    pixel_values = None
-    pixel_values_videos = None
-```
-3. Install latest transformers from source `pip install -e .`
-4. `pip install qwen-vl-utils`
-
-> QWen vision encoder may take high GPU memory if the input image is large. Adjust `'resized_height':680 , 'resized_width':680` (see below) to fit VRAM based on GPU resources.
+QWen vision encoder may take high GPU memory if the input image is large. Adjust `'resized_height':680 , 'resized_width':680` (see below) to fit VRAM based on GPU resources.

## How to Use the Model

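For readers looking only at this diff: the `resized_height` / `resized_width` values mentioned in the note are keys inside the image entries of the messages that later feed `process_vision_info`. A minimal sketch of where they go, assuming the standard qwen-vl-utils message format used elsewhere in this README (the image path and prompt text are placeholders):

```python
from qwen_vl_utils import process_vision_info

# Hypothetical single-page example; 'page_1.png' and the prompt are placeholders.
# Smaller resized_height / resized_width means fewer vision tokens and less VRAM,
# at some cost to retrieval quality.
doc_messages = [[{
    'role': 'user',
    'content': [
        {'type': 'image', 'image': 'page_1.png',
         'resized_height': 680, 'resized_width': 680},  # e.g. drop to 512 x 512 on small GPUs
        {'type': 'text', 'text': 'What is shown in this image?'},
    ],
}]]
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
```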
@@ -96,7 +83,8 @@ query_texts = [
]
query_image_inputs, query_video_inputs = process_vision_info(query_messages)
query_inputs = processor(text=query_texts, images=query_image_inputs, videos=query_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
-
+cache_position = torch.arange(0, len(query_texts))
+query_inputs = model.prepare_inputs_for_generation(**query_inputs, cache_position=cache_position, use_cache=False)
with torch.no_grad():
    output = model(**query_inputs, return_dict=True, output_hidden_states=True)
    query_embeddings = get_embedding(output.hidden_states[-1], 1536) # adjust dimensionality for efficiency trade-off, e.g. 512
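These two `+` lines appear to be what lets this commit drop the old "patch `modeling_qwen2_vl.py` around line 1774" note: an explicit `cache_position` starting at 0 is handed to `prepare_inputs_for_generation`, so the library's `if cache_position[0] != 0` branch neither crashes on `None` nor discards `pixel_values` before the forward pass. `get_embedding` itself is defined earlier in the README and does not appear in this diff; a plausible sketch consistent with its call site here (last-token pooling, Matryoshka truncation to `dimension`, L2 normalization) could look like:

```python
import torch

def get_embedding(last_hidden_state: torch.Tensor, dimension: int) -> torch.Tensor:
    # Sketch only: pool the final token of each sequence, truncate to the
    # requested Matryoshka (MRL) dimension (e.g. 1536 or 512), then L2-normalize.
    reps = last_hidden_state[:, -1]    # (batch, hidden_size)
    reps = reps[:, :dimension]         # keep the first `dimension` features
    return torch.nn.functional.normalize(reps, p=2, dim=-1)
```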
@@ -138,7 +126,8 @@ doc_texts = [
]
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
-
+cache_position = torch.arange(0, len(doc_texts))
+doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, cache_position=cache_position, use_cache=False)
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
with torch.no_grad():
    output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
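One quirk visible in the context lines above: the document forward pass runs twice, once outside `torch.no_grad()` and once inside it, so the first call builds an unused autograd graph. A leaner equivalent, with the `doc_embeddings` line assumed by analogy with the query side since it falls outside the hunk, would be:

```python
# Single forward pass under no_grad; skips the duplicate call shown in the hunk.
with torch.no_grad():
    output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
    doc_embeddings = get_embedding(output.hidden_states[-1], 1536)  # assumed, mirrors the query side
```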
@@ -184,7 +173,8 @@ doc_texts = [
]
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
-
+cache_position = torch.arange(0, len(doc_texts))
+doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, cache_position=cache_position, use_cache=False)
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
with torch.no_grad():
    output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
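The hunks end before the scoring step, but once embeddings are L2-normalized (as in the `get_embedding` sketch above) retrieval reduces to a dot product. A minimal sketch, with `query_embeddings` and `doc_embeddings` assumed to come from the surrounding README:

```python
import torch

def score(query_embeddings: torch.Tensor, doc_embeddings: torch.Tensor) -> torch.Tensor:
    # Cosine similarity between every query and every document, assuming both
    # tensors are already L2-normalized: scores[i, j] = relevance of doc j to query i.
    return torch.matmul(query_embeddings, doc_embeddings.transpose(0, 1))

# Usage sketch: scores = score(query_embeddings, doc_embeddings); scores.argmax(dim=1)
```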