MrLight committed
Commit adf727a
1 Parent(s): 0b628fa

Update README.md

Files changed (1):
  1. README.md +7 -17
README.md CHANGED
@@ -29,20 +29,7 @@ For example, DSE-QWen2-2b-MRL-V1 achieves **85.8** nDCG@5 on [ViDoRE](https://hu
 
 
  ## Note:
- The following steps need to be done before running the code:
- 1. clone latest transformers, `git clone https://github.com/huggingface/transformers.git`
- 2. Fix a bug in `transformers/models/qwen2_vl/modeling_qwen2_vl.py` around line 1774
- ```
- position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
- # change the if statement below to if cache_position is not None and cache_position[0] != 0:
- if cache_position[0] != 0:
-     pixel_values = None
-     pixel_values_videos = None
- ```
- 3. Install latest transformers from source `pip install -e .`
- 4. `pip install qwen-vl-utils`
-
- > QWen vision encoder may take high GPU memory if the input image is large. Adjust `'resized_height':680 , 'resized_width':680` (see below) to fit VRAM based on GPU resources.
+ QWen vision encoder may take high GPU memory if the input image is large. Adjust `'resized_height':680 , 'resized_width':680` (see below) to fit VRAM based on GPU resources.
 
  ## How to Use the Model
 
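For context on the resizing note that replaces the old setup steps: the `'resized_height'`/`'resized_width'` values are set per image entry in the chat-style messages that the later hunks pass to `process_vision_info` from `qwen-vl-utils`. A minimal sketch of where they go (the image path and prompt below are placeholders, not taken from the README):

```python
from qwen_vl_utils import process_vision_info

# Sketch: 'resized_height'/'resized_width' are read from the image entry by process_vision_info
# and bound how large the image handed to the Qwen2-VL vision encoder gets.
# Lower both values if the vision encoder runs out of VRAM; 'page.png' is a placeholder path.
doc_messages = [{
    'role': 'user',
    'content': [
        {'type': 'image', 'image': 'page.png', 'resized_height': 680, 'resized_width': 680},
        {'type': 'text', 'text': 'What is shown in this image?'},
    ],
}]
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
```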
@@ -96,7 +83,8 @@ query_texts = [
  ]
  query_image_inputs, query_video_inputs = process_vision_info(query_messages)
  query_inputs = processor(text=query_texts, images=query_image_inputs, videos=query_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
- query_inputs = model.prepare_inputs_for_generation(**query_inputs, use_cache=False)
+ cache_position = torch.arange(0, len(query_texts))
+ query_inputs = model.prepare_inputs_for_generation(**query_inputs, cache_position=cache_position, use_cache=False)
  with torch.no_grad():
      output = model(**query_inputs, return_dict=True, output_hidden_states=True)
      query_embeddings = get_embedding(output.hidden_states[-1], 1536) # adjust dimensionality for efficiency trade-off, e.g. 512
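The added `cache_position` starts at 0, so the `cache_position[0] != 0` branch in `modeling_qwen2_vl.py` is skipped and `pixel_values` is kept, which appears to be why the manual transformers patch removed in the first hunk is no longer needed. The `get_embedding` helper is defined earlier in the README and is not part of this diff; as an assumption only, a sketch of what a DSE-style helper with this signature typically does (last-token pooling, Matryoshka truncation, L2 normalization):

```python
import torch

def get_embedding(last_hidden_state: torch.Tensor, dimension: int) -> torch.Tensor:
    # Sketch (assumed, not taken from this diff): pool the final token's hidden state,
    # keep the first `dimension` features (Matryoshka truncation), and L2-normalize.
    reps = last_hidden_state[:, -1]        # (batch, hidden): last-token pooling
    reps = reps[:, :dimension]             # truncate to the requested dimensionality
    return torch.nn.functional.normalize(reps, p=2, dim=-1)
```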
@@ -138,7 +126,8 @@ doc_texts = [
  ]
  doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
  doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
- doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, use_cache=False)
+ cache_position = torch.arange(0, len(doc_texts))
+ doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, cache_position=cache_position, use_cache=False)
  output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
  with torch.no_grad():
      output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
@@ -184,7 +173,8 @@ doc_texts = [
  ]
  doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
  doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
- doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, use_cache=False)
+ cache_position = torch.arange(0, len(doc_texts))
+ doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, cache_position=cache_position, use_cache=False)
  output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
  with torch.no_grad():
      output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
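After the hunks above, queries and documents end up as L2-normalized vectors of the same dimensionality, so relevance can be scored with a single matrix product. A usage sketch, assuming the `query_embeddings` and `doc_embeddings` tensors produced by the README snippets:

```python
import torch

# With unit-norm embeddings, the dot product equals cosine similarity;
# scores[i, j] is the relevance of document j to query i.
with torch.no_grad():
    scores = torch.matmul(query_embeddings, doc_embeddings.transpose(0, 1))
print(scores)
```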
 