geekyrakshit committed on
Commit
3b25ef5
·
1 Parent(s): 05b69a5

update: docs for colpali and nv-embed-v2

Browse files
medrag_multi_modal/retrieval/colpali_retrieval.py CHANGED
@@ -21,55 +21,6 @@ class CalPaliRetriever(weave.Model):
21
  It can be initialized with a pre-trained model or from a specified W&B artifact. The class
22
  also provides methods to index new data and to predict/retrieve documents based on a query.
23
 
24
- !!! example "Indexing Data"
25
- First you need to install `Byaldi` library by Answer.ai.
26
-
27
- ```bash
28
- uv pip install Byaldi>=0.0.5
29
- ```
30
-
31
- Next, you can index the data by running the following code:
32
-
33
- ```python
34
- import wandb
35
- from medrag_multi_modal.retrieval import CalPaliRetriever
36
-
37
- wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="index")
38
- retriever = CalPaliRetriever()
39
- retriever.index(
40
- data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
41
- weave_dataset_name="grays-anatomy-images:v0",
42
- index_name="grays-anatomy",
43
- )
44
- ```
45
-
46
- !!! example "Retrieving Documents"
47
- First you need to install `Byaldi` library by Answer.ai.
48
-
49
- ```bash
50
- uv pip install Byaldi>=0.0.5
51
- ```
52
-
53
- Next, you can retrieve the documents by running the following code:
54
-
55
- ```python
56
- import weave
57
-
58
- import wandb
59
- from medrag_multi_modal.retrieval import CalPaliRetriever
60
-
61
- weave.init(project_name="ml-colabs/medrag-multi-modal")
62
- retriever = CalPaliRetriever.from_wandb_artifact(
63
- index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
64
- metadata_dataset_name="grays-anatomy-images:v0",
65
- data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
66
- )
67
- retriever.predict(
68
- query="which neurotransmitters convey information between Merkel cells and sensory afferents?",
69
- top_k=3,
70
- )
71
- ```
72
-
73
  Attributes:
74
  model_name (str): The name of the model to be used for retrieval.
75
  """
@@ -98,28 +49,55 @@ class CalPaliRetriever(weave.Model):
98
  if metadata_dataset_name
99
  else None
100
  )
 
 
 
 
101
 
102
- @classmethod
103
- def from_wandb_artifact(
104
- cls,
105
- index_artifact_name: str,
106
- metadata_dataset_name: str,
107
- data_artifact_name: str,
108
- ):
109
- from byaldi import RAGMultiModalModel
110
 
111
- index_artifact_dir = get_wandb_artifact(index_artifact_name, "colpali-index")
112
- data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
113
- docs_retrieval_model = RAGMultiModalModel.from_index(
114
- index_path=os.path.join(index_artifact_dir, "index")
115
- )
116
- return cls(
117
- docs_retrieval_model=docs_retrieval_model,
118
- metadata_dataset_name=metadata_dataset_name,
119
- data_artifact_dir=data_artifact_dir,
120
- )
121
 
122
- def index(self, data_artifact_name: str, weave_dataset_name: str, index_name: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
124
  self._docs_retrieval_model.index(
125
  input_path=data_artifact_dir,
@@ -138,6 +116,76 @@ class CalPaliRetriever(weave.Model):
138
  )
139
  artifact.save()
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  @weave.op()
142
  def predict(self, query: str, top_k: int = 3) -> list[dict[str, Any]]:
143
  """
@@ -147,6 +195,41 @@ class CalPaliRetriever(weave.Model):
147
  This function uses the document retrieval model to search for the most relevant
148
  documents based on the provided query. It returns a list of dictionaries, each
149
  containing the document image, document ID, and the relevance score.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  Args:
152
  query (str): The search query string.
 
21
  It can be initialized with a pre-trained model or from a specified W&B artifact. The class
22
  also provides methods to index new data and to predict/retrieve documents based on a query.
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  Attributes:
25
  model_name (str): The name of the model to be used for retrieval.
26
  """
 
49
  if metadata_dataset_name
50
  else None
51
  )
52
+
53
+ def index(self, data_artifact_name: str, weave_dataset_name: str, index_name: str):
54
+ """
55
+ Indexes a dataset of documents and saves the index as a Weave artifact.
56
 
57
+ This method retrieves a dataset of documents from a W&B artifact using the provided
58
+ data artifact name. It then indexes the documents using the document retrieval model
59
+ and assigns the specified index name. The index is stored locally without storing the
60
+ collection with the index and overwrites any existing index with the same name.
 
 
 
 
61
 
62
+ If a W&B run is active, the method creates a new W&B artifact with the specified
63
+ index name and type "colpali-index". It adds the local index directory to the artifact
64
+ and saves it to W&B, including metadata with the provided Weave dataset name.
65
+
66
+ !!! example "Indexing Data"
67
+ First you need to install `Byaldi` library by Answer.ai.
 
 
 
 
68
 
69
+ ```bash
70
+ uv pip install "Byaldi>=0.0.5"
71
+ ```
72
+
73
+ Next, you can index the data by running the following code:
74
+
75
+ ```python
76
+ import wandb
77
+ from medrag_multi_modal.retrieval import CalPaliRetriever
78
+
79
+ wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="index")
80
+ retriever = CalPaliRetriever()
81
+ retriever.index(
82
+ data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
83
+ weave_dataset_name="grays-anatomy-images:v0",
84
+ index_name="grays-anatomy",
85
+ )
86
+ ```
87
+
88
+ ??? note "Optional Speedup using Flash Attention"
89
+ If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
90
+ installing the `flash-attn` package.
91
+
92
+ ```bash
93
+ uv pip install flash-attn --no-build-isolation
94
+ ```
95
+
96
+ Args:
97
+ data_artifact_name (str): The name of the W&B artifact containing the dataset.
98
+ weave_dataset_name (str): The name of the Weave dataset to include in the artifact metadata.
99
+ index_name (str): The name to assign to the created index.
100
+ """
101
  data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
102
  self._docs_retrieval_model.index(
103
  input_path=data_artifact_dir,
 
116
  )
117
  artifact.save()
118
 
119
+ @classmethod
120
+ def from_wandb_artifact(
121
+ cls,
122
+ index_artifact_name: str,
123
+ metadata_dataset_name: str,
124
+ data_artifact_name: str,
125
+ ):
126
+ """
127
+ Creates an instance of the class from Weights & Biases (wandb) artifacts.
128
+
129
+ This method retrieves the necessary artifacts from wandb to initialize the
130
+ `CalPaliRetriever`. It fetches the index artifact directory and the data artifact
131
+ directory using the provided artifact names. It then loads the document retrieval
132
+ model from the index path within the index artifact directory. Finally, it returns
133
+ an instance of the class initialized with the retrieved document retrieval model,
134
+ metadata dataset name, and data artifact directory.
135
+
136
+ !!! example "Retrieving Documents"
137
+ First you need to install `Byaldi` library by Answer.ai.
138
+
139
+ ```bash
140
+ uv pip install "Byaldi>=0.0.5"
141
+ ```
142
+
143
+ Next, you can retrieve the documents by running the following code:
144
+
145
+ ```python
146
+ import weave
147
+
148
+ import wandb
149
+ from medrag_multi_modal.retrieval import CalPaliRetriever
150
+
151
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
152
+ retriever = CalPaliRetriever.from_wandb_artifact(
153
+ index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
154
+ metadata_dataset_name="grays-anatomy-images:v0",
155
+ data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
156
+ )
157
+ ```
158
+
159
+ ??? note "Optional Speedup using Flash Attention"
160
+ If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
161
+ installing the `flash-attn` package.
162
+
163
+ ```bash
164
+ uv pip install flash-attn --no-build-isolation
165
+ ```
166
+
167
+ Args:
168
+ index_artifact_name (str): The name of the wandb artifact containing the index.
169
+ metadata_dataset_name (str): The name of the dataset containing metadata.
170
+ data_artifact_name (str): The name of the wandb artifact containing the data.
171
+
172
+ Returns:
173
+ An instance of the class initialized with the retrieved document retrieval model,
174
+ metadata dataset name, and data artifact directory.
175
+ """
176
+ from byaldi import RAGMultiModalModel
177
+
178
+ index_artifact_dir = get_wandb_artifact(index_artifact_name, "colpali-index")
179
+ data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
180
+ docs_retrieval_model = RAGMultiModalModel.from_index(
181
+ index_path=os.path.join(index_artifact_dir, "index")
182
+ )
183
+ return cls(
184
+ docs_retrieval_model=docs_retrieval_model,
185
+ metadata_dataset_name=metadata_dataset_name,
186
+ data_artifact_dir=data_artifact_dir,
187
+ )
188
+
189
  @weave.op()
190
  def predict(self, query: str, top_k: int = 3) -> list[dict[str, Any]]:
191
  """
 
195
  This function uses the document retrieval model to search for the most relevant
196
  documents based on the provided query. It returns a list of dictionaries, each
197
  containing the document image, document ID, and the relevance score.
198
+
199
+ !!! example "Retrieving Documents"
200
+ First you need to install `Byaldi` library by Answer.ai.
201
+
202
+ ```bash
203
+ uv pip install "Byaldi>=0.0.5"
204
+ ```
205
+
206
+ Next, you can retrieve the documents by running the following code:
207
+
208
+ ```python
209
+ import weave
210
+
211
+ import wandb
212
+ from medrag_multi_modal.retrieval import CalPaliRetriever
213
+
214
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
215
+ retriever = CalPaliRetriever.from_wandb_artifact(
216
+ index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
217
+ metadata_dataset_name="grays-anatomy-images:v0",
218
+ data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
219
+ )
220
+ retriever.predict(
221
+ query="which neurotransmitters convey information between Merkel cells and sensory afferents?",
222
+ top_k=3,
223
+ )
224
+ ```
225
+
226
+ ??? note "Optional Speedup using Flash Attention"
227
+ If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
228
+ installing the `flash-attn` package.
229
+
230
+ ```bash
231
+ uv pip install flash-attn --no-build-isolation
232
+ ```
233
 
234
  Args:
235
  query (str): The search query string.
medrag_multi_modal/retrieval/nv_embed_2.py CHANGED
@@ -83,6 +83,14 @@ class NVEmbed2Retriever(weave.Model):
83
  index_name="grays-anatomy-nvembed2",
84
  )
85
  ```
 
 
 
 
 
 
 
 
86
 
87
  Args:
88
  chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
@@ -136,6 +144,14 @@ class NVEmbed2Retriever(weave.Model):
136
  index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
137
  )
138
  ```
 
 
 
 
 
 
 
 
139
 
140
  Args:
141
  chunk_dataset_name (str): The name of the Weave dataset containing the text chunks.
@@ -242,6 +258,14 @@ class NVEmbed2Retriever(weave.Model):
242
  )
243
  retriever.predict(query="What are Ribosomes?")
244
  ```
 
 
 
 
 
 
 
 
245
 
246
  Args:
247
  query (str): The input query string to search for relevant chunks.
 
83
  index_name="grays-anatomy-nvembed2",
84
  )
85
  ```
86
+
87
+ ??? note "Optional Speedup using Flash Attention"
88
+ If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
89
+ installing the `flash-attn` package.
90
+
91
+ ```bash
92
+ uv pip install flash-attn --no-build-isolation
93
+ ```
94
 
95
  Args:
96
  chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
 
144
  index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
145
  )
146
  ```
147
+
148
+ ??? note "Optional Speedup using Flash Attention"
149
+ If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
150
+ installing the `flash-attn` package.
151
+
152
+ ```bash
153
+ uv pip install flash-attn --no-build-isolation
154
+ ```
155
 
156
  Args:
157
  chunk_dataset_name (str): The name of the Weave dataset containing the text chunks.
 
258
  )
259
  retriever.predict(query="What are Ribosomes?")
260
  ```
261
+
262
+ ??? note "Optional Speedup using Flash Attention"
263
+ If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
264
+ installing the `flash-attn` package.
265
+
266
+ ```bash
267
+ uv pip install flash-attn --no-build-isolation
268
+ ```
269
 
270
  Args:
271
  query (str): The input query string to search for relevant chunks.