geekyrakshit committed on
Commit
335e8a6
·
1 Parent(s): cce1c58

add: predict functions

Browse files
medrag_multi_modal/retrieval/bm25s_retrieval.py CHANGED
@@ -141,21 +141,6 @@ class BM25sRetriever(weave.Model):
141
  The results are returned as a list of dictionaries, each containing a chunk and
142
  its corresponding relevance score.
143
 
144
- !!! example "Example Usage"
145
- ```python
146
- import weave
147
- from dotenv import load_dotenv
148
-
149
- from medrag_multi_modal.retrieval import BM25sRetriever
150
-
151
- load_dotenv()
152
- weave.init(project_name="ml-colabs/medrag-multi-modal")
153
- retriever = BM25sRetriever.from_wandb_artifact(
154
- index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:v2"
155
- )
156
- retrieved_chunks = retriever.retrieve(query="What are Ribosomes?")
157
- ```
158
-
159
  Args:
160
  query (str): The input query string to search for relevant chunks.
161
  top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
@@ -177,3 +162,37 @@ class BM25sRetriever(weave.Model):
177
  ):
178
  retrieved_chunks.append({"chunk": chunk, "score": score})
179
  return retrieved_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  The results are returned as a list of dictionaries, each containing a chunk and
142
  its corresponding relevance score.
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  Args:
145
  query (str): The input query string to search for relevant chunks.
146
  top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
 
162
  ):
163
  retrieved_chunks.append({"chunk": chunk, "score": score})
164
  return retrieved_chunks
165
+
166
+ @weave.op()
167
+ def predict(self, query: str, top_k: int = 2):
168
+ """
169
+ Predicts the top-k most relevant chunks for a given query using the BM25 algorithm.
170
+
171
+ This function is a wrapper around the `retrieve` method. It takes an input query string,
172
+ tokenizes it using the BM25 tokenizer, and retrieves the top-k most relevant chunks from
173
+ the BM25 index. The results are returned as a list of dictionaries, each containing a chunk
174
+ and its corresponding relevance score.
175
+
176
+ !!! example "Example Usage"
177
+ ```python
178
+ import weave
179
+ from dotenv import load_dotenv
180
+
181
+ from medrag_multi_modal.retrieval import BM25sRetriever
182
+
183
+ load_dotenv()
184
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
185
+ retriever = BM25sRetriever.from_wandb_artifact(
186
+ index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:v2"
187
+ )
188
+ retrieved_chunks = retriever.predict(query="What are Ribosomes?")
189
+ ```
190
+
191
+ Args:
192
+ query (str): The input query string to search for relevant chunks.
193
+ top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
194
+
195
+ Returns:
196
+ list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
197
+ """
198
+ return self.retrieve(query, top_k)
medrag_multi_modal/retrieval/contriever_retrieval.py CHANGED
@@ -170,22 +170,6 @@ class ContrieverRetriever(weave.Model):
170
  cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
171
  are returned as a list of dictionaries, each containing a chunk and its corresponding score.
172
 
173
- !!! example "Example Usage"
174
- ```python
175
- import weave
176
- from dotenv import load_dotenv
177
-
178
- from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric
179
-
180
- load_dotenv()
181
- weave.init(project_name="ml-colabs/medrag-multi-modal")
182
- retriever = ContrieverRetriever.from_wandb_artifact(
183
- chunk_dataset_name="grays-anatomy-chunks:v0",
184
- index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
185
- )
186
- scores = retriever.retrieve(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
187
- ```
188
-
189
  Args:
190
  query (str): The input query string to search for relevant chunks.
191
  top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
@@ -213,3 +197,44 @@ class ContrieverRetriever(weave.Model):
213
  }
214
  )
215
  return retrieved_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
171
  are returned as a list of dictionaries, each containing a chunk and its corresponding score.
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  Args:
174
  query (str): The input query string to search for relevant chunks.
175
  top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
 
197
  }
198
  )
199
  return retrieved_chunks
200
+
201
+ @weave.op()
202
+ def predict(
203
+ self,
204
+ query: str,
205
+ top_k: int = 2,
206
+ metric: SimilarityMetric = SimilarityMetric.COSINE,
207
+ ):
208
+ """
209
+ Predicts the top-k most relevant chunks for a given query using the specified similarity metric.
210
+
211
+ This function is a wrapper around the `retrieve` method. It takes an input query string,
212
+ retrieves the top-k most relevant chunks from the precomputed vector index based on the
213
+ specified similarity metric, and returns the results as a list of dictionaries, each containing
214
+ a chunk and its corresponding relevance score.
215
+
216
+ !!! example "Example Usage"
217
+ ```python
218
+ import weave
219
+ from dotenv import load_dotenv
220
+
221
+ from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric
222
+
223
+ load_dotenv()
224
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
225
+ retriever = ContrieverRetriever.from_wandb_artifact(
226
+ chunk_dataset_name="grays-anatomy-chunks:v0",
227
+ index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
228
+ )
229
+ scores = retriever.predict(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
230
+ ```
231
+
232
+ Args:
233
+ query (str): The input query string to search for relevant chunks.
234
+ top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
235
+ metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
236
+
237
+ Returns:
238
+ list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
239
+ """
240
+ return self.retrieve(query, top_k, metric)
medrag_multi_modal/retrieval/medcpt_retrieval.py CHANGED
@@ -200,23 +200,6 @@ class MedCPTRetriever(weave.Model):
200
  cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
201
  are returned as a list of dictionaries, each containing a chunk and its corresponding score.
202
 
203
- !!! example "Example Usage"
204
- ```python
205
- import weave
206
- from dotenv import load_dotenv
207
-
208
- import wandb
209
- from medrag_multi_modal.retrieval import MedCPTRetriever
210
-
211
- load_dotenv()
212
- weave.init(project_name="ml-colabs/medrag-multi-modal")
213
- retriever = MedCPTRetriever.from_wandb_artifact(
214
- chunk_dataset_name="grays-anatomy-chunks:v0",
215
- index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
216
- )
217
- retriever.retrieve(query="What are Ribosomes?")
218
- ```
219
-
220
  Args:
221
  query (str): The input query string to search for relevant chunks.
222
  top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
@@ -253,3 +236,44 @@ class MedCPTRetriever(weave.Model):
253
  }
254
  )
255
  return retrieved_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
201
  are returned as a list of dictionaries, each containing a chunk and its corresponding score.
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  Args:
204
  query (str): The input query string to search for relevant chunks.
205
  top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
 
236
  }
237
  )
238
  return retrieved_chunks
239
+
240
+ @weave.op()
241
+ def predict(
242
+ self,
243
+ query: str,
244
+ top_k: int = 2,
245
+ metric: SimilarityMetric = SimilarityMetric.COSINE,
246
+ ):
247
+ """
248
+ Predicts the most relevant chunks for a given query.
249
+
250
+ This function uses the `retrieve` method to find the top-k relevant chunks
251
+ from the dataset based on the input query. It allows specifying the number
252
+ of top relevant chunks to retrieve and the similarity metric to use for scoring.
253
+
254
+ !!! example "Example Usage"
255
+ ```python
256
+ import weave
257
+ from dotenv import load_dotenv
258
+
259
+ import wandb
260
+ from medrag_multi_modal.retrieval import MedCPTRetriever
261
+
262
+ load_dotenv()
263
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
264
+ retriever = MedCPTRetriever.from_wandb_artifact(
265
+ chunk_dataset_name="grays-anatomy-chunks:v0",
266
+ index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
267
+ )
268
+ retriever.predict(query="What are Ribosomes?")
269
+ ```
270
+
271
+ Args:
272
+ query (str): The input query string to search for relevant chunks.
273
+ top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
274
+ metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
275
+
276
+ Returns:
277
+ list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
278
+ """
279
+ return self.retrieve(query, top_k, metric)
medrag_multi_modal/retrieval/nv_embed_2.py CHANGED
@@ -177,27 +177,6 @@ class NVEmbed2Retriever(weave.Model):
177
  cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
178
  are returned as a list of dictionaries, each containing a chunk and its corresponding score.
179
 
180
- !!! example "Example Usage"
181
- ```python
182
- import weave
183
- from dotenv import load_dotenv
184
-
185
- import wandb
186
- from medrag_multi_modal.retrieval import NVEmbed2Retriever
187
-
188
- load_dotenv()
189
- weave.init(project_name="ml-colabs/medrag-multi-modal")
190
- retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
191
- retriever.index(
192
- chunk_dataset_name="grays-anatomy-chunks:v0",
193
- index_name="grays-anatomy-nvembed2",
194
- )
195
- retriever = NVEmbed2Retriever.from_wandb_artifact(
196
- chunk_dataset_name="grays-anatomy-chunks:v0",
197
- index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
198
- )
199
- ```
200
-
201
  Args:
202
  query (list[str]): The input query strings to search for relevant chunks.
203
  top_k (int, optional): The number of top relevant chunks to retrieve.
@@ -273,6 +252,7 @@ class NVEmbed2Retriever(weave.Model):
273
  list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
274
  """
275
  query = [
276
- f"Instruct: Given a question, retrieve passages that answer the question\nQuery: {query}"
 
277
  ]
278
  return self.retrieve(query, top_k, metric)
 
177
  cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
178
  are returned as a list of dictionaries, each containing a chunk and its corresponding score.
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  Args:
181
  query (list[str]): The input query strings to search for relevant chunks.
182
  top_k (int, optional): The number of top relevant chunks to retrieve.
 
252
  list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
253
  """
254
  query = [
255
+ f"""Instruct: Given a question, retrieve passages that answer the question
256
+ Query: {query}"""
257
  ]
258
  return self.retrieve(query, top_k, metric)