Spaces:
Sleeping
Sleeping
Commit
·
335e8a6
1
Parent(s):
cce1c58
add: predict functions
Browse files
medrag_multi_modal/retrieval/bm25s_retrieval.py
CHANGED
@@ -141,21 +141,6 @@ class BM25sRetriever(weave.Model):
|
|
141 |
The results are returned as a list of dictionaries, each containing a chunk and
|
142 |
its corresponding relevance score.
|
143 |
|
144 |
-
!!! example "Example Usage"
|
145 |
-
```python
|
146 |
-
import weave
|
147 |
-
from dotenv import load_dotenv
|
148 |
-
|
149 |
-
from medrag_multi_modal.retrieval import BM25sRetriever
|
150 |
-
|
151 |
-
load_dotenv()
|
152 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
153 |
-
retriever = BM25sRetriever.from_wandb_artifact(
|
154 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:v2"
|
155 |
-
)
|
156 |
-
retrieved_chunks = retriever.retrieve(query="What are Ribosomes?")
|
157 |
-
```
|
158 |
-
|
159 |
Args:
|
160 |
query (str): The input query string to search for relevant chunks.
|
161 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
@@ -177,3 +162,37 @@ class BM25sRetriever(weave.Model):
|
|
177 |
):
|
178 |
retrieved_chunks.append({"chunk": chunk, "score": score})
|
179 |
return retrieved_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
The results are returned as a list of dictionaries, each containing a chunk and
|
142 |
its corresponding relevance score.
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
Args:
|
145 |
query (str): The input query string to search for relevant chunks.
|
146 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
|
162 |
):
|
163 |
retrieved_chunks.append({"chunk": chunk, "score": score})
|
164 |
return retrieved_chunks
|
165 |
+
|
166 |
+
@weave.op()
|
167 |
+
def predict(self, query: str, top_k: int = 2):
|
168 |
+
"""
|
169 |
+
Predicts the top-k most relevant chunks for a given query using the BM25 algorithm.
|
170 |
+
|
171 |
+
This function is a wrapper around the `retrieve` method. It takes an input query string,
|
172 |
+
tokenizes it using the BM25 tokenizer, and retrieves the top-k most relevant chunks from
|
173 |
+
the BM25 index. The results are returned as a list of dictionaries, each containing a chunk
|
174 |
+
and its corresponding relevance score.
|
175 |
+
|
176 |
+
!!! example "Example Usage"
|
177 |
+
```python
|
178 |
+
import weave
|
179 |
+
from dotenv import load_dotenv
|
180 |
+
|
181 |
+
from medrag_multi_modal.retrieval import BM25sRetriever
|
182 |
+
|
183 |
+
load_dotenv()
|
184 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
185 |
+
retriever = BM25sRetriever.from_wandb_artifact(
|
186 |
+
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:v2"
|
187 |
+
)
|
188 |
+
retrieved_chunks = retriever.predict(query="What are Ribosomes?")
|
189 |
+
```
|
190 |
+
|
191 |
+
Args:
|
192 |
+
query (str): The input query string to search for relevant chunks.
|
193 |
+
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
194 |
+
|
195 |
+
Returns:
|
196 |
+
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
197 |
+
"""
|
198 |
+
return self.retrieve(query, top_k)
|
medrag_multi_modal/retrieval/contriever_retrieval.py
CHANGED
@@ -170,22 +170,6 @@ class ContrieverRetriever(weave.Model):
|
|
170 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
171 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
172 |
|
173 |
-
!!! example "Example Usage"
|
174 |
-
```python
|
175 |
-
import weave
|
176 |
-
from dotenv import load_dotenv
|
177 |
-
|
178 |
-
from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric
|
179 |
-
|
180 |
-
load_dotenv()
|
181 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
182 |
-
retriever = ContrieverRetriever.from_wandb_artifact(
|
183 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
184 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
|
185 |
-
)
|
186 |
-
scores = retriever.retrieve(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
|
187 |
-
```
|
188 |
-
|
189 |
Args:
|
190 |
query (str): The input query string to search for relevant chunks.
|
191 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
@@ -213,3 +197,44 @@ class ContrieverRetriever(weave.Model):
|
|
213 |
}
|
214 |
)
|
215 |
return retrieved_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
171 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
Args:
|
174 |
query (str): The input query string to search for relevant chunks.
|
175 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
|
197 |
}
|
198 |
)
|
199 |
return retrieved_chunks
|
200 |
+
|
201 |
+
@weave.op()
|
202 |
+
def predict(
|
203 |
+
self,
|
204 |
+
query: str,
|
205 |
+
top_k: int = 2,
|
206 |
+
metric: SimilarityMetric = SimilarityMetric.COSINE,
|
207 |
+
):
|
208 |
+
"""
|
209 |
+
Predicts the top-k most relevant chunks for a given query using the specified similarity metric.
|
210 |
+
|
211 |
+
This function is a wrapper around the `retrieve` method. It takes an input query string,
|
212 |
+
retrieves the top-k most relevant chunks from the precomputed vector index based on the
|
213 |
+
specified similarity metric, and returns the results as a list of dictionaries, each containing
|
214 |
+
a chunk and its corresponding relevance score.
|
215 |
+
|
216 |
+
!!! example "Example Usage"
|
217 |
+
```python
|
218 |
+
import weave
|
219 |
+
from dotenv import load_dotenv
|
220 |
+
|
221 |
+
from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric
|
222 |
+
|
223 |
+
load_dotenv()
|
224 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
225 |
+
retriever = ContrieverRetriever.from_wandb_artifact(
|
226 |
+
chunk_dataset_name="grays-anatomy-chunks:v0",
|
227 |
+
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
|
228 |
+
)
|
229 |
+
scores = retriever.predict(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
|
230 |
+
```
|
231 |
+
|
232 |
+
Args:
|
233 |
+
query (str): The input query string to search for relevant chunks.
|
234 |
+
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
235 |
+
metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
|
236 |
+
|
237 |
+
Returns:
|
238 |
+
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
239 |
+
"""
|
240 |
+
return self.retrieve(query, top_k, metric)
|
medrag_multi_modal/retrieval/medcpt_retrieval.py
CHANGED
@@ -200,23 +200,6 @@ class MedCPTRetriever(weave.Model):
|
|
200 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
201 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
202 |
|
203 |
-
!!! example "Example Usage"
|
204 |
-
```python
|
205 |
-
import weave
|
206 |
-
from dotenv import load_dotenv
|
207 |
-
|
208 |
-
import wandb
|
209 |
-
from medrag_multi_modal.retrieval import MedCPTRetriever
|
210 |
-
|
211 |
-
load_dotenv()
|
212 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
213 |
-
retriever = MedCPTRetriever.from_wandb_artifact(
|
214 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
215 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
|
216 |
-
)
|
217 |
-
retriever.retrieve(query="What are Ribosomes?")
|
218 |
-
```
|
219 |
-
|
220 |
Args:
|
221 |
query (str): The input query string to search for relevant chunks.
|
222 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
@@ -253,3 +236,44 @@ class MedCPTRetriever(weave.Model):
|
|
253 |
}
|
254 |
)
|
255 |
return retrieved_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
201 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
Args:
|
204 |
query (str): The input query string to search for relevant chunks.
|
205 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
|
236 |
}
|
237 |
)
|
238 |
return retrieved_chunks
|
239 |
+
|
240 |
+
@weave.op()
|
241 |
+
def predict(
|
242 |
+
self,
|
243 |
+
query: str,
|
244 |
+
top_k: int = 2,
|
245 |
+
metric: SimilarityMetric = SimilarityMetric.COSINE,
|
246 |
+
):
|
247 |
+
"""
|
248 |
+
Predicts the most relevant chunks for a given query.
|
249 |
+
|
250 |
+
This function uses the `retrieve` method to find the top-k relevant chunks
|
251 |
+
from the dataset based on the input query. It allows specifying the number
|
252 |
+
of top relevant chunks to retrieve and the similarity metric to use for scoring.
|
253 |
+
|
254 |
+
!!! example "Example Usage"
|
255 |
+
```python
|
256 |
+
import weave
|
257 |
+
from dotenv import load_dotenv
|
258 |
+
|
259 |
+
import wandb
|
260 |
+
from medrag_multi_modal.retrieval import MedCPTRetriever
|
261 |
+
|
262 |
+
load_dotenv()
|
263 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
264 |
+
retriever = MedCPTRetriever.from_wandb_artifact(
|
265 |
+
chunk_dataset_name="grays-anatomy-chunks:v0",
|
266 |
+
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
|
267 |
+
)
|
268 |
+
retriever.predict(query="What are Ribosomes?")
|
269 |
+
```
|
270 |
+
|
271 |
+
Args:
|
272 |
+
query (str): The input query string to search for relevant chunks.
|
273 |
+
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
274 |
+
metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
|
275 |
+
|
276 |
+
Returns:
|
277 |
+
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
278 |
+
"""
|
279 |
+
return self.retrieve(query, top_k, metric)
|
medrag_multi_modal/retrieval/nv_embed_2.py
CHANGED
@@ -177,27 +177,6 @@ class NVEmbed2Retriever(weave.Model):
|
|
177 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
178 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
179 |
|
180 |
-
!!! example "Example Usage"
|
181 |
-
```python
|
182 |
-
import weave
|
183 |
-
from dotenv import load_dotenv
|
184 |
-
|
185 |
-
import wandb
|
186 |
-
from medrag_multi_modal.retrieval import NVEmbed2Retriever
|
187 |
-
|
188 |
-
load_dotenv()
|
189 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
190 |
-
retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
|
191 |
-
retriever.index(
|
192 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
193 |
-
index_name="grays-anatomy-nvembed2",
|
194 |
-
)
|
195 |
-
retriever = NVEmbed2Retriever.from_wandb_artifact(
|
196 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
197 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
|
198 |
-
)
|
199 |
-
```
|
200 |
-
|
201 |
Args:
|
202 |
query (list[str]): The input query strings to search for relevant chunks.
|
203 |
top_k (int, optional): The number of top relevant chunks to retrieve.
|
@@ -273,6 +252,7 @@ class NVEmbed2Retriever(weave.Model):
|
|
273 |
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
274 |
"""
|
275 |
query = [
|
276 |
-
f"Instruct: Given a question, retrieve passages that answer the question\nQuery: {query}"
|
|
|
277 |
]
|
278 |
return self.retrieve(query, top_k, metric)
|
|
|
177 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
178 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
Args:
|
181 |
query (list[str]): The input query strings to search for relevant chunks.
|
182 |
top_k (int, optional): The number of top relevant chunks to retrieve.
|
|
|
252 |
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
253 |
"""
|
254 |
query = [
|
255 |
+
f"""Instruct: Given a question, retrieve passages that answer the question
|
256 |
+
Query: {query}"""
|
257 |
]
|
258 |
return self.retrieve(query, top_k, metric)
|