---
language:
- en
license: mit
---
This model was trained with [Neural-Cherche](https://github.com/raphaelsty/neural-cherche). You can find details on how to fine-tune it in the [Neural-Cherche](https://github.com/raphaelsty/neural-cherche) repository.

This model is an `all-mpnet-base-v2` checkpoint trained as a ColBERT model.
```sh
pip install neural-cherche
```
## Retriever

```python
import torch
from neural_cherche import models, retrieve

# Use the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 32

# Toy corpus and queries. Each document exposes the "id" and "document"
# fields that the retriever below is configured with.
queries = ["Food", "Sports", "Cinema"]
documents = [
    {"id": 0, "document": "Food"},
    {"id": 1, "document": "Sports"},
    {"id": 2, "document": "Cinema"},
]

# Load the `raphaelsty/neural-cherche-colbert` checkpoint on the selected device.
model = models.ColBERT(
    model_name_or_path="raphaelsty/neural-cherche-colbert",
    device=device,
)

retriever = retrieve.ColBERT(key="id", on=["document"], model=model)

# Encode the corpus and hand the resulting embeddings to the retriever.
retriever = retriever.add(
    documents_embeddings=retriever.encode_documents(
        documents=documents,
        batch_size=batch_size,
    ),
)

# Encode the queries, then call the retriever with k=3 (the toy corpus size).
scores = retriever(
    queries_embeddings=retriever.encode_queries(
        queries=queries,
        batch_size=batch_size,
    ),
    batch_size=batch_size,
    k=3,
)

scores
```

## Ranker

```python
from neural_cherche import models, rank, retrieve
import torch

# Use the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single batch size shared by every ColBERT encoding and ranking call below.
batch_size = 32

documents = [
    {"id": "doc1", "title": "Paris", "text": "Paris is the capital of France."},
    {"id": "doc2", "title": "Montreal", "text": "Montreal is the largest city in Quebec."},
    {"id": "doc3", "title": "Bordeaux", "text": "Bordeaux in Southwestern France."},
]

queries = [
    "What is the capital of France?",
    "What is the largest city in Quebec?",
    "Where is Bordeaux?",
]

# First stage: a TF-IDF retriever over the "title" and "text" fields.
retriever = retrieve.TfIdf(
    key="id",
    on=["title", "text"],
)

# Second stage: a ColBERT ranker over the same fields.
model = models.ColBERT(
    model_name_or_path="raphaelsty/neural-cherche-colbert",
    device=device,
)

ranker = rank.ColBERT(
    key="id",
    on=["title", "text"],
    model=model,
)

# Encode the corpus for the retriever and register the embeddings with it.
retriever_documents_embeddings = retriever.encode_documents(
    documents=documents,
)

retriever.add(
    documents_embeddings=retriever_documents_embeddings,
)

# Encode the same corpus for the ranker.
ranker_documents_embeddings = ranker.encode_documents(
    documents=documents,
    batch_size=batch_size,
)

# Each stage encodes the queries with its own model.
retriever_queries_embeddings = retriever.encode_queries(
    queries=queries,
)

ranker_queries_embeddings = ranker.encode_queries(
    queries=queries,
    batch_size=batch_size,
)

# Retrieve candidates with TF-IDF (k=1000) ...
candidates = retriever(
    queries_embeddings=retriever_queries_embeddings,
    k=1000,
)

# ... then pass them to the ColBERT ranker (k=100). The shared `batch_size`
# is used here as well, so changing it at the top affects every call.
scores = ranker(
    documents=candidates,
    queries_embeddings=ranker_queries_embeddings,
    documents_embeddings=ranker_documents_embeddings,
    k=100,
    batch_size=batch_size,
)

scores
```