File size: 2,999 Bytes
2271450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
---
datasets:
- unicamp-dl/mmarco
language:
- pt
pipeline_tag: text2text-generation
base_model: unicamp-dl/ptt5-v2-large
---

## Introduction
MonoPTT5 models are T5 rerankers for the Portuguese language. Starting from [ptt5-v2 checkpoints](https://huggingface.co/collections/unicamp-dl/ptt5-v2-666538a650188ba00aa8d2d0), they were trained for 100k steps on a mixture of Portuguese and English data from the mMARCO dataset.
For further information on the training and evaluation of these models, please refer to our paper, [ptt5-v2: A Closer Look at Continued Pretraining of T5 Models for the Portuguese Language](https://arxiv.org/abs/2406.10806).

## Usage
The easiest way to use our models is through the `rerankers` package. After installing the package with `pip install "rerankers[transformers]"` (the quotes keep shells such as zsh from interpreting the square brackets), the following code can be used as a minimal working example:

```python
from rerankers import Reranker

# Query and the candidate documents to be ranked against it.
query = "O futebol é uma paixão nacional"
docs = [
    "O futebol é superestimado e não deveria receber tanta atenção.",
    "O futebol é uma parte essencial da cultura brasileira e une as pessoas.",
]

# Load the reranker with the Portuguese prompt template; the log lines below
# show the template and the "Sim"/"Não" tokens the ranker ends up using.
ranker = Reranker(
    "unicamp-dl/monoptt5-small",
    inputs_template="Pergunta: {query} Documento: {text} Relevante:",
)
# Relevant logging:
# Loading T5Ranker model unicamp-dl/monoptt5-small
# No device set
# Using device cpu
# No dtype set
# Device set to `cpu`, setting dtype to `float32`
# Using dtype torch.float32
# Loading model unicamp-dl/monoptt5-small, this might take a while...
# Using device cpu.
# Using dtype torch.float32.
# T5 true token set to ▁Sim
# T5 false token set to ▁Não
# Returning normalised scores...
# Inputs template set to Pergunta: {query} Documento: {text} Relevante:

# Score every document against the query and return them ordered by rank.
results = ranker.rerank(query, docs)
# Results should be something like (can vary depending on the model, the example below uses the "unicamp-dl/monoptt5-small" model)
# NOTE: the expected output is kept as comments so the snippet can be
# copy-pasted and run as-is (RankedResults/Result/Document are not imported).
# RankedResults(
#     results=[
#         Result(
#             document=Document(
#                 text="O futebol é uma parte essencial da cultura brasileira e une as pessoas.",
#                 doc_id=1,
#                 metadata={},
#             ),
#             score=0.91943359375,
#             rank=1,
#         ),
#         Result(
#             document=Document(
#                 text="O futebol é superestimado e não deveria receber tanta atenção.",
#                 doc_id=0,
#                 metadata={},
#             ),
#             score=0.0267486572265625,
#             rank=2,
#         ),
#     ],
#     query="O futebol é uma paixão nacional",
#     has_scores=True,
# )

```

For additional configurations and more advanced usage, consult the [rerankers documentation](https://github.com/AnswerDotAI/rerankers).

## Citation
If you use our models, please cite:

    @article{ptt5_2020,
      title={PTT5: Pretraining and validating the T5 model on Brazilian Portuguese data},
      author={Carmo, Diedre and Piau, Marcos and Campiotti, Israel and Nogueira, Rodrigo and Lotufo, Roberto},
      journal={arXiv preprint arXiv:2008.09144},
      year={2020}
    }