FlexRAG committed on
Commit
ab38f5c
·
verified ·
1 Parent(s): eb04b49

Update FlexRAG retriever

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ corpus.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ corpus.mmindex.json filter=lfs diff=lfs merge=lfs -text
38
+ vocab.index.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ library_name: FlexRAG
4
+ tags:
5
+ - FlexRAG
6
+ - retrieval
7
+ - search
8
+ - lexical
9
+ - RAG
10
+ ---
11
+
12
+ # FlexRAG Retriever
13
+
14
+ This is a `BM25SRetriever` created with the [`FlexRAG`](https://github.com/ictnlp/flexrag) library (version `0.1.8`).
15
+
16
+ ## Installation
17
+
18
+ You can install the `FlexRAG` library with `pip`:
19
+
20
+ ```bash
21
+ pip install flexrag
22
+ ```
23
+
24
+ ## Loading a `FlexRAG` retriever
25
+
26
+ You can use this retriever for information retrieval tasks. Here is an example:
27
+
28
+ ```python
29
+ from flexrag.retriever import LocalRetriever
30
+
31
+ # Load the retriever from the HuggingFace Hub
32
+ retriever = LocalRetriever.load_from_hub("FlexRAG/wiki2021_atlas_bm25s")
33
+
34
+ # Run a search query against the loaded retriever
35
+ results = retriever.search("Who is Bruce Wayne?")
36
+ ```
37
+
38
+ FlexRAG Related Links:
39
+ * 📚[Documentation](https://flexrag.readthedocs.io/en/latest/)
40
+ * 💻[GitHub Repository](https://github.com/ictnlp/flexrag)
config.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_interval: 100
2
+ top_k: 10
3
+ batch_size: 32
4
+ query_preprocess_pipeline:
5
+ processor_type: []
6
+ length_filter_config:
7
+ max_tokens: null
8
+ min_tokens: null
9
+ max_chars: null
10
+ min_chars: null
11
+ max_bytes: null
12
+ min_bytes: null
13
+ tokenizer_config:
14
+ tokenizer_type: moses
15
+ hf_tokenizer_path: null
16
+ tiktok_tokenizer_name: null
17
+ lang: null
18
+ token_normalize_config:
19
+ lang: en
20
+ penn: true
21
+ norm_quote_commas: true
22
+ norm_numbers: true
23
+ pre_replace_unicode_punct: false
24
+ post_remove_control_chars: false
25
+ perl_parity: false
26
+ truncate_config:
27
+ max_chars: null
28
+ max_bytes: null
29
+ max_tokens: null
30
+ tokenizer_config:
31
+ tokenizer_type: moses
32
+ hf_tokenizer_path: null
33
+ tiktok_tokenizer_name: null
34
+ lang: null
35
+ database_path: ./bm25s_lucene
36
+ method: lucene
37
+ idf_method: null
38
+ backend: auto
39
+ k1: 1.5
40
+ b: 0.75
41
+ delta: 0.5
42
+ lang: english
43
+ indexed_fields: null
corpus.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef7c8f44c67ed805f0c41ee9cd267d76e310247664aea7a5cedaab4329fec71
3
+ size 22992342304
corpus.mmindex.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f61677c4e0b18a994bbe28d5e5fd54ed57a56108accf9219bb9a97259fb735d1
3
+ size 432224577
data.csc.index.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45bf59f9178ac72e917c20895c416bdadb6837b66539494f3db836ad1713bfaa
3
+ size 7294946272
indices.csc.index.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16865d9412728283ea48ec9e2409ff7fa8506ec9c40a3a18918ad3f31a377df9
3
+ size 7294946272
indptr.csc.index.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:732794e91b5565488702dcdf2aba1e1cd05e2d22ffd6b4dada666966cf4462d4
3
+ size 47865776
params.index.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "k1": 1.5,
3
+ "b": 0.75,
4
+ "delta": 0.5,
5
+ "method": "lucene",
6
+ "idf_method": "lucene",
7
+ "dtype": "float32",
8
+ "int_dtype": "int32",
9
+ "num_docs": 37507469,
10
+ "version": "0.2.1",
11
+ "backend": "numba"
12
+ }
retriever.id ADDED
@@ -0,0 +1 @@
 
 
1
+ BM25SRetriever
vocab.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f8b43e07c3cc260486127f8cea0821d3c0b01de842158cd796f9980a86139d5
3
+ size 246450673