Commit
·
a52fa7d
1
Parent(s):
7fca0dc
January 2025 data update
Browse files
- README.md +6 -6
- config.json +6 -6
- documents +2 -2
- embeddings +2 -2
README.md
CHANGED
@@ -8,14 +8,14 @@ library_name: txtai
|
|
8 |
tags:
|
9 |
- sentence-similarity
|
10 |
datasets:
|
11 |
-
- NeuML/wikipedia-
|
12 |
---
|
13 |
|
14 |
# Wikipedia txtai embeddings index
|
15 |
|
16 |
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
17 |
|
18 |
-
This index is built from the [Wikipedia
|
19 |
|
20 |
It also uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
|
21 |
to only match commonly visited pages.
|
@@ -65,7 +65,7 @@ Performance was evaluated using the [NDCG@10](https://en.wikipedia.org/wiki/Disc
|
|
65 |
|
66 |
## Build the index
|
67 |
|
68 |
-
The following steps show how to build this index. These scripts are using the latest data available as of
|
69 |
|
70 |
- Install required build dependencies
|
71 |
```bash
|
@@ -75,7 +75,7 @@ pip install ragdata mwparserfromhell
|
|
75 |
- Download and build pageviews database
|
76 |
```bash
|
77 |
mkdir -p pageviews/data
|
78 |
-
wget -P pageviews/data https://dumps.wikimedia.org/other/pageview_complete/monthly/
|
79 |
python -m ragdata.wikipedia.views -p en.wikipedia -v pageviews
|
80 |
```
|
81 |
|
@@ -85,7 +85,7 @@ python -m ragdata.wikipedia.views -p en.wikipedia -v pageviews
|
|
85 |
from datasets import load_dataset
|
86 |
|
87 |
# Data dump date from https://dumps.wikimedia.org/enwiki/
|
88 |
-
date = "
|
89 |
|
90 |
# Build and save dataset
|
91 |
ds = load_dataset("neuml/wikipedia", language="en", date=date)
|
@@ -95,7 +95,7 @@ ds.save_to_disk(f"wikipedia-{date}")
|
|
95 |
- Build txtai-wikipedia index
|
96 |
```bash
|
97 |
python -m ragdata.wikipedia.index \
|
98 |
-
-d wikipedia-
|
99 |
-o txtai-wikipedia \
|
100 |
-v pageviews/pageviews.sqlite
|
101 |
```
|
|
|
8 |
tags:
|
9 |
- sentence-similarity
|
10 |
datasets:
|
11 |
+
- NeuML/wikipedia-20250123
|
12 |
---
|
13 |
|
14 |
# Wikipedia txtai embeddings index
|
15 |
|
16 |
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
17 |
|
18 |
+
This index is built from the [Wikipedia January 2025 dataset](https://huggingface.co/datasets/neuml/wikipedia-20250123). Only the first paragraph of the [lead section](https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Lead_section) from each article is included in the index. This is similar to an abstract of the article.
|
19 |
|
20 |
It also uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
|
21 |
to only match commonly visited pages.
|
|
|
65 |
|
66 |
## Build the index
|
67 |
|
68 |
+
The following steps show how to build this index. These scripts use the latest data available as of 2025-01-23; update the dates as appropriate.
|
69 |
|
70 |
- Install required build dependencies
|
71 |
```bash
|
|
|
75 |
- Download and build pageviews database
|
76 |
```bash
|
77 |
mkdir -p pageviews/data
|
78 |
+
wget -P pageviews/data https://dumps.wikimedia.org/other/pageview_complete/monthly/2025/2025-01/pageviews-202501-user.bz2
|
79 |
python -m ragdata.wikipedia.views -p en.wikipedia -v pageviews
|
80 |
```
|
81 |
|
|
|
85 |
from datasets import load_dataset
|
86 |
|
87 |
# Data dump date from https://dumps.wikimedia.org/enwiki/
|
88 |
+
date = "20250123"
|
89 |
|
90 |
# Build and save dataset
|
91 |
ds = load_dataset("neuml/wikipedia", language="en", date=date)
|
|
|
95 |
- Build txtai-wikipedia index
|
96 |
```bash
|
97 |
python -m ragdata.wikipedia.index \
|
98 |
+
-d wikipedia-20250123 \
|
99 |
-o txtai-wikipedia \
|
100 |
-v pageviews/pageviews.sqlite
|
101 |
```
|
config.json
CHANGED
@@ -14,15 +14,15 @@
|
|
14 |
"content": true,
|
15 |
"dimensions": 768,
|
16 |
"backend": "faiss",
|
17 |
-
"offset":
|
18 |
"build": {
|
19 |
-
"create": "
|
20 |
-
"python": "3.
|
21 |
"settings": {
|
22 |
-
"components": "
|
23 |
},
|
24 |
"system": "Linux (x86_64)",
|
25 |
-
"txtai": "
|
26 |
},
|
27 |
-
"update": "
|
28 |
}
|
|
|
14 |
"content": true,
|
15 |
"dimensions": 768,
|
16 |
"backend": "faiss",
|
17 |
+
"offset": 6333058,
|
18 |
"build": {
|
19 |
+
"create": "2025-02-01T17:41:48Z",
|
20 |
+
"python": "3.9.21",
|
21 |
"settings": {
|
22 |
+
"components": "IVF2251,SQ8"
|
23 |
},
|
24 |
"system": "Linux (x86_64)",
|
25 |
+
"txtai": "8.2.0"
|
26 |
},
|
27 |
+
"update": "2025-02-01T17:41:48Z"
|
28 |
}
|
documents
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e05be4645d78cfc665f78a119528833a23b6aa2c4e0c91477988af4bdb917d5
|
3 |
+
size 3406143488
|
embeddings
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fb24f6eeb967b08fdf83c7b6e698f4f2c3391d55ebbc903225428a9733b3b909
|
3 |
+
size 4921392416
|