smilkov committed on
Commit
cd01a89
1 Parent(s): 38dbe3e

Upload folder using huggingface_hub

Browse files
Files changed (28) hide show
  1. .gitattributes +2 -0
  2. data/datasets/lilac/opus100-en-us-validation/config.yml +32 -0
  3. data/datasets/lilac/opus100-en-us-validation/data-00000-of-00001.parquet +0 -0
  4. data/datasets/lilac/opus100-en-us-validation/manifest.json +22 -0
  5. data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/hnsw.hnswlib.bin +3 -0
  6. data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/hnsw.lookup.pkl +0 -0
  7. data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/signal_manifest.json +37 -0
  8. data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/spans.pkl +0 -0
  9. data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/data-00000-of-00001.parquet +0 -0
  10. data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/signal_manifest.json +33 -0
  11. data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/data-00000-of-00001.parquet +0 -0
  12. data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/signal_manifest.json +38 -0
  13. data/datasets/lilac/opus100-en-us-validation/translation/en/pii/data-00000-of-00001.parquet +0 -0
  14. data/datasets/lilac/opus100-en-us-validation/translation/en/pii/signal_manifest.json +47 -0
  15. data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/data-00000-of-00001.parquet +0 -0
  16. data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/signal_manifest.json +61 -0
  17. data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/hnsw.hnswlib.bin +3 -0
  18. data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/hnsw.lookup.pkl +0 -0
  19. data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/signal_manifest.json +37 -0
  20. data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/spans.pkl +0 -0
  21. data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/data-00000-of-00001.parquet +0 -0
  22. data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/signal_manifest.json +33 -0
  23. data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/data-00000-of-00001.parquet +0 -0
  24. data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/signal_manifest.json +38 -0
  25. data/datasets/lilac/opus100-en-us-validation/translation/es/pii/data-00000-of-00001.parquet +0 -0
  26. data/datasets/lilac/opus100-en-us-validation/translation/es/pii/signal_manifest.json +47 -0
  27. data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/data-00000-of-00001.parquet +0 -0
  28. data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/signal_manifest.json +61 -0
.gitattributes CHANGED
@@ -102,3 +102,5 @@ data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.pa
102
  data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
103
  data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
104
  data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
102
  data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
103
  data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
104
  data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
105
+ data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
106
+ data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/opus100-en-us-validation/config.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path: [translation, en]
4
+ - embedding: gte-small
5
+ path: [translation, es]
6
+ name: opus100-en-us-validation
7
+ namespace: lilac
8
+ settings:
9
+ preferred_embedding: gte-small
10
+ ui:
11
+ media_paths:
12
+ - [translation, es]
13
+ - [translation, en]
14
+ signals:
15
+ - path: [translation, es]
16
+ signal: {signal_name: near_dup}
17
+ - path: [translation, es]
18
+ signal: {signal_name: pii}
19
+ - path: [translation, es]
20
+ signal: {signal_name: lang_detection}
21
+ - path: [translation, es]
22
+ signal: {signal_name: text_statistics}
23
+ - path: [translation, en]
24
+ signal: {signal_name: near_dup}
25
+ - path: [translation, en]
26
+ signal: {signal_name: text_statistics}
27
+ - path: [translation, en]
28
+ signal: {signal_name: pii}
29
+ - path: [translation, en]
30
+ signal: {signal_name: lang_detection}
31
+ source: {config_name: en-es, dataset_name: opus100, source_name: huggingface, split: validation}
32
+ tags: [machine-learning]
data/datasets/lilac/opus100-en-us-validation/data-00000-of-00001.parquet ADDED
Binary file (304 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/manifest.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "translation": {
8
+ "fields": {
9
+ "en": {
10
+ "dtype": "string"
11
+ },
12
+ "es": {
13
+ "dtype": "string"
14
+ }
15
+ }
16
+ },
17
+ "__hfsplit__": {
18
+ "dtype": "string"
19
+ }
20
+ }
21
+ }
22
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e9362c74b9a4c6d10bd1bc3243d268876a85d1b6f829ef01fc88e4a8f562cff
3
+ size 3397972
data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/hnsw.lookup.pkl ADDED
Binary file (96.7 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "translation.en.gte-small",
4
+ "data_schema": {
5
+ "fields": {
6
+ "translation": {
7
+ "fields": {
8
+ "en": {
9
+ "fields": {
10
+ "gte-small": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "embedding": {
14
+ "dtype": "embedding"
15
+ }
16
+ },
17
+ "dtype": "string_span"
18
+ },
19
+ "signal": {
20
+ "signal_name": "gte-small"
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "signal_name": "gte-small"
31
+ },
32
+ "enriched_path": [
33
+ "translation",
34
+ "en"
35
+ ],
36
+ "vector_store": "hnsw"
37
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/gte-small/spans.pkl ADDED
Binary file (96.3 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (70 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "lang_detection": {
13
+ "dtype": "string",
14
+ "signal": {
15
+ "split_by_paragraph": false,
16
+ "signal_name": "lang_detection"
17
+ }
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ },
25
+ "signal": {
26
+ "split_by_paragraph": false,
27
+ "signal_name": "lang_detection"
28
+ },
29
+ "enriched_path": [
30
+ "translation",
31
+ "en"
32
+ ]
33
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/data-00000-of-00001.parquet ADDED
Binary file (80 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "near_dup": {
13
+ "fields": {
14
+ "cluster_id": {
15
+ "dtype": "uint32",
16
+ "categorical": true
17
+ }
18
+ },
19
+ "signal": {
20
+ "threshold": 0.85,
21
+ "signal_name": "near_dup"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "threshold": 0.85,
32
+ "signal_name": "near_dup"
33
+ },
34
+ "enriched_path": [
35
+ "translation",
36
+ "en"
37
+ ]
38
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/pii/data-00000-of-00001.parquet ADDED
Binary file (71.7 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/pii/signal_manifest.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "pii": {
13
+ "fields": {
14
+ "emails": {
15
+ "repeated_field": {
16
+ "dtype": "string_span"
17
+ }
18
+ },
19
+ "ip_addresses": {
20
+ "repeated_field": {
21
+ "dtype": "string_span"
22
+ }
23
+ },
24
+ "secrets": {
25
+ "repeated_field": {
26
+ "dtype": "string_span"
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "signal_name": "pii"
32
+ }
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ },
40
+ "signal": {
41
+ "signal_name": "pii"
42
+ },
43
+ "enriched_path": [
44
+ "translation",
45
+ "en"
46
+ ]
47
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (84.2 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "text_statistics": {
13
+ "fields": {
14
+ "num_characters": {
15
+ "dtype": "int32"
16
+ },
17
+ "readability": {
18
+ "dtype": "float32"
19
+ },
20
+ "log(type_token_ratio)": {
21
+ "dtype": "float32"
22
+ },
23
+ "frac_non_ascii": {
24
+ "dtype": "float32",
25
+ "bins": [
26
+ [
27
+ "Low",
28
+ null,
29
+ 0.15
30
+ ],
31
+ [
32
+ "Medium",
33
+ 0.15,
34
+ 0.3
35
+ ],
36
+ [
37
+ "High",
38
+ 0.3,
39
+ null
40
+ ]
41
+ ]
42
+ }
43
+ },
44
+ "signal": {
45
+ "signal_name": "text_statistics"
46
+ }
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ },
54
+ "signal": {
55
+ "signal_name": "text_statistics"
56
+ },
57
+ "enriched_path": [
58
+ "translation",
59
+ "en"
60
+ ]
61
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:916f708d77ad590a289226863b983dbeb247a9d4400b9f938bd84f3fbbc338eb
3
+ size 3421560
data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/hnsw.lookup.pkl ADDED
Binary file (96.9 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "translation.es.gte-small",
4
+ "data_schema": {
5
+ "fields": {
6
+ "translation": {
7
+ "fields": {
8
+ "es": {
9
+ "fields": {
10
+ "gte-small": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "embedding": {
14
+ "dtype": "embedding"
15
+ }
16
+ },
17
+ "dtype": "string_span"
18
+ },
19
+ "signal": {
20
+ "signal_name": "gte-small"
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "signal_name": "gte-small"
31
+ },
32
+ "enriched_path": [
33
+ "translation",
34
+ "es"
35
+ ],
36
+ "vector_store": "hnsw"
37
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/gte-small/spans.pkl ADDED
Binary file (96.4 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (69.9 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "lang_detection": {
13
+ "dtype": "string",
14
+ "signal": {
15
+ "split_by_paragraph": false,
16
+ "signal_name": "lang_detection"
17
+ }
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ },
25
+ "signal": {
26
+ "split_by_paragraph": false,
27
+ "signal_name": "lang_detection"
28
+ },
29
+ "enriched_path": [
30
+ "translation",
31
+ "es"
32
+ ]
33
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/data-00000-of-00001.parquet ADDED
Binary file (80 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "near_dup": {
13
+ "fields": {
14
+ "cluster_id": {
15
+ "dtype": "uint32",
16
+ "categorical": true
17
+ }
18
+ },
19
+ "signal": {
20
+ "threshold": 0.85,
21
+ "signal_name": "near_dup"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "threshold": 0.85,
32
+ "signal_name": "near_dup"
33
+ },
34
+ "enriched_path": [
35
+ "translation",
36
+ "es"
37
+ ]
38
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/pii/data-00000-of-00001.parquet ADDED
Binary file (71.7 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/pii/signal_manifest.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "pii": {
13
+ "fields": {
14
+ "emails": {
15
+ "repeated_field": {
16
+ "dtype": "string_span"
17
+ }
18
+ },
19
+ "ip_addresses": {
20
+ "repeated_field": {
21
+ "dtype": "string_span"
22
+ }
23
+ },
24
+ "secrets": {
25
+ "repeated_field": {
26
+ "dtype": "string_span"
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "signal_name": "pii"
32
+ }
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ },
40
+ "signal": {
41
+ "signal_name": "pii"
42
+ },
43
+ "enriched_path": [
44
+ "translation",
45
+ "es"
46
+ ]
47
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (87.4 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "text_statistics": {
13
+ "fields": {
14
+ "num_characters": {
15
+ "dtype": "int32"
16
+ },
17
+ "readability": {
18
+ "dtype": "float32"
19
+ },
20
+ "log(type_token_ratio)": {
21
+ "dtype": "float32"
22
+ },
23
+ "frac_non_ascii": {
24
+ "dtype": "float32",
25
+ "bins": [
26
+ [
27
+ "Low",
28
+ null,
29
+ 0.15
30
+ ],
31
+ [
32
+ "Medium",
33
+ 0.15,
34
+ 0.3
35
+ ],
36
+ [
37
+ "High",
38
+ 0.3,
39
+ null
40
+ ]
41
+ ]
42
+ }
43
+ },
44
+ "signal": {
45
+ "signal_name": "text_statistics"
46
+ }
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ },
54
+ "signal": {
55
+ "signal_name": "text_statistics"
56
+ },
57
+ "enriched_path": [
58
+ "translation",
59
+ "es"
60
+ ]
61
+ }