smilkov commited on
Commit
5e8bdf2
1 Parent(s): 7dab750

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml +67 -0
  3. data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet +3 -0
  4. data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json +87 -0
  5. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin +3 -0
  6. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl +0 -0
  7. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json +40 -0
  8. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl +0 -0
  9. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet +0 -0
  10. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json +36 -0
  11. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet +0 -0
  12. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json +41 -0
  13. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet +0 -0
  14. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json +50 -0
  15. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet +0 -0
  16. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json +64 -0
  17. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet +0 -0
  18. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json +36 -0
  19. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet +0 -0
  20. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json +41 -0
  21. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet +0 -0
  22. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json +50 -0
  23. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet +0 -0
  24. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json +64 -0
  25. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet +0 -0
  26. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json +36 -0
  27. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet +0 -0
  28. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json +41 -0
  29. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet +0 -0
  30. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json +50 -0
  31. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet +0 -0
  32. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json +64 -0
  33. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin +3 -0
  34. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl +0 -0
  35. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json +32 -0
  36. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl +0 -0
  37. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet +0 -0
  38. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json +28 -0
  39. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet +0 -0
  40. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json +33 -0
  41. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet +0 -0
  42. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json +42 -0
  43. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet +0 -0
  44. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json +56 -0
  45. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet +0 -0
  46. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json +28 -0
  47. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet +0 -0
  48. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json +33 -0
  49. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet +0 -0
  50. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json +42 -0
.gitattributes CHANGED
@@ -76,3 +76,6 @@ data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter=
76
  data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
77
  data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
78
  data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
76
  data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
77
  data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
78
  data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
79
+ data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
80
+ data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
81
+ data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path: [new-context, value, '*']
4
+ - {embedding: gte-small, path: original-context}
5
+ name: databricks-dolly-15k-curated-en
6
+ namespace: lilac
7
+ settings:
8
+ preferred_embedding: gte-small
9
+ ui:
10
+ media_paths:
11
+ - original-instruction
12
+ - original-context
13
+ - original-response
14
+ - [new-instruction, value, '*']
15
+ - [new-context, value, '*']
16
+ - [new-response, value, '*']
17
+ signals:
18
+ - path: original-instruction
19
+ signal: {signal_name: near_dup}
20
+ - path: original-instruction
21
+ signal: {signal_name: text_statistics}
22
+ - path: original-instruction
23
+ signal: {signal_name: pii}
24
+ - path: original-instruction
25
+ signal: {signal_name: lang_detection}
26
+ - path: original-context
27
+ signal: {signal_name: near_dup}
28
+ - path: original-context
29
+ signal: {signal_name: text_statistics}
30
+ - path: original-context
31
+ signal: {signal_name: lang_detection}
32
+ - path: original-context
33
+ signal: {signal_name: pii}
34
+ - path: original-response
35
+ signal: {signal_name: near_dup}
36
+ - path: original-response
37
+ signal: {signal_name: text_statistics}
38
+ - path: original-response
39
+ signal: {signal_name: pii}
40
+ - path: original-response
41
+ signal: {signal_name: lang_detection}
42
+ - path: [new-instruction, value, '*']
43
+ signal: {signal_name: near_dup}
44
+ - path: [new-instruction, value, '*']
45
+ signal: {signal_name: text_statistics}
46
+ - path: [new-instruction, value, '*']
47
+ signal: {signal_name: pii}
48
+ - path: [new-instruction, value, '*']
49
+ signal: {signal_name: lang_detection}
50
+ - path: [new-context, value, '*']
51
+ signal: {signal_name: near_dup}
52
+ - path: [new-context, value, '*']
53
+ signal: {signal_name: text_statistics}
54
+ - path: [new-context, value, '*']
55
+ signal: {signal_name: lang_detection}
56
+ - path: [new-context, value, '*']
57
+ signal: {signal_name: pii}
58
+ - path: [new-response, value, '*']
59
+ signal: {signal_name: near_dup}
60
+ - path: [new-response, value, '*']
61
+ signal: {signal_name: text_statistics}
62
+ - path: [new-response, value, '*']
63
+ signal: {signal_name: pii}
64
+ - path: [new-response, value, '*']
65
+ signal: {signal_name: lang_detection}
66
+ source: {dataset_name: argilla/databricks-dolly-15k-curated-en, source_name: huggingface}
67
+ tags: [machine-learning]
data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad225b50d5880a097ea66eb4ca70fc529c0321cf8a5652bd8fbe7a638d016851
3
+ size 15882489
data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "id": {
8
+ "dtype": "string"
9
+ },
10
+ "category": {
11
+ "dtype": "string"
12
+ },
13
+ "original-instruction": {
14
+ "dtype": "string"
15
+ },
16
+ "original-context": {
17
+ "dtype": "string"
18
+ },
19
+ "original-response": {
20
+ "dtype": "string"
21
+ },
22
+ "new-instruction": {
23
+ "fields": {
24
+ "user_id": {
25
+ "repeated_field": {
26
+ "dtype": "string"
27
+ }
28
+ },
29
+ "value": {
30
+ "repeated_field": {
31
+ "dtype": "string"
32
+ }
33
+ },
34
+ "status": {
35
+ "repeated_field": {
36
+ "dtype": "string"
37
+ }
38
+ }
39
+ }
40
+ },
41
+ "new-context": {
42
+ "fields": {
43
+ "user_id": {
44
+ "repeated_field": {
45
+ "dtype": "string"
46
+ }
47
+ },
48
+ "value": {
49
+ "repeated_field": {
50
+ "dtype": "string"
51
+ }
52
+ },
53
+ "status": {
54
+ "repeated_field": {
55
+ "dtype": "string"
56
+ }
57
+ }
58
+ }
59
+ },
60
+ "new-response": {
61
+ "fields": {
62
+ "user_id": {
63
+ "repeated_field": {
64
+ "dtype": "string"
65
+ }
66
+ },
67
+ "value": {
68
+ "repeated_field": {
69
+ "dtype": "string"
70
+ }
71
+ },
72
+ "status": {
73
+ "repeated_field": {
74
+ "dtype": "string"
75
+ }
76
+ }
77
+ }
78
+ },
79
+ "external_id": {
80
+ "dtype": "string"
81
+ },
82
+ "__hfsplit__": {
83
+ "dtype": "string"
84
+ }
85
+ }
86
+ }
87
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c879460250e68b6195eed6b48afa2fa2a7b8127483a299818a13f82ed7fea8dc
3
+ size 32553584
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl ADDED
Binary file (522 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "new-context.value.gte-small",
4
+ "data_schema": {
5
+ "fields": {
6
+ "new-context": {
7
+ "fields": {
8
+ "value": {
9
+ "repeated_field": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "gte-small"
33
+ },
34
+ "enriched_path": [
35
+ "new-context",
36
+ "value",
37
+ "*"
38
+ ],
39
+ "vector_store": "hnsw"
40
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl ADDED
Binary file (351 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (521 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-context.value.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-context": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ },
27
+ "signal": {
28
+ "split_by_paragraph": false,
29
+ "signal_name": "lang_detection"
30
+ },
31
+ "enriched_path": [
32
+ "new-context",
33
+ "value",
34
+ "*"
35
+ ]
36
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet ADDED
Binary file (550 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-context.value.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-context": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ }
31
+ },
32
+ "signal": {
33
+ "threshold": 0.85,
34
+ "signal_name": "near_dup"
35
+ },
36
+ "enriched_path": [
37
+ "new-context",
38
+ "value",
39
+ "*"
40
+ ]
41
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet ADDED
Binary file (519 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-context.value.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-context": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
41
+ },
42
+ "signal": {
43
+ "signal_name": "pii"
44
+ },
45
+ "enriched_path": [
46
+ "new-context",
47
+ "value",
48
+ "*"
49
+ ]
50
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (603 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-context.value.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-context": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
55
+ },
56
+ "signal": {
57
+ "signal_name": "text_statistics"
58
+ },
59
+ "enriched_path": [
60
+ "new-context",
61
+ "value",
62
+ "*"
63
+ ]
64
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (521 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-instruction.value.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-instruction": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ },
27
+ "signal": {
28
+ "split_by_paragraph": false,
29
+ "signal_name": "lang_detection"
30
+ },
31
+ "enriched_path": [
32
+ "new-instruction",
33
+ "value",
34
+ "*"
35
+ ]
36
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet ADDED
Binary file (602 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-instruction.value.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-instruction": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ }
31
+ },
32
+ "signal": {
33
+ "threshold": 0.85,
34
+ "signal_name": "near_dup"
35
+ },
36
+ "enriched_path": [
37
+ "new-instruction",
38
+ "value",
39
+ "*"
40
+ ]
41
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet ADDED
Binary file (519 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-instruction.value.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-instruction": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
41
+ },
42
+ "signal": {
43
+ "signal_name": "pii"
44
+ },
45
+ "enriched_path": [
46
+ "new-instruction",
47
+ "value",
48
+ "*"
49
+ ]
50
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (581 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-instruction.value.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-instruction": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
55
+ },
56
+ "signal": {
57
+ "signal_name": "text_statistics"
58
+ },
59
+ "enriched_path": [
60
+ "new-instruction",
61
+ "value",
62
+ "*"
63
+ ]
64
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (521 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-response.value.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-response": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ },
27
+ "signal": {
28
+ "split_by_paragraph": false,
29
+ "signal_name": "lang_detection"
30
+ },
31
+ "enriched_path": [
32
+ "new-response",
33
+ "value",
34
+ "*"
35
+ ]
36
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet ADDED
Binary file (603 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-response.value.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-response": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ }
31
+ },
32
+ "signal": {
33
+ "threshold": 0.85,
34
+ "signal_name": "near_dup"
35
+ },
36
+ "enriched_path": [
37
+ "new-response",
38
+ "value",
39
+ "*"
40
+ ]
41
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet ADDED
Binary file (520 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-response.value.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-response": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
41
+ },
42
+ "signal": {
43
+ "signal_name": "pii"
44
+ },
45
+ "enriched_path": [
46
+ "new-response",
47
+ "value",
48
+ "*"
49
+ ]
50
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (651 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "new-response.value.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "new-response": {
9
+ "fields": {
10
+ "value": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
55
+ },
56
+ "signal": {
57
+ "signal_name": "text_statistics"
58
+ },
59
+ "enriched_path": [
60
+ "new-response",
61
+ "value",
62
+ "*"
63
+ ]
64
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee5c4ca43663633f531a587438913cc15fecad5baed5fdce2a1c7bc97a6e9260
3
+ size 32775684
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl ADDED
Binary file (488 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "original-context.gte-small",
4
+ "data_schema": {
5
+ "fields": {
6
+ "original-context": {
7
+ "fields": {
8
+ "gte-small": {
9
+ "repeated_field": {
10
+ "fields": {
11
+ "embedding": {
12
+ "dtype": "embedding"
13
+ }
14
+ },
15
+ "dtype": "string_span"
16
+ },
17
+ "signal": {
18
+ "signal_name": "gte-small"
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ },
25
+ "signal": {
26
+ "signal_name": "gte-small"
27
+ },
28
+ "enriched_path": [
29
+ "original-context"
30
+ ],
31
+ "vector_store": "hnsw"
32
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl ADDED
Binary file (347 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (521 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-context.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-context": {
9
+ "fields": {
10
+ "lang_detection": {
11
+ "dtype": "string",
12
+ "signal": {
13
+ "split_by_paragraph": false,
14
+ "signal_name": "lang_detection"
15
+ }
16
+ }
17
+ }
18
+ }
19
+ }
20
+ },
21
+ "signal": {
22
+ "split_by_paragraph": false,
23
+ "signal_name": "lang_detection"
24
+ },
25
+ "enriched_path": [
26
+ "original-context"
27
+ ]
28
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet ADDED
Binary file (550 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-context.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-context": {
9
+ "fields": {
10
+ "near_dup": {
11
+ "fields": {
12
+ "cluster_id": {
13
+ "dtype": "uint32",
14
+ "categorical": true
15
+ }
16
+ },
17
+ "signal": {
18
+ "threshold": 0.85,
19
+ "signal_name": "near_dup"
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ },
26
+ "signal": {
27
+ "threshold": 0.85,
28
+ "signal_name": "near_dup"
29
+ },
30
+ "enriched_path": [
31
+ "original-context"
32
+ ]
33
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet ADDED
Binary file (519 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-context.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-context": {
9
+ "fields": {
10
+ "pii": {
11
+ "fields": {
12
+ "emails": {
13
+ "repeated_field": {
14
+ "dtype": "string_span"
15
+ }
16
+ },
17
+ "ip_addresses": {
18
+ "repeated_field": {
19
+ "dtype": "string_span"
20
+ }
21
+ },
22
+ "secrets": {
23
+ "repeated_field": {
24
+ "dtype": "string_span"
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "pii"
30
+ }
31
+ }
32
+ }
33
+ }
34
+ }
35
+ },
36
+ "signal": {
37
+ "signal_name": "pii"
38
+ },
39
+ "enriched_path": [
40
+ "original-context"
41
+ ]
42
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (602 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-context.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-context": {
9
+ "fields": {
10
+ "text_statistics": {
11
+ "fields": {
12
+ "num_characters": {
13
+ "dtype": "int32"
14
+ },
15
+ "readability": {
16
+ "dtype": "float32"
17
+ },
18
+ "log(type_token_ratio)": {
19
+ "dtype": "float32"
20
+ },
21
+ "frac_non_ascii": {
22
+ "dtype": "float32",
23
+ "bins": [
24
+ [
25
+ "Low",
26
+ null,
27
+ 0.15
28
+ ],
29
+ [
30
+ "Medium",
31
+ 0.15,
32
+ 0.3
33
+ ],
34
+ [
35
+ "High",
36
+ 0.3,
37
+ null
38
+ ]
39
+ ]
40
+ }
41
+ },
42
+ "signal": {
43
+ "signal_name": "text_statistics"
44
+ }
45
+ }
46
+ }
47
+ }
48
+ }
49
+ },
50
+ "signal": {
51
+ "signal_name": "text_statistics"
52
+ },
53
+ "enriched_path": [
54
+ "original-context"
55
+ ]
56
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (521 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-instruction.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-instruction": {
9
+ "fields": {
10
+ "lang_detection": {
11
+ "dtype": "string",
12
+ "signal": {
13
+ "split_by_paragraph": false,
14
+ "signal_name": "lang_detection"
15
+ }
16
+ }
17
+ }
18
+ }
19
+ }
20
+ },
21
+ "signal": {
22
+ "split_by_paragraph": false,
23
+ "signal_name": "lang_detection"
24
+ },
25
+ "enriched_path": [
26
+ "original-instruction"
27
+ ]
28
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet ADDED
Binary file (602 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-instruction.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-instruction": {
9
+ "fields": {
10
+ "near_dup": {
11
+ "fields": {
12
+ "cluster_id": {
13
+ "dtype": "uint32",
14
+ "categorical": true
15
+ }
16
+ },
17
+ "signal": {
18
+ "threshold": 0.85,
19
+ "signal_name": "near_dup"
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ },
26
+ "signal": {
27
+ "threshold": 0.85,
28
+ "signal_name": "near_dup"
29
+ },
30
+ "enriched_path": [
31
+ "original-instruction"
32
+ ]
33
+ }
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet ADDED
Binary file (519 kB). View file
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "original-instruction.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "original-instruction": {
9
+ "fields": {
10
+ "pii": {
11
+ "fields": {
12
+ "emails": {
13
+ "repeated_field": {
14
+ "dtype": "string_span"
15
+ }
16
+ },
17
+ "ip_addresses": {
18
+ "repeated_field": {
19
+ "dtype": "string_span"
20
+ }
21
+ },
22
+ "secrets": {
23
+ "repeated_field": {
24
+ "dtype": "string_span"
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "pii"
30
+ }
31
+ }
32
+ }
33
+ }
34
+ }
35
+ },
36
+ "signal": {
37
+ "signal_name": "pii"
38
+ },
39
+ "enriched_path": [
40
+ "original-instruction"
41
+ ]
42
+ }