diff --git a/.gitattributes b/.gitattributes index ef1f27a4451358fdfdd835d0095c5495a5a4fc58..8c6b597be6dab383c754a47886dcbcaf3d18c8e3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -76,3 +76,6 @@ data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter= data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text +data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text +data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text +data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml b/data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..a3ec826ab6dab3fb081485f72fbef82bc93b9bdd --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml @@ -0,0 +1,67 @@ +embeddings: +- embedding: gte-small + path: [new-context, value, '*'] +- {embedding: gte-small, path: original-context} +name: databricks-dolly-15k-curated-en +namespace: lilac +settings: + preferred_embedding: gte-small + ui: + media_paths: + - original-instruction + - original-context + - original-response + - [new-instruction, value, '*'] + - [new-context, value, '*'] + - [new-response, value, '*'] +signals: +- path: original-instruction + signal: {signal_name: near_dup} +- path: original-instruction + signal: {signal_name: text_statistics} +- path: original-instruction + signal: {signal_name: pii} +- path: original-instruction + signal: {signal_name: lang_detection} +- path: original-context + signal: {signal_name: near_dup} +- path: original-context + signal: {signal_name: text_statistics} +- path: original-context + signal: {signal_name: lang_detection} +- path: original-context + signal: {signal_name: pii} +- path: original-response + signal: {signal_name: near_dup} +- path: original-response + signal: {signal_name: text_statistics} +- path: original-response + signal: {signal_name: pii} +- path: original-response + signal: {signal_name: lang_detection} +- path: [new-instruction, value, '*'] + signal: {signal_name: near_dup} +- path: [new-instruction, value, '*'] + signal: {signal_name: text_statistics} +- path: [new-instruction, value, '*'] + signal: {signal_name: pii} +- path: [new-instruction, value, '*'] + signal: {signal_name: lang_detection} +- path: [new-context, value, '*'] + signal: {signal_name: near_dup} +- path: [new-context, value, '*'] + signal: {signal_name: text_statistics} +- path: [new-context, value, '*'] + signal: {signal_name: lang_detection} +- path: [new-context, value, '*'] + signal: {signal_name: pii} +- path: [new-response, value, '*'] + signal: {signal_name: near_dup} +- path: [new-response, value, '*'] + signal: {signal_name: text_statistics} +- path: [new-response, value, '*'] + signal: {signal_name: pii} +- path: [new-response, value, '*'] + signal: {signal_name: lang_detection} +source: {dataset_name: argilla/databricks-dolly-15k-curated-en, source_name: huggingface} +tags: [machine-learning] diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6f3e495af1e7f4ddfd11c5992863d7b9d056d3f5 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad225b50d5880a097ea66eb4ca70fc529c0321cf8a5652bd8fbe7a638d016851 +size 15882489 diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..99f8429981b50b56ce5a88c70dddf0f36d8b3a5c --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json @@ -0,0 +1,87 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "data_schema": { + "fields": { + "id": { + "dtype": "string" + }, + "category": { + "dtype": "string" + }, + "original-instruction": { + "dtype": "string" + }, + "original-context": { + "dtype": "string" + }, + "original-response": { + "dtype": "string" + }, + "new-instruction": { + "fields": { + "user_id": { + "repeated_field": { + "dtype": "string" + } + }, + "value": { + "repeated_field": { + "dtype": "string" + } + }, + "status": { + "repeated_field": { + "dtype": "string" + } + } + } + }, + "new-context": { + "fields": { + "user_id": { + "repeated_field": { + "dtype": "string" + } + }, + "value": { + "repeated_field": { + "dtype": "string" + } + }, + "status": { + "repeated_field": { + "dtype": "string" + } + } + } + }, + "new-response": { + "fields": { + "user_id": { + "repeated_field": { + "dtype": "string" + } + }, + "value": { + "repeated_field": { + "dtype": "string" + } + }, + "status": { + "repeated_field": { + "dtype": "string" + } + } + } + }, + "external_id": { + "dtype": "string" + }, + "__hfsplit__": { + "dtype": "string" + } + } + } +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ea0ef3e140e7d3674c977bef84f17316f9a0c2b --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.hnswlib.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c879460250e68b6195eed6b48afa2fa2a7b8127483a299818a13f82ed7fea8dc +size 32553584 diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl new file mode 100644 index 0000000000000000000000000000000000000000..74ea8216355f68c73ae211d5ffc91ae0374f2015 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/hnsw.lookup.pkl differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..1e732b5c74348a7acd1dec80264dcef49e5074ec --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/signal_manifest.json @@ -0,0 +1,40 @@ +{ + "files": [], + "parquet_id": "new-context.value.gte-small", + "data_schema": { + "fields": { + "new-context": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "gte-small": { + "repeated_field": { + "fields": { + "embedding": { + "dtype": "embedding" + } + }, + "dtype": "string_span" + }, + "signal": { + "signal_name": "gte-small" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "gte-small" + }, + "enriched_path": [ + "new-context", + "value", + "*" + ], + "vector_store": "hnsw" +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl new file mode 100644 index 0000000000000000000000000000000000000000..126a2fab4e33e0f10051dd764af73567bf6a4953 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/gte-small/spans.pkl differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b8d1499ffb19d8d2f17c22a5bbe601834aeace40 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..c11f8548ff603d6eb0d15c29932722551e1d3b37 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json @@ -0,0 +1,36 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-context.value.lang_detection", + "data_schema": { + "fields": { + "new-context": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "lang_detection": { + "dtype": "string", + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + } + } + } + } + } + } + } + } + }, + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + }, + "enriched_path": [ + "new-context", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4158d72ec64ad8b8545ddcbd8ef847fa1f417de0 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..4073c8f8ac113853212b8f6044be10dd76d9f812 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json @@ -0,0 +1,41 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-context.value.near_dup", + "data_schema": { + "fields": { + "new-context": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "near_dup": { + "fields": { + "cluster_id": { + "dtype": "uint32", + "categorical": true + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + } + } + } + } + } + } + } + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + }, + "enriched_path": [ + "new-context", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..60eba5cc8eb5270f6f99e3582907dff1418899ea Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..72078ebd95cefffaebae75f16ebedf8a826ac169 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json @@ -0,0 +1,50 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-context.value.pii", + "data_schema": { + "fields": { + "new-context": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "pii": { + "fields": { + "emails": { + "repeated_field": { + "dtype": "string_span" + } + }, + "ip_addresses": { + "repeated_field": { + "dtype": "string_span" + } + }, + "secrets": { + "repeated_field": { + "dtype": "string_span" + } + } + }, + "signal": { + "signal_name": "pii" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "pii" + }, + "enriched_path": [ + "new-context", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..59bd2f1f595ad324d3270f98da46d389688b7d49 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..3478c83a2bddeb2a2a0327af7111c9df9da8a382 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json @@ -0,0 +1,64 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-context.value.text_statistics", + "data_schema": { + "fields": { + "new-context": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "text_statistics": { + "fields": { + "num_characters": { + "dtype": "int32" + }, + "readability": { + "dtype": "float32" + }, + "log(type_token_ratio)": { + "dtype": "float32" + }, + "frac_non_ascii": { + "dtype": "float32", + "bins": [ + [ + "Low", + null, + 0.15 + ], + [ + "Medium", + 0.15, + 0.3 + ], + [ + "High", + 0.3, + null + ] + ] + } + }, + "signal": { + "signal_name": "text_statistics" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "text_statistics" + }, + "enriched_path": [ + "new-context", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..98ba124f0a0bd25d7813652dfa354cba6efc4804 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..f98dac730493a7766689f497fea10f3828cc452a --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json @@ -0,0 +1,36 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-instruction.value.lang_detection", + "data_schema": { + "fields": { + "new-instruction": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "lang_detection": { + "dtype": "string", + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + } + } + } + } + } + } + } + } + }, + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + }, + "enriched_path": [ + "new-instruction", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f2fd29976a8f58808f740eca1b5dcf0b75c9e8f6 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..8515abb66bc3386682861b3169001a9c3723781b --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json @@ -0,0 +1,41 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-instruction.value.near_dup", + "data_schema": { + "fields": { + "new-instruction": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "near_dup": { + "fields": { + "cluster_id": { + "dtype": "uint32", + "categorical": true + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + } + } + } + } + } + } + } + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + }, + "enriched_path": [ + "new-instruction", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..aa43d9bc9361fcfb1363caf7b317d1025ee41679 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..2b936b5bbc8c4d2cc898e784b4adc61745c01d61 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json @@ -0,0 +1,50 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-instruction.value.pii", + "data_schema": { + "fields": { + "new-instruction": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "pii": { + "fields": { + "emails": { + "repeated_field": { + "dtype": "string_span" + } + }, + "ip_addresses": { + "repeated_field": { + "dtype": "string_span" + } + }, + "secrets": { + "repeated_field": { + "dtype": "string_span" + } + } + }, + "signal": { + "signal_name": "pii" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "pii" + }, + "enriched_path": [ + "new-instruction", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06daa94984714d8b0e7c637e7193662928719c19 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..9e1c321f5b2c6fbfab34d1f480571f886b2cf2b1 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json @@ -0,0 +1,64 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-instruction.value.text_statistics", + "data_schema": { + "fields": { + "new-instruction": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "text_statistics": { + "fields": { + "num_characters": { + "dtype": "int32" + }, + "readability": { + "dtype": "float32" + }, + "log(type_token_ratio)": { + "dtype": "float32" + }, + "frac_non_ascii": { + "dtype": "float32", + "bins": [ + [ + "Low", + null, + 0.15 + ], + [ + "Medium", + 0.15, + 0.3 + ], + [ + "High", + 0.3, + null + ] + ] + } + }, + "signal": { + "signal_name": "text_statistics" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "text_statistics" + }, + "enriched_path": [ + "new-instruction", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dd30adaa74359b66cff48ceb4597f98c8a525ff5 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..c30b358c1ab83d9b1b8cd521105fcbeb6fccca1e --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json @@ -0,0 +1,36 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-response.value.lang_detection", + "data_schema": { + "fields": { + "new-response": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "lang_detection": { + "dtype": "string", + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + } + } + } + } + } + } + } + } + }, + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + }, + "enriched_path": [ + "new-response", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7d6f738e4631123dec03fdd84f8886d2dd6c44b6 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..46fd22477b7513f89995377f6864357d49b3c387 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json @@ -0,0 +1,41 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-response.value.near_dup", + "data_schema": { + "fields": { + "new-response": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "near_dup": { + "fields": { + "cluster_id": { + "dtype": "uint32", + "categorical": true + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + } + } + } + } + } + } + } + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + }, + "enriched_path": [ + "new-response", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b14b5408c676a1c9a45b72f9c2558ef818f09ecb Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..b2707f21da26dc46ee307e2868e92761ce463d42 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json @@ -0,0 +1,50 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-response.value.pii", + "data_schema": { + "fields": { + "new-response": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "pii": { + "fields": { + "emails": { + "repeated_field": { + "dtype": "string_span" + } + }, + "ip_addresses": { + "repeated_field": { + "dtype": "string_span" + } + }, + "secrets": { + "repeated_field": { + "dtype": "string_span" + } + } + }, + "signal": { + "signal_name": "pii" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "pii" + }, + "enriched_path": [ + "new-response", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a6d001b61474e85e717f3e3e67db47d3b9ac351a Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..7baeaf6b8e07107fa7e01151777ebb12ca900b6c --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json @@ -0,0 +1,64 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "new-response.value.text_statistics", + "data_schema": { + "fields": { + "new-response": { + "fields": { + "value": { + "repeated_field": { + "fields": { + "text_statistics": { + "fields": { + "num_characters": { + "dtype": "int32" + }, + "readability": { + "dtype": "float32" + }, + "log(type_token_ratio)": { + "dtype": "float32" + }, + "frac_non_ascii": { + "dtype": "float32", + "bins": [ + [ + "Low", + null, + 0.15 + ], + [ + "Medium", + 0.15, + 0.3 + ], + [ + "High", + 0.3, + null + ] + ] + } + }, + "signal": { + "signal_name": "text_statistics" + } + } + } + } + } + } + } + } + }, + "signal": { + "signal_name": "text_statistics" + }, + "enriched_path": [ + "new-response", + "value", + "*" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc446fd51f76b2f961b0ec85b28bca1eef6f125d --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.hnswlib.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5c4ca43663633f531a587438913cc15fecad5baed5fdce2a1c7bc97a6e9260 +size 32775684 diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bcbdad167671f5007fef128fab53786838425745 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/hnsw.lookup.pkl differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..925b587acba7a5baf9596f4ec0bbd5a6232b230c --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/signal_manifest.json @@ -0,0 +1,32 @@ +{ + "files": [], + "parquet_id": "original-context.gte-small", + "data_schema": { + "fields": { + "original-context": { + "fields": { + "gte-small": { + "repeated_field": { + "fields": { + "embedding": { + "dtype": "embedding" + } + }, + "dtype": "string_span" + }, + "signal": { + "signal_name": "gte-small" + } + } + } + } + } + }, + "signal": { + "signal_name": "gte-small" + }, + "enriched_path": [ + "original-context" + ], + "vector_store": "hnsw" +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl new file mode 100644 index 0000000000000000000000000000000000000000..38d06d879b2db4bae5ab3fbfbfd21442a977abfc Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/gte-small/spans.pkl differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5939f2159b06190f1c1dcf3eeb9bc527b28a0f29 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..da8924b91e8cf100d33257dcb7359c12eb8201f9 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json @@ -0,0 +1,28 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-context.lang_detection", + "data_schema": { + "fields": { + "original-context": { + "fields": { + "lang_detection": { + "dtype": "string", + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + } + } + } + } + } + }, + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + }, + "enriched_path": [ + "original-context" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e1bae69a076649cb6e353f50e83728b3bd66cc76 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..3d6ad15530590e56a58b4174545b1c02d9e87f16 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json @@ -0,0 +1,33 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-context.near_dup", + "data_schema": { + "fields": { + "original-context": { + "fields": { + "near_dup": { + "fields": { + "cluster_id": { + "dtype": "uint32", + "categorical": true + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + } + } + } + } + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + }, + "enriched_path": [ + "original-context" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..97b1eae81f685872087d9ff0f1268b6ae24696e3 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..1c8c86b08156615bdcbb1b01b027e867896abf47 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json @@ -0,0 +1,42 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-context.pii", + "data_schema": { + "fields": { + "original-context": { + "fields": { + "pii": { + "fields": { + "emails": { + "repeated_field": { + "dtype": "string_span" + } + }, + "ip_addresses": { + "repeated_field": { + "dtype": "string_span" + } + }, + "secrets": { + "repeated_field": { + "dtype": "string_span" + } + } + }, + "signal": { + "signal_name": "pii" + } + } + } + } + } + }, + "signal": { + "signal_name": "pii" + }, + "enriched_path": [ + "original-context" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4bd457460b79956cdb8b0a30df236cec6da45ca9 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..c219f72073ea506cc102e7be5b0bf18e014e8e37 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json @@ -0,0 +1,56 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-context.text_statistics", + "data_schema": { + "fields": { + "original-context": { + "fields": { + "text_statistics": { + "fields": { + "num_characters": { + "dtype": "int32" + }, + "readability": { + "dtype": "float32" + }, + "log(type_token_ratio)": { + "dtype": "float32" + }, + "frac_non_ascii": { + "dtype": "float32", + "bins": [ + [ + "Low", + null, + 0.15 + ], + [ + "Medium", + 0.15, + 0.3 + ], + [ + "High", + 0.3, + null + ] + ] + } + }, + "signal": { + "signal_name": "text_statistics" + } + } + } + } + } + }, + "signal": { + "signal_name": "text_statistics" + }, + "enriched_path": [ + "original-context" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..810d08c58f34a358392e919be1c4fac8cedab492 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..5403ea73713ca19b6d9695d84d708a0717951f3d --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json @@ -0,0 +1,28 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-instruction.lang_detection", + "data_schema": { + "fields": { + "original-instruction": { + "fields": { + "lang_detection": { + "dtype": "string", + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + } + } + } + } + } + }, + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + }, + "enriched_path": [ + "original-instruction" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7883ab9e315bfdd7388c9ae8b84c5322cc94e323 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..3fc2b8e2488c7829bed0da6873b766efff7c063c --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json @@ -0,0 +1,33 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-instruction.near_dup", + "data_schema": { + "fields": { + "original-instruction": { + "fields": { + "near_dup": { + "fields": { + "cluster_id": { + "dtype": "uint32", + "categorical": true + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + } + } + } + } + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + }, + "enriched_path": [ + "original-instruction" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8363079b3420a3523700dd807f5d2cdbd4699caf Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..dd64ca3e3586fde559d800f71d42652d70b8b2c7 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json @@ -0,0 +1,42 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-instruction.pii", + "data_schema": { + "fields": { + "original-instruction": { + "fields": { + "pii": { + "fields": { + "emails": { + "repeated_field": { + "dtype": "string_span" + } + }, + "ip_addresses": { + "repeated_field": { + "dtype": "string_span" + } + }, + "secrets": { + "repeated_field": { + "dtype": "string_span" + } + } + }, + "signal": { + "signal_name": "pii" + } + } + } + } + } + }, + "signal": { + "signal_name": "pii" + }, + "enriched_path": [ + "original-instruction" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b6bd8cd14b7f30f26f3ba6aa150bf0f5261423a4 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..20205ab39a7aabb320e8527ab5d07d6f11ca437e --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/signal_manifest.json @@ -0,0 +1,56 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-instruction.text_statistics", + "data_schema": { + "fields": { + "original-instruction": { + "fields": { + "text_statistics": { + "fields": { + "num_characters": { + "dtype": "int32" + }, + "readability": { + "dtype": "float32" + }, + "log(type_token_ratio)": { + "dtype": "float32" + }, + "frac_non_ascii": { + "dtype": "float32", + "bins": [ + [ + "Low", + null, + 0.15 + ], + [ + "Medium", + 0.15, + 0.3 + ], + [ + "High", + 0.3, + null + ] + ] + } + }, + "signal": { + "signal_name": "text_statistics" + } + } + } + } + } + }, + "signal": { + "signal_name": "text_statistics" + }, + "enriched_path": [ + "original-instruction" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5e707ad570708ed3b7e663bd8b846680a819315e Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..77708983524bc2566292b0a9c4bddc858e9a8827 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/signal_manifest.json @@ -0,0 +1,28 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-response.lang_detection", + "data_schema": { + "fields": { + "original-response": { + "fields": { + "lang_detection": { + "dtype": "string", + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + } + } + } + } + } + }, + "signal": { + "split_by_paragraph": false, + "signal_name": "lang_detection" + }, + "enriched_path": [ + "original-response" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6eaa55fbbc21749eb43cd41f2e09df2f68860c1d Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..1cc681b7f142ca25d3a13d4ce07269b3378c8269 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/signal_manifest.json @@ -0,0 +1,33 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-response.near_dup", + "data_schema": { + "fields": { + "original-response": { + "fields": { + "near_dup": { + "fields": { + "cluster_id": { + "dtype": "uint32", + "categorical": true + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + } + } + } + } + } + }, + "signal": { + "threshold": 0.85, + "signal_name": "near_dup" + }, + "enriched_path": [ + "original-response" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0a68c98404049718414e64dda90c0d7e8df35c15 Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3fd2284477b88b191df29e67c3c8234ac8bd54 --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/signal_manifest.json @@ -0,0 +1,42 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-response.pii", + "data_schema": { + "fields": { + "original-response": { + "fields": { + "pii": { + "fields": { + "emails": { + "repeated_field": { + "dtype": "string_span" + } + }, + "ip_addresses": { + "repeated_field": { + "dtype": "string_span" + } + }, + "secrets": { + "repeated_field": { + "dtype": "string_span" + } + } + }, + "signal": { + "signal_name": "pii" + } + } + } + } + } + }, + "signal": { + "signal_name": "pii" + }, + "enriched_path": [ + "original-response" + ] +} \ No newline at end of file diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/text_statistics/data-00000-of-00001.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c98e8763b5974843818f90576d1b05f7d3a1cb7e Binary files /dev/null and b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/text_statistics/data-00000-of-00001.parquet differ diff --git a/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/text_statistics/signal_manifest.json b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/text_statistics/signal_manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..a34980ab39dd12332c28e815d0ecc96a7d37507a --- /dev/null +++ b/data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/text_statistics/signal_manifest.json @@ -0,0 +1,56 @@ +{ + "files": [ + "data-00000-of-00001.parquet" + ], + "parquet_id": "original-response.text_statistics", + "data_schema": { + "fields": { + "original-response": { + "fields": { + "text_statistics": { + "fields": { + "num_characters": { + "dtype": "int32" + }, + "readability": { + "dtype": "float32" + }, + "log(type_token_ratio)": { + "dtype": "float32" + }, + "frac_non_ascii": { + "dtype": "float32", + "bins": [ + [ + "Low", + null, + 0.15 + ], + [ + "Medium", + 0.15, + 0.3 + ], + [ + "High", + 0.3, + null + ] + ] + } + }, + "signal": { + "signal_name": "text_statistics" + } + } + } + } + } + }, + "signal": { + "signal_name": "text_statistics" + }, + "enriched_path": [ + "original-response" + ] +} \ No newline at end of file