# lilac/data/lilac.yml
# Last commit: 77bf495 "Push to HF space" (nsthorat-lilac); file size 45.1 kB.
datasets:
- namespace: lilac
name: imdb
tags: []
source:
dataset_name: imdb
config_name: null
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: text
embedding: gte-small
signals:
- path: text
signal:
threshold: 0.85
signal_name: near_dup
- path: text
signal:
signal_name: pii
- path: text
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: text
signal:
signal_name: text_statistics
settings:
ui:
media_paths:
- text
markdown_paths: []
preferred_embedding: gte-small
# Dataset entry whose source is the Hugging Face dataset OpenAssistant/oasst1.
- namespace: lilac
# NOTE(review): "asssistant" is spelled with three s's. It looks like a typo,
# but this value is the dataset's identifier, so confirm nothing refers to the
# current spelling before renaming it.
name: open-asssistant-conversations
tags: []
source:
dataset_name: OpenAssistant/oasst1
config_name: null
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: text
embedding: gte-small
signals:
- path: text
signal:
threshold: 0.85
signal_name: near_dup
- path: text
signal:
signal_name: pii
- path: text
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
# negative-sentiment concept score for the `text` path. This entry was listed
# twice, back to back and byte-for-byte identical; it is kept once so that each
# path/concept pair appears a single time, as in the other datasets in this file.
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: text
signal:
signal_name: text_statistics
settings:
ui:
media_paths:
- text
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: wikitext-2-raw-v1
tags: []
source:
dataset_name: wikitext
config_name: wikitext-2-raw-v1
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: text
embedding: gte-small
signals:
- path: text
signal:
threshold: 0.85
signal_name: near_dup
- path: text
signal:
signal_name: pii
- path: text
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: text
signal:
signal_name: text_statistics
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
settings:
ui:
media_paths:
- text
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: databricks-dolly-15k-curated-en
tags: []
source:
dataset_name: argilla/databricks-dolly-15k-curated-en
config_name: null
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: original-context
embedding: gte-small
- path:
- new-context
- value
- '*'
embedding: gte-small
signals:
- path: original-instruction
signal:
threshold: 0.85
signal_name: near_dup
- path: original-instruction
signal:
signal_name: pii
- path: original-instruction
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: original-instruction
signal:
signal_name: text_statistics
- path: original-context
signal:
threshold: 0.85
signal_name: near_dup
- path: original-context
signal:
signal_name: pii
- path: original-context
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: original-context
signal:
signal_name: text_statistics
- path: original-response
signal:
threshold: 0.85
signal_name: near_dup
- path: original-response
signal:
signal_name: pii
- path: original-response
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: original-response
signal:
signal_name: text_statistics
- path:
- new-instruction
- value
- '*'
signal:
threshold: 0.85
signal_name: near_dup
- path:
- new-instruction
- value
- '*'
signal:
signal_name: pii
- path:
- new-instruction
- value
- '*'
signal:
split_by_paragraph: false
signal_name: lang_detection
- path:
- new-instruction
- value
- '*'
signal:
signal_name: text_statistics
- path:
- new-context
- value
- '*'
signal:
threshold: 0.85
signal_name: near_dup
- path:
- new-context
- value
- '*'
signal:
signal_name: pii
- path:
- new-context
- value
- '*'
signal:
split_by_paragraph: false
signal_name: lang_detection
- path:
- new-context
- value
- '*'
signal:
signal_name: text_statistics
- path:
- new-response
- value
- '*'
signal:
threshold: 0.85
signal_name: near_dup
- path:
- new-response
- value
- '*'
signal:
signal_name: pii
- path:
- new-response
- value
- '*'
signal:
split_by_paragraph: false
signal_name: lang_detection
- path:
- new-response
- value
- '*'
signal:
signal_name: text_statistics
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: original-instruction
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: original-context
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: original-response
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path:
- new-instruction
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path:
- new-context
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path:
- new-response
- value
- '*'
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
settings:
ui:
media_paths:
- original-instruction
- original-context
- original-response
- - new-instruction
- value
- '*'
- - new-context
- value
- '*'
- - new-response
- value
- '*'
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: piqa
tags: []
source:
dataset_name: piqa
config_name: null
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: goal
embedding: gte-small
- path: sol1
embedding: gte-small
- path: sol2
embedding: gte-small
signals:
- path: goal
signal:
threshold: 0.85
signal_name: near_dup
- path: goal
signal:
signal_name: pii
- path: goal
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: goal
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: goal
signal:
signal_name: text_statistics
- path: sol1
signal:
threshold: 0.85
signal_name: near_dup
- path: sol1
signal:
signal_name: pii
- path: sol1
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: sol1
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: sol1
signal:
signal_name: text_statistics
- path: sol2
signal:
threshold: 0.85
signal_name: near_dup
- path: sol2
signal:
signal_name: pii
- path: sol2
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: sol2
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: sol2
signal:
signal_name: text_statistics
settings:
ui:
media_paths:
- sol1
- sol2
- goal
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: OpenOrca-100k
tags: []
source:
dataset_name: Open-Orca/OpenOrca
config_name: null
split: null
sample_size: 100000
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: question
embedding: gte-small
- path: response
embedding: gte-small
signals:
- path: question
signal:
threshold: 0.85
signal_name: near_dup
- path: question
signal:
signal_name: pii
- path: question
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: question
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: question
signal:
signal_name: text_statistics
- path: response
signal:
threshold: 0.85
signal_name: near_dup
- path: response
signal:
signal_name: pii
- path: response
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: response
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: response
signal:
signal_name: text_statistics
settings:
ui:
media_paths:
- question
- response
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: opus100-en-es-validation
tags: []
source:
dataset_name: opus100
config_name: en-es
split: validation
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path:
- translation
- en
embedding: gte-small
- path:
- translation
- es
embedding: gte-small
signals:
- path:
- translation
- en
signal:
threshold: 0.85
signal_name: near_dup
- path:
- translation
- en
signal:
signal_name: pii
- path:
- translation
- en
signal:
split_by_paragraph: false
signal_name: lang_detection
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path:
- translation
- en
signal:
signal_name: text_statistics
- path:
- translation
- es
signal:
threshold: 0.85
signal_name: near_dup
- path:
- translation
- es
signal:
signal_name: pii
- path:
- translation
- es
signal:
split_by_paragraph: false
signal_name: lang_detection
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path:
- translation
- es
signal:
signal_name: text_statistics
settings:
ui:
media_paths:
- - translation
- es
- - translation
- en
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: science-qa-derek-thomas
tags: []
source:
dataset_name: derek-thomas/ScienceQA
config_name: null
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: lecture
embedding: gte-small
signals:
- path: lecture
signal:
threshold: 0.85
signal_name: near_dup
- path: lecture
signal:
signal_name: pii
- path: lecture
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: lecture
signal:
signal_name: text_statistics
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: lecture
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
settings:
ui:
media_paths:
- lecture
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: enron-emails
tags: []
source:
dataset_name: EleutherAI/pile
config_name: enron_emails
split: null
sample_size: 100000
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path: text
embedding: gte-small
signals:
- path: text
signal:
threshold: 0.85
signal_name: near_dup
- path: text
signal:
signal_name: pii
- path: text
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: text
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: text
signal:
signal_name: text_statistics
settings:
ui:
media_paths:
- text
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: the_movies_dataset
tags: []
source:
filepaths:
- https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv
delim: ','
header: true
names: null
source_name: csv
embeddings:
- path: overview
embedding: gte-small
signals:
- path: overview
signal:
threshold: 0.85
signal_name: near_dup
- path: overview
signal:
signal_name: pii
- path: overview
signal:
split_by_paragraph: false
signal_name: lang_detection
- path: overview
signal:
signal_name: text_statistics
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: legal-termination
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: negative-sentiment
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: non-english
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: positive-sentiment
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: profanity
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: question
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: source-code
draft: main
signal_name: concept_score
- path: overview
signal:
embedding: gte-small
namespace: lilac
concept_name: toxicity
draft: main
signal_name: concept_score
settings:
ui:
media_paths:
- overview
markdown_paths: []
preferred_embedding: gte-small
- namespace: lilac
name: textbook_quality_programming
tags: []
source:
dataset_name: vikp/textbook_quality_programming
config_name: null
split: null
sample_size: null
revision: null
load_from_disk: false
source_name: huggingface
embeddings:
- path:
- outline
- '*'
embedding: gte-small
- path:
- concepts
- '*'
embedding: gte-small
- path: markdown
embedding: gte-small
signals: []
settings:
ui:
media_paths:
- - outline
- '*'
- - concepts
- '*'
- markdown
markdown_paths:
- markdown
preferred_embedding: gte-small
signals: []
concept_model_cache_embeddings: []