nsthorat-lilac commited on
Commit
af895fd
·
1 Parent(s): 8b83893

Upload data/lilac.yml with huggingface_hub

Browse files
Files changed (1) hide show
  1. data/lilac.yml +335 -3
data/lilac.yml CHANGED
@@ -1,11 +1,14 @@
1
- # Lilac project config.
2
- # See https://lilacml.com/api_reference/index.html#lilac.Config for details.
3
-
4
  datasets:
5
  - namespace: lilac
6
  name: imdb
 
7
  source:
8
  dataset_name: imdb
 
 
 
 
 
9
  source_name: huggingface
10
  embeddings:
11
  - path: text
@@ -13,60 +16,70 @@ datasets:
13
  signals:
14
  - path: text
15
  signal:
 
16
  signal_name: near_dup
17
  - path: text
18
  signal:
19
  signal_name: pii
20
  - path: text
21
  signal:
 
22
  signal_name: lang_detection
23
  - path: text
24
  signal:
25
  embedding: gte-small
26
  namespace: lilac
27
  concept_name: positive-sentiment
 
28
  signal_name: concept_score
29
  - path: text
30
  signal:
31
  embedding: gte-small
32
  namespace: lilac
33
  concept_name: non-english
 
34
  signal_name: concept_score
35
  - path: text
36
  signal:
37
  embedding: gte-small
38
  namespace: lilac
39
  concept_name: toxicity
 
40
  signal_name: concept_score
41
  - path: text
42
  signal:
43
  embedding: gte-small
44
  namespace: lilac
45
  concept_name: question
 
46
  signal_name: concept_score
47
  - path: text
48
  signal:
49
  embedding: gte-small
50
  namespace: lilac
51
  concept_name: legal-termination
 
52
  signal_name: concept_score
53
  - path: text
54
  signal:
55
  embedding: gte-small
56
  namespace: lilac
57
  concept_name: source-code
 
58
  signal_name: concept_score
59
  - path: text
60
  signal:
61
  embedding: gte-small
62
  namespace: lilac
63
  concept_name: negative-sentiment
 
64
  signal_name: concept_score
65
  - path: text
66
  signal:
67
  embedding: gte-small
68
  namespace: lilac
69
  concept_name: profanity
 
70
  signal_name: concept_score
71
  - path: text
72
  signal:
@@ -75,11 +88,18 @@ datasets:
75
  ui:
76
  media_paths:
77
  - text
 
78
  preferred_embedding: gte-small
79
  - namespace: lilac
80
  name: open-asssistant-conversations
 
81
  source:
82
  dataset_name: OpenAssistant/oasst1
 
 
 
 
 
83
  source_name: huggingface
84
  embeddings:
85
  - path: text
@@ -87,66 +107,77 @@ datasets:
87
  signals:
88
  - path: text
89
  signal:
 
90
  signal_name: near_dup
91
  - path: text
92
  signal:
93
  signal_name: pii
94
  - path: text
95
  signal:
 
96
  signal_name: lang_detection
97
  - path: text
98
  signal:
99
  embedding: gte-small
100
  namespace: lilac
101
  concept_name: positive-sentiment
 
102
  signal_name: concept_score
103
  - path: text
104
  signal:
105
  embedding: gte-small
106
  namespace: lilac
107
  concept_name: non-english
 
108
  signal_name: concept_score
109
  - path: text
110
  signal:
111
  embedding: gte-small
112
  namespace: lilac
113
  concept_name: toxicity
 
114
  signal_name: concept_score
115
  - path: text
116
  signal:
117
  embedding: gte-small
118
  namespace: lilac
119
  concept_name: question
 
120
  signal_name: concept_score
121
  - path: text
122
  signal:
123
  embedding: gte-small
124
  namespace: lilac
125
  concept_name: legal-termination
 
126
  signal_name: concept_score
127
  - path: text
128
  signal:
129
  embedding: gte-small
130
  namespace: lilac
131
  concept_name: source-code
 
132
  signal_name: concept_score
133
  - path: text
134
  signal:
135
  embedding: gte-small
136
  namespace: lilac
137
  concept_name: negative-sentiment
 
138
  signal_name: concept_score
139
  - path: text
140
  signal:
141
  embedding: gte-small
142
  namespace: lilac
143
  concept_name: negative-sentiment
 
144
  signal_name: concept_score
145
  - path: text
146
  signal:
147
  embedding: gte-small
148
  namespace: lilac
149
  concept_name: profanity
 
150
  signal_name: concept_score
151
  - path: text
152
  signal:
@@ -155,12 +186,18 @@ datasets:
155
  ui:
156
  media_paths:
157
  - text
 
158
  preferred_embedding: gte-small
159
  - namespace: lilac
160
  name: wikitext-2-raw-v1
 
161
  source:
162
  dataset_name: wikitext
163
  config_name: wikitext-2-raw-v1
 
 
 
 
164
  source_name: huggingface
165
  embeddings:
166
  - path: text
@@ -168,12 +205,14 @@ datasets:
168
  signals:
169
  - path: text
170
  signal:
 
171
  signal_name: near_dup
172
  - path: text
173
  signal:
174
  signal_name: pii
175
  - path: text
176
  signal:
 
177
  signal_name: lang_detection
178
  - path: text
179
  signal:
@@ -183,58 +222,73 @@ datasets:
183
  embedding: gte-small
184
  namespace: lilac
185
  concept_name: legal-termination
 
186
  signal_name: concept_score
187
  - path: text
188
  signal:
189
  embedding: gte-small
190
  namespace: lilac
191
  concept_name: negative-sentiment
 
192
  signal_name: concept_score
193
  - path: text
194
  signal:
195
  embedding: gte-small
196
  namespace: lilac
197
  concept_name: non-english
 
198
  signal_name: concept_score
199
  - path: text
200
  signal:
201
  embedding: gte-small
202
  namespace: lilac
203
  concept_name: positive-sentiment
 
204
  signal_name: concept_score
205
  - path: text
206
  signal:
207
  embedding: gte-small
208
  namespace: lilac
209
  concept_name: profanity
 
210
  signal_name: concept_score
211
  - path: text
212
  signal:
213
  embedding: gte-small
214
  namespace: lilac
215
  concept_name: question
 
216
  signal_name: concept_score
217
  - path: text
218
  signal:
219
  embedding: gte-small
220
  namespace: lilac
221
  concept_name: source-code
 
222
  signal_name: concept_score
223
  - path: text
224
  signal:
225
  embedding: gte-small
226
  namespace: lilac
227
  concept_name: toxicity
 
228
  signal_name: concept_score
229
  settings:
230
  ui:
231
  media_paths:
232
  - text
 
233
  preferred_embedding: gte-small
234
  - namespace: lilac
235
  name: squad_v2
 
236
  source:
237
  dataset_name: squad_v2
 
 
 
 
 
238
  source_name: huggingface
239
  embeddings:
240
  - path: context
@@ -242,72 +296,84 @@ datasets:
242
  signals:
243
  - path: context
244
  signal:
 
245
  signal_name: near_dup
246
  - path: context
247
  signal:
248
  signal_name: pii
249
  - path: context
250
  signal:
 
251
  signal_name: lang_detection
252
  - path: context
253
  signal:
254
  embedding: gte-small
255
  namespace: lilac
256
  concept_name: positive-sentiment
 
257
  signal_name: concept_score
258
  - path: context
259
  signal:
260
  embedding: gte-small
261
  namespace: lilac
262
  concept_name: non-english
 
263
  signal_name: concept_score
264
  - path: context
265
  signal:
266
  embedding: gte-small
267
  namespace: lilac
268
  concept_name: toxicity
 
269
  signal_name: concept_score
270
  - path: context
271
  signal:
272
  embedding: gte-small
273
  namespace: lilac
274
  concept_name: question
 
275
  signal_name: concept_score
276
  - path: context
277
  signal:
278
  embedding: gte-small
279
  namespace: lilac
280
  concept_name: legal-termination
 
281
  signal_name: concept_score
282
  - path: context
283
  signal:
284
  embedding: gte-small
285
  namespace: lilac
286
  concept_name: source-code
 
287
  signal_name: concept_score
288
  - path: context
289
  signal:
290
  embedding: gte-small
291
  namespace: lilac
292
  concept_name: negative-sentiment
 
293
  signal_name: concept_score
294
  - path: context
295
  signal:
296
  embedding: gte-small
297
  namespace: lilac
298
  concept_name: profanity
 
299
  signal_name: concept_score
300
  - path: context
301
  signal:
302
  signal_name: text_statistics
303
  - path: question
304
  signal:
 
305
  signal_name: near_dup
306
  - path: question
307
  signal:
308
  signal_name: pii
309
  - path: question
310
  signal:
 
311
  signal_name: lang_detection
312
  - path: question
313
  signal:
@@ -317,6 +383,7 @@ datasets:
317
  - text
318
  - '*'
319
  signal:
 
320
  signal_name: near_dup
321
  - path:
322
  - answers
@@ -329,6 +396,7 @@ datasets:
329
  - text
330
  - '*'
331
  signal:
 
332
  signal_name: lang_detection
333
  - path:
334
  - answers
@@ -341,48 +409,56 @@ datasets:
341
  embedding: gte-small
342
  namespace: lilac
343
  concept_name: legal-termination
 
344
  signal_name: concept_score
345
  - path: question
346
  signal:
347
  embedding: gte-small
348
  namespace: lilac
349
  concept_name: negative-sentiment
 
350
  signal_name: concept_score
351
  - path: question
352
  signal:
353
  embedding: gte-small
354
  namespace: lilac
355
  concept_name: non-english
 
356
  signal_name: concept_score
357
  - path: question
358
  signal:
359
  embedding: gte-small
360
  namespace: lilac
361
  concept_name: positive-sentiment
 
362
  signal_name: concept_score
363
  - path: question
364
  signal:
365
  embedding: gte-small
366
  namespace: lilac
367
  concept_name: profanity
 
368
  signal_name: concept_score
369
  - path: question
370
  signal:
371
  embedding: gte-small
372
  namespace: lilac
373
  concept_name: question
 
374
  signal_name: concept_score
375
  - path: question
376
  signal:
377
  embedding: gte-small
378
  namespace: lilac
379
  concept_name: source-code
 
380
  signal_name: concept_score
381
  - path: question
382
  signal:
383
  embedding: gte-small
384
  namespace: lilac
385
  concept_name: toxicity
 
386
  signal_name: concept_score
387
  - path:
388
  - answers
@@ -392,6 +468,7 @@ datasets:
392
  embedding: gte-small
393
  namespace: lilac
394
  concept_name: legal-termination
 
395
  signal_name: concept_score
396
  - path:
397
  - answers
@@ -401,6 +478,7 @@ datasets:
401
  embedding: gte-small
402
  namespace: lilac
403
  concept_name: negative-sentiment
 
404
  signal_name: concept_score
405
  - path:
406
  - answers
@@ -410,6 +488,7 @@ datasets:
410
  embedding: gte-small
411
  namespace: lilac
412
  concept_name: non-english
 
413
  signal_name: concept_score
414
  - path:
415
  - answers
@@ -419,6 +498,7 @@ datasets:
419
  embedding: gte-small
420
  namespace: lilac
421
  concept_name: positive-sentiment
 
422
  signal_name: concept_score
423
  - path:
424
  - answers
@@ -428,6 +508,7 @@ datasets:
428
  embedding: gte-small
429
  namespace: lilac
430
  concept_name: profanity
 
431
  signal_name: concept_score
432
  - path:
433
  - answers
@@ -437,6 +518,7 @@ datasets:
437
  embedding: gte-small
438
  namespace: lilac
439
  concept_name: question
 
440
  signal_name: concept_score
441
  - path:
442
  - answers
@@ -446,6 +528,7 @@ datasets:
446
  embedding: gte-small
447
  namespace: lilac
448
  concept_name: source-code
 
449
  signal_name: concept_score
450
  - path:
451
  - answers
@@ -455,6 +538,7 @@ datasets:
455
  embedding: gte-small
456
  namespace: lilac
457
  concept_name: toxicity
 
458
  signal_name: concept_score
459
  settings:
460
  ui:
@@ -464,11 +548,18 @@ datasets:
464
  - - answers
465
  - text
466
  - '*'
 
467
  preferred_embedding: gte-small
468
  - namespace: lilac
469
  name: databricks-dolly-15k-curated-en
 
470
  source:
471
  dataset_name: argilla/databricks-dolly-15k-curated-en
 
 
 
 
 
472
  source_name: huggingface
473
  embeddings:
474
  - path: original-context
@@ -481,36 +572,42 @@ datasets:
481
  signals:
482
  - path: original-instruction
483
  signal:
 
484
  signal_name: near_dup
485
  - path: original-instruction
486
  signal:
487
  signal_name: pii
488
  - path: original-instruction
489
  signal:
 
490
  signal_name: lang_detection
491
  - path: original-instruction
492
  signal:
493
  signal_name: text_statistics
494
  - path: original-context
495
  signal:
 
496
  signal_name: near_dup
497
  - path: original-context
498
  signal:
499
  signal_name: pii
500
  - path: original-context
501
  signal:
 
502
  signal_name: lang_detection
503
  - path: original-context
504
  signal:
505
  signal_name: text_statistics
506
  - path: original-response
507
  signal:
 
508
  signal_name: near_dup
509
  - path: original-response
510
  signal:
511
  signal_name: pii
512
  - path: original-response
513
  signal:
 
514
  signal_name: lang_detection
515
  - path: original-response
516
  signal:
@@ -520,6 +617,7 @@ datasets:
520
  - value
521
  - '*'
522
  signal:
 
523
  signal_name: near_dup
524
  - path:
525
  - new-instruction
@@ -532,6 +630,7 @@ datasets:
532
  - value
533
  - '*'
534
  signal:
 
535
  signal_name: lang_detection
536
  - path:
537
  - new-instruction
@@ -544,6 +643,7 @@ datasets:
544
  - value
545
  - '*'
546
  signal:
 
547
  signal_name: near_dup
548
  - path:
549
  - new-context
@@ -556,6 +656,7 @@ datasets:
556
  - value
557
  - '*'
558
  signal:
 
559
  signal_name: lang_detection
560
  - path:
561
  - new-context
@@ -568,6 +669,7 @@ datasets:
568
  - value
569
  - '*'
570
  signal:
 
571
  signal_name: near_dup
572
  - path:
573
  - new-response
@@ -580,6 +682,7 @@ datasets:
580
  - value
581
  - '*'
582
  signal:
 
583
  signal_name: lang_detection
584
  - path:
585
  - new-response
@@ -592,144 +695,168 @@ datasets:
592
  embedding: gte-small
593
  namespace: lilac
594
  concept_name: legal-termination
 
595
  signal_name: concept_score
596
  - path: original-instruction
597
  signal:
598
  embedding: gte-small
599
  namespace: lilac
600
  concept_name: negative-sentiment
 
601
  signal_name: concept_score
602
  - path: original-instruction
603
  signal:
604
  embedding: gte-small
605
  namespace: lilac
606
  concept_name: non-english
 
607
  signal_name: concept_score
608
  - path: original-instruction
609
  signal:
610
  embedding: gte-small
611
  namespace: lilac
612
  concept_name: positive-sentiment
 
613
  signal_name: concept_score
614
  - path: original-instruction
615
  signal:
616
  embedding: gte-small
617
  namespace: lilac
618
  concept_name: profanity
 
619
  signal_name: concept_score
620
  - path: original-instruction
621
  signal:
622
  embedding: gte-small
623
  namespace: lilac
624
  concept_name: question
 
625
  signal_name: concept_score
626
  - path: original-instruction
627
  signal:
628
  embedding: gte-small
629
  namespace: lilac
630
  concept_name: source-code
 
631
  signal_name: concept_score
632
  - path: original-instruction
633
  signal:
634
  embedding: gte-small
635
  namespace: lilac
636
  concept_name: toxicity
 
637
  signal_name: concept_score
638
  - path: original-context
639
  signal:
640
  embedding: gte-small
641
  namespace: lilac
642
  concept_name: legal-termination
 
643
  signal_name: concept_score
644
  - path: original-context
645
  signal:
646
  embedding: gte-small
647
  namespace: lilac
648
  concept_name: negative-sentiment
 
649
  signal_name: concept_score
650
  - path: original-context
651
  signal:
652
  embedding: gte-small
653
  namespace: lilac
654
  concept_name: non-english
 
655
  signal_name: concept_score
656
  - path: original-context
657
  signal:
658
  embedding: gte-small
659
  namespace: lilac
660
  concept_name: positive-sentiment
 
661
  signal_name: concept_score
662
  - path: original-context
663
  signal:
664
  embedding: gte-small
665
  namespace: lilac
666
  concept_name: profanity
 
667
  signal_name: concept_score
668
  - path: original-context
669
  signal:
670
  embedding: gte-small
671
  namespace: lilac
672
  concept_name: question
 
673
  signal_name: concept_score
674
  - path: original-context
675
  signal:
676
  embedding: gte-small
677
  namespace: lilac
678
  concept_name: source-code
 
679
  signal_name: concept_score
680
  - path: original-context
681
  signal:
682
  embedding: gte-small
683
  namespace: lilac
684
  concept_name: toxicity
 
685
  signal_name: concept_score
686
  - path: original-response
687
  signal:
688
  embedding: gte-small
689
  namespace: lilac
690
  concept_name: legal-termination
 
691
  signal_name: concept_score
692
  - path: original-response
693
  signal:
694
  embedding: gte-small
695
  namespace: lilac
696
  concept_name: negative-sentiment
 
697
  signal_name: concept_score
698
  - path: original-response
699
  signal:
700
  embedding: gte-small
701
  namespace: lilac
702
  concept_name: non-english
 
703
  signal_name: concept_score
704
  - path: original-response
705
  signal:
706
  embedding: gte-small
707
  namespace: lilac
708
  concept_name: positive-sentiment
 
709
  signal_name: concept_score
710
  - path: original-response
711
  signal:
712
  embedding: gte-small
713
  namespace: lilac
714
  concept_name: profanity
 
715
  signal_name: concept_score
716
  - path: original-response
717
  signal:
718
  embedding: gte-small
719
  namespace: lilac
720
  concept_name: question
 
721
  signal_name: concept_score
722
  - path: original-response
723
  signal:
724
  embedding: gte-small
725
  namespace: lilac
726
  concept_name: source-code
 
727
  signal_name: concept_score
728
  - path: original-response
729
  signal:
730
  embedding: gte-small
731
  namespace: lilac
732
  concept_name: toxicity
 
733
  signal_name: concept_score
734
  - path:
735
  - new-instruction
@@ -739,6 +866,7 @@ datasets:
739
  embedding: gte-small
740
  namespace: lilac
741
  concept_name: legal-termination
 
742
  signal_name: concept_score
743
  - path:
744
  - new-instruction
@@ -748,6 +876,7 @@ datasets:
748
  embedding: gte-small
749
  namespace: lilac
750
  concept_name: negative-sentiment
 
751
  signal_name: concept_score
752
  - path:
753
  - new-instruction
@@ -757,6 +886,7 @@ datasets:
757
  embedding: gte-small
758
  namespace: lilac
759
  concept_name: non-english
 
760
  signal_name: concept_score
761
  - path:
762
  - new-instruction
@@ -766,6 +896,7 @@ datasets:
766
  embedding: gte-small
767
  namespace: lilac
768
  concept_name: positive-sentiment
 
769
  signal_name: concept_score
770
  - path:
771
  - new-instruction
@@ -775,6 +906,7 @@ datasets:
775
  embedding: gte-small
776
  namespace: lilac
777
  concept_name: profanity
 
778
  signal_name: concept_score
779
  - path:
780
  - new-instruction
@@ -784,6 +916,7 @@ datasets:
784
  embedding: gte-small
785
  namespace: lilac
786
  concept_name: question
 
787
  signal_name: concept_score
788
  - path:
789
  - new-instruction
@@ -793,6 +926,7 @@ datasets:
793
  embedding: gte-small
794
  namespace: lilac
795
  concept_name: source-code
 
796
  signal_name: concept_score
797
  - path:
798
  - new-instruction
@@ -802,6 +936,7 @@ datasets:
802
  embedding: gte-small
803
  namespace: lilac
804
  concept_name: toxicity
 
805
  signal_name: concept_score
806
  - path:
807
  - new-context
@@ -811,6 +946,7 @@ datasets:
811
  embedding: gte-small
812
  namespace: lilac
813
  concept_name: legal-termination
 
814
  signal_name: concept_score
815
  - path:
816
  - new-context
@@ -820,6 +956,7 @@ datasets:
820
  embedding: gte-small
821
  namespace: lilac
822
  concept_name: negative-sentiment
 
823
  signal_name: concept_score
824
  - path:
825
  - new-context
@@ -829,6 +966,7 @@ datasets:
829
  embedding: gte-small
830
  namespace: lilac
831
  concept_name: non-english
 
832
  signal_name: concept_score
833
  - path:
834
  - new-context
@@ -838,6 +976,7 @@ datasets:
838
  embedding: gte-small
839
  namespace: lilac
840
  concept_name: positive-sentiment
 
841
  signal_name: concept_score
842
  - path:
843
  - new-context
@@ -847,6 +986,7 @@ datasets:
847
  embedding: gte-small
848
  namespace: lilac
849
  concept_name: profanity
 
850
  signal_name: concept_score
851
  - path:
852
  - new-context
@@ -856,6 +996,7 @@ datasets:
856
  embedding: gte-small
857
  namespace: lilac
858
  concept_name: question
 
859
  signal_name: concept_score
860
  - path:
861
  - new-context
@@ -865,6 +1006,7 @@ datasets:
865
  embedding: gte-small
866
  namespace: lilac
867
  concept_name: source-code
 
868
  signal_name: concept_score
869
  - path:
870
  - new-context
@@ -874,6 +1016,7 @@ datasets:
874
  embedding: gte-small
875
  namespace: lilac
876
  concept_name: toxicity
 
877
  signal_name: concept_score
878
  - path:
879
  - new-response
@@ -883,6 +1026,7 @@ datasets:
883
  embedding: gte-small
884
  namespace: lilac
885
  concept_name: legal-termination
 
886
  signal_name: concept_score
887
  - path:
888
  - new-response
@@ -892,6 +1036,7 @@ datasets:
892
  embedding: gte-small
893
  namespace: lilac
894
  concept_name: negative-sentiment
 
895
  signal_name: concept_score
896
  - path:
897
  - new-response
@@ -901,6 +1046,7 @@ datasets:
901
  embedding: gte-small
902
  namespace: lilac
903
  concept_name: non-english
 
904
  signal_name: concept_score
905
  - path:
906
  - new-response
@@ -910,6 +1056,7 @@ datasets:
910
  embedding: gte-small
911
  namespace: lilac
912
  concept_name: positive-sentiment
 
913
  signal_name: concept_score
914
  - path:
915
  - new-response
@@ -919,6 +1066,7 @@ datasets:
919
  embedding: gte-small
920
  namespace: lilac
921
  concept_name: profanity
 
922
  signal_name: concept_score
923
  - path:
924
  - new-response
@@ -928,6 +1076,7 @@ datasets:
928
  embedding: gte-small
929
  namespace: lilac
930
  concept_name: question
 
931
  signal_name: concept_score
932
  - path:
933
  - new-response
@@ -937,6 +1086,7 @@ datasets:
937
  embedding: gte-small
938
  namespace: lilac
939
  concept_name: source-code
 
940
  signal_name: concept_score
941
  - path:
942
  - new-response
@@ -946,6 +1096,7 @@ datasets:
946
  embedding: gte-small
947
  namespace: lilac
948
  concept_name: toxicity
 
949
  signal_name: concept_score
950
  settings:
951
  ui:
@@ -962,11 +1113,18 @@ datasets:
962
  - - new-response
963
  - value
964
  - '*'
 
965
  preferred_embedding: gte-small
966
  - namespace: lilac
967
  name: piqa
 
968
  source:
969
  dataset_name: piqa
 
 
 
 
 
970
  source_name: huggingface
971
  embeddings:
972
  - path: goal
@@ -978,180 +1136,210 @@ datasets:
978
  signals:
979
  - path: goal
980
  signal:
 
981
  signal_name: near_dup
982
  - path: goal
983
  signal:
984
  signal_name: pii
985
  - path: goal
986
  signal:
 
987
  signal_name: lang_detection
988
  - path: goal
989
  signal:
990
  embedding: gte-small
991
  namespace: lilac
992
  concept_name: positive-sentiment
 
993
  signal_name: concept_score
994
  - path: goal
995
  signal:
996
  embedding: gte-small
997
  namespace: lilac
998
  concept_name: non-english
 
999
  signal_name: concept_score
1000
  - path: goal
1001
  signal:
1002
  embedding: gte-small
1003
  namespace: lilac
1004
  concept_name: toxicity
 
1005
  signal_name: concept_score
1006
  - path: goal
1007
  signal:
1008
  embedding: gte-small
1009
  namespace: lilac
1010
  concept_name: question
 
1011
  signal_name: concept_score
1012
  - path: goal
1013
  signal:
1014
  embedding: gte-small
1015
  namespace: lilac
1016
  concept_name: legal-termination
 
1017
  signal_name: concept_score
1018
  - path: goal
1019
  signal:
1020
  embedding: gte-small
1021
  namespace: lilac
1022
  concept_name: source-code
 
1023
  signal_name: concept_score
1024
  - path: goal
1025
  signal:
1026
  embedding: gte-small
1027
  namespace: lilac
1028
  concept_name: negative-sentiment
 
1029
  signal_name: concept_score
1030
  - path: goal
1031
  signal:
1032
  embedding: gte-small
1033
  namespace: lilac
1034
  concept_name: profanity
 
1035
  signal_name: concept_score
1036
  - path: goal
1037
  signal:
1038
  signal_name: text_statistics
1039
  - path: sol1
1040
  signal:
 
1041
  signal_name: near_dup
1042
  - path: sol1
1043
  signal:
1044
  signal_name: pii
1045
  - path: sol1
1046
  signal:
 
1047
  signal_name: lang_detection
1048
  - path: sol1
1049
  signal:
1050
  embedding: gte-small
1051
  namespace: lilac
1052
  concept_name: positive-sentiment
 
1053
  signal_name: concept_score
1054
  - path: sol1
1055
  signal:
1056
  embedding: gte-small
1057
  namespace: lilac
1058
  concept_name: non-english
 
1059
  signal_name: concept_score
1060
  - path: sol1
1061
  signal:
1062
  embedding: gte-small
1063
  namespace: lilac
1064
  concept_name: toxicity
 
1065
  signal_name: concept_score
1066
  - path: sol1
1067
  signal:
1068
  embedding: gte-small
1069
  namespace: lilac
1070
  concept_name: question
 
1071
  signal_name: concept_score
1072
  - path: sol1
1073
  signal:
1074
  embedding: gte-small
1075
  namespace: lilac
1076
  concept_name: legal-termination
 
1077
  signal_name: concept_score
1078
  - path: sol1
1079
  signal:
1080
  embedding: gte-small
1081
  namespace: lilac
1082
  concept_name: source-code
 
1083
  signal_name: concept_score
1084
  - path: sol1
1085
  signal:
1086
  embedding: gte-small
1087
  namespace: lilac
1088
  concept_name: negative-sentiment
 
1089
  signal_name: concept_score
1090
  - path: sol1
1091
  signal:
1092
  embedding: gte-small
1093
  namespace: lilac
1094
  concept_name: profanity
 
1095
  signal_name: concept_score
1096
  - path: sol1
1097
  signal:
1098
  signal_name: text_statistics
1099
  - path: sol2
1100
  signal:
 
1101
  signal_name: near_dup
1102
  - path: sol2
1103
  signal:
1104
  signal_name: pii
1105
  - path: sol2
1106
  signal:
 
1107
  signal_name: lang_detection
1108
  - path: sol2
1109
  signal:
1110
  embedding: gte-small
1111
  namespace: lilac
1112
  concept_name: positive-sentiment
 
1113
  signal_name: concept_score
1114
  - path: sol2
1115
  signal:
1116
  embedding: gte-small
1117
  namespace: lilac
1118
  concept_name: non-english
 
1119
  signal_name: concept_score
1120
  - path: sol2
1121
  signal:
1122
  embedding: gte-small
1123
  namespace: lilac
1124
  concept_name: toxicity
 
1125
  signal_name: concept_score
1126
  - path: sol2
1127
  signal:
1128
  embedding: gte-small
1129
  namespace: lilac
1130
  concept_name: question
 
1131
  signal_name: concept_score
1132
  - path: sol2
1133
  signal:
1134
  embedding: gte-small
1135
  namespace: lilac
1136
  concept_name: legal-termination
 
1137
  signal_name: concept_score
1138
  - path: sol2
1139
  signal:
1140
  embedding: gte-small
1141
  namespace: lilac
1142
  concept_name: source-code
 
1143
  signal_name: concept_score
1144
  - path: sol2
1145
  signal:
1146
  embedding: gte-small
1147
  namespace: lilac
1148
  concept_name: negative-sentiment
 
1149
  signal_name: concept_score
1150
  - path: sol2
1151
  signal:
1152
  embedding: gte-small
1153
  namespace: lilac
1154
  concept_name: profanity
 
1155
  signal_name: concept_score
1156
  - path: sol2
1157
  signal:
@@ -1162,12 +1350,18 @@ datasets:
1162
  - sol1
1163
  - sol2
1164
  - goal
 
1165
  preferred_embedding: gte-small
1166
  - namespace: lilac
1167
  name: OpenOrca-100k
 
1168
  source:
1169
  dataset_name: Open-Orca/OpenOrca
 
 
1170
  sample_size: 100000
 
 
1171
  source_name: huggingface
1172
  embeddings:
1173
  - path: question
@@ -1177,120 +1371,140 @@ datasets:
1177
  signals:
1178
  - path: question
1179
  signal:
 
1180
  signal_name: near_dup
1181
  - path: question
1182
  signal:
1183
  signal_name: pii
1184
  - path: question
1185
  signal:
 
1186
  signal_name: lang_detection
1187
  - path: question
1188
  signal:
1189
  embedding: gte-small
1190
  namespace: lilac
1191
  concept_name: positive-sentiment
 
1192
  signal_name: concept_score
1193
  - path: question
1194
  signal:
1195
  embedding: gte-small
1196
  namespace: lilac
1197
  concept_name: non-english
 
1198
  signal_name: concept_score
1199
  - path: question
1200
  signal:
1201
  embedding: gte-small
1202
  namespace: lilac
1203
  concept_name: toxicity
 
1204
  signal_name: concept_score
1205
  - path: question
1206
  signal:
1207
  embedding: gte-small
1208
  namespace: lilac
1209
  concept_name: question
 
1210
  signal_name: concept_score
1211
  - path: question
1212
  signal:
1213
  embedding: gte-small
1214
  namespace: lilac
1215
  concept_name: legal-termination
 
1216
  signal_name: concept_score
1217
  - path: question
1218
  signal:
1219
  embedding: gte-small
1220
  namespace: lilac
1221
  concept_name: source-code
 
1222
  signal_name: concept_score
1223
  - path: question
1224
  signal:
1225
  embedding: gte-small
1226
  namespace: lilac
1227
  concept_name: negative-sentiment
 
1228
  signal_name: concept_score
1229
  - path: question
1230
  signal:
1231
  embedding: gte-small
1232
  namespace: lilac
1233
  concept_name: profanity
 
1234
  signal_name: concept_score
1235
  - path: question
1236
  signal:
1237
  signal_name: text_statistics
1238
  - path: response
1239
  signal:
 
1240
  signal_name: near_dup
1241
  - path: response
1242
  signal:
1243
  signal_name: pii
1244
  - path: response
1245
  signal:
 
1246
  signal_name: lang_detection
1247
  - path: response
1248
  signal:
1249
  embedding: gte-small
1250
  namespace: lilac
1251
  concept_name: positive-sentiment
 
1252
  signal_name: concept_score
1253
  - path: response
1254
  signal:
1255
  embedding: gte-small
1256
  namespace: lilac
1257
  concept_name: non-english
 
1258
  signal_name: concept_score
1259
  - path: response
1260
  signal:
1261
  embedding: gte-small
1262
  namespace: lilac
1263
  concept_name: toxicity
 
1264
  signal_name: concept_score
1265
  - path: response
1266
  signal:
1267
  embedding: gte-small
1268
  namespace: lilac
1269
  concept_name: question
 
1270
  signal_name: concept_score
1271
  - path: response
1272
  signal:
1273
  embedding: gte-small
1274
  namespace: lilac
1275
  concept_name: legal-termination
 
1276
  signal_name: concept_score
1277
  - path: response
1278
  signal:
1279
  embedding: gte-small
1280
  namespace: lilac
1281
  concept_name: source-code
 
1282
  signal_name: concept_score
1283
  - path: response
1284
  signal:
1285
  embedding: gte-small
1286
  namespace: lilac
1287
  concept_name: negative-sentiment
 
1288
  signal_name: concept_score
1289
  - path: response
1290
  signal:
1291
  embedding: gte-small
1292
  namespace: lilac
1293
  concept_name: profanity
 
1294
  signal_name: concept_score
1295
  - path: response
1296
  signal:
@@ -1300,13 +1514,18 @@ datasets:
1300
  media_paths:
1301
  - question
1302
  - response
 
1303
  preferred_embedding: gte-small
1304
  - namespace: lilac
1305
  name: opus100-en-es-validation
 
1306
  source:
1307
  dataset_name: opus100
1308
  config_name: en-es
1309
  split: validation
 
 
 
1310
  source_name: huggingface
1311
  embeddings:
1312
  - path:
@@ -1322,6 +1541,7 @@ datasets:
1322
  - translation
1323
  - en
1324
  signal:
 
1325
  signal_name: near_dup
1326
  - path:
1327
  - translation
@@ -1332,6 +1552,7 @@ datasets:
1332
  - translation
1333
  - en
1334
  signal:
 
1335
  signal_name: lang_detection
1336
  - path:
1337
  - translation
@@ -1340,6 +1561,7 @@ datasets:
1340
  embedding: gte-small
1341
  namespace: lilac
1342
  concept_name: positive-sentiment
 
1343
  signal_name: concept_score
1344
  - path:
1345
  - translation
@@ -1348,6 +1570,7 @@ datasets:
1348
  embedding: gte-small
1349
  namespace: lilac
1350
  concept_name: non-english
 
1351
  signal_name: concept_score
1352
  - path:
1353
  - translation
@@ -1356,6 +1579,7 @@ datasets:
1356
  embedding: gte-small
1357
  namespace: lilac
1358
  concept_name: toxicity
 
1359
  signal_name: concept_score
1360
  - path:
1361
  - translation
@@ -1364,6 +1588,7 @@ datasets:
1364
  embedding: gte-small
1365
  namespace: lilac
1366
  concept_name: question
 
1367
  signal_name: concept_score
1368
  - path:
1369
  - translation
@@ -1372,6 +1597,7 @@ datasets:
1372
  embedding: gte-small
1373
  namespace: lilac
1374
  concept_name: legal-termination
 
1375
  signal_name: concept_score
1376
  - path:
1377
  - translation
@@ -1380,6 +1606,7 @@ datasets:
1380
  embedding: gte-small
1381
  namespace: lilac
1382
  concept_name: source-code
 
1383
  signal_name: concept_score
1384
  - path:
1385
  - translation
@@ -1388,6 +1615,7 @@ datasets:
1388
  embedding: gte-small
1389
  namespace: lilac
1390
  concept_name: negative-sentiment
 
1391
  signal_name: concept_score
1392
  - path:
1393
  - translation
@@ -1396,6 +1624,7 @@ datasets:
1396
  embedding: gte-small
1397
  namespace: lilac
1398
  concept_name: profanity
 
1399
  signal_name: concept_score
1400
  - path:
1401
  - translation
@@ -1406,6 +1635,7 @@ datasets:
1406
  - translation
1407
  - es
1408
  signal:
 
1409
  signal_name: near_dup
1410
  - path:
1411
  - translation
@@ -1416,6 +1646,7 @@ datasets:
1416
  - translation
1417
  - es
1418
  signal:
 
1419
  signal_name: lang_detection
1420
  - path:
1421
  - translation
@@ -1424,6 +1655,7 @@ datasets:
1424
  embedding: gte-small
1425
  namespace: lilac
1426
  concept_name: positive-sentiment
 
1427
  signal_name: concept_score
1428
  - path:
1429
  - translation
@@ -1432,6 +1664,7 @@ datasets:
1432
  embedding: gte-small
1433
  namespace: lilac
1434
  concept_name: non-english
 
1435
  signal_name: concept_score
1436
  - path:
1437
  - translation
@@ -1440,6 +1673,7 @@ datasets:
1440
  embedding: gte-small
1441
  namespace: lilac
1442
  concept_name: toxicity
 
1443
  signal_name: concept_score
1444
  - path:
1445
  - translation
@@ -1448,6 +1682,7 @@ datasets:
1448
  embedding: gte-small
1449
  namespace: lilac
1450
  concept_name: question
 
1451
  signal_name: concept_score
1452
  - path:
1453
  - translation
@@ -1456,6 +1691,7 @@ datasets:
1456
  embedding: gte-small
1457
  namespace: lilac
1458
  concept_name: legal-termination
 
1459
  signal_name: concept_score
1460
  - path:
1461
  - translation
@@ -1464,6 +1700,7 @@ datasets:
1464
  embedding: gte-small
1465
  namespace: lilac
1466
  concept_name: source-code
 
1467
  signal_name: concept_score
1468
  - path:
1469
  - translation
@@ -1472,6 +1709,7 @@ datasets:
1472
  embedding: gte-small
1473
  namespace: lilac
1474
  concept_name: negative-sentiment
 
1475
  signal_name: concept_score
1476
  - path:
1477
  - translation
@@ -1480,6 +1718,7 @@ datasets:
1480
  embedding: gte-small
1481
  namespace: lilac
1482
  concept_name: profanity
 
1483
  signal_name: concept_score
1484
  - path:
1485
  - translation
@@ -1493,12 +1732,18 @@ datasets:
1493
  - es
1494
  - - translation
1495
  - en
 
1496
  preferred_embedding: gte-small
1497
  - namespace: lilac
1498
  name: mmlu_professional_law
 
1499
  source:
1500
  dataset_name: cais/mmlu
1501
  config_name: professional_law
 
 
 
 
1502
  source_name: huggingface
1503
  embeddings:
1504
  - path: question
@@ -1510,60 +1755,70 @@ datasets:
1510
  signals:
1511
  - path: question
1512
  signal:
 
1513
  signal_name: near_dup
1514
  - path: question
1515
  signal:
1516
  signal_name: pii
1517
  - path: question
1518
  signal:
 
1519
  signal_name: lang_detection
1520
  - path: question
1521
  signal:
1522
  embedding: gte-small
1523
  namespace: lilac
1524
  concept_name: positive-sentiment
 
1525
  signal_name: concept_score
1526
  - path: question
1527
  signal:
1528
  embedding: gte-small
1529
  namespace: lilac
1530
  concept_name: non-english
 
1531
  signal_name: concept_score
1532
  - path: question
1533
  signal:
1534
  embedding: gte-small
1535
  namespace: lilac
1536
  concept_name: toxicity
 
1537
  signal_name: concept_score
1538
  - path: question
1539
  signal:
1540
  embedding: gte-small
1541
  namespace: lilac
1542
  concept_name: question
 
1543
  signal_name: concept_score
1544
  - path: question
1545
  signal:
1546
  embedding: gte-small
1547
  namespace: lilac
1548
  concept_name: legal-termination
 
1549
  signal_name: concept_score
1550
  - path: question
1551
  signal:
1552
  embedding: gte-small
1553
  namespace: lilac
1554
  concept_name: source-code
 
1555
  signal_name: concept_score
1556
  - path: question
1557
  signal:
1558
  embedding: gte-small
1559
  namespace: lilac
1560
  concept_name: negative-sentiment
 
1561
  signal_name: concept_score
1562
  - path: question
1563
  signal:
1564
  embedding: gte-small
1565
  namespace: lilac
1566
  concept_name: profanity
 
1567
  signal_name: concept_score
1568
  - path: question
1569
  signal:
@@ -1572,6 +1827,7 @@ datasets:
1572
  - choices
1573
  - '*'
1574
  signal:
 
1575
  signal_name: near_dup
1576
  - path:
1577
  - choices
@@ -1582,6 +1838,7 @@ datasets:
1582
  - choices
1583
  - '*'
1584
  signal:
 
1585
  signal_name: lang_detection
1586
  - path:
1587
  - choices
@@ -1590,6 +1847,7 @@ datasets:
1590
  embedding: gte-small
1591
  namespace: lilac
1592
  concept_name: positive-sentiment
 
1593
  signal_name: concept_score
1594
  - path:
1595
  - choices
@@ -1598,6 +1856,7 @@ datasets:
1598
  embedding: gte-small
1599
  namespace: lilac
1600
  concept_name: non-english
 
1601
  signal_name: concept_score
1602
  - path:
1603
  - choices
@@ -1606,6 +1865,7 @@ datasets:
1606
  embedding: gte-small
1607
  namespace: lilac
1608
  concept_name: toxicity
 
1609
  signal_name: concept_score
1610
  - path:
1611
  - choices
@@ -1614,6 +1874,7 @@ datasets:
1614
  embedding: gte-small
1615
  namespace: lilac
1616
  concept_name: question
 
1617
  signal_name: concept_score
1618
  - path:
1619
  - choices
@@ -1622,6 +1883,7 @@ datasets:
1622
  embedding: gte-small
1623
  namespace: lilac
1624
  concept_name: legal-termination
 
1625
  signal_name: concept_score
1626
  - path:
1627
  - choices
@@ -1630,6 +1892,7 @@ datasets:
1630
  embedding: gte-small
1631
  namespace: lilac
1632
  concept_name: source-code
 
1633
  signal_name: concept_score
1634
  - path:
1635
  - choices
@@ -1638,6 +1901,7 @@ datasets:
1638
  embedding: gte-small
1639
  namespace: lilac
1640
  concept_name: negative-sentiment
 
1641
  signal_name: concept_score
1642
  - path:
1643
  - choices
@@ -1646,6 +1910,7 @@ datasets:
1646
  embedding: gte-small
1647
  namespace: lilac
1648
  concept_name: negative-sentiment
 
1649
  signal_name: concept_score
1650
  - path:
1651
  - choices
@@ -1654,6 +1919,7 @@ datasets:
1654
  embedding: gte-small
1655
  namespace: lilac
1656
  concept_name: profanity
 
1657
  signal_name: concept_score
1658
  - path:
1659
  - choices
@@ -1666,12 +1932,18 @@ datasets:
1666
  - question
1667
  - - choices
1668
  - '*'
 
1669
  preferred_embedding: gte-small
1670
  - namespace: lilac
1671
  name: pile-of-law-r-legaladvice
 
1672
  source:
1673
  dataset_name: pile-of-law/pile-of-law
1674
  config_name: r_legaladvice
 
 
 
 
1675
  source_name: huggingface
1676
  embeddings:
1677
  - path: text
@@ -1679,60 +1951,70 @@ datasets:
1679
  signals:
1680
  - path: text
1681
  signal:
 
1682
  signal_name: near_dup
1683
  - path: text
1684
  signal:
1685
  signal_name: pii
1686
  - path: text
1687
  signal:
 
1688
  signal_name: lang_detection
1689
  - path: text
1690
  signal:
1691
  embedding: gte-small
1692
  namespace: lilac
1693
  concept_name: positive-sentiment
 
1694
  signal_name: concept_score
1695
  - path: text
1696
  signal:
1697
  embedding: gte-small
1698
  namespace: lilac
1699
  concept_name: non-english
 
1700
  signal_name: concept_score
1701
  - path: text
1702
  signal:
1703
  embedding: gte-small
1704
  namespace: lilac
1705
  concept_name: toxicity
 
1706
  signal_name: concept_score
1707
  - path: text
1708
  signal:
1709
  embedding: gte-small
1710
  namespace: lilac
1711
  concept_name: question
 
1712
  signal_name: concept_score
1713
  - path: text
1714
  signal:
1715
  embedding: gte-small
1716
  namespace: lilac
1717
  concept_name: legal-termination
 
1718
  signal_name: concept_score
1719
  - path: text
1720
  signal:
1721
  embedding: gte-small
1722
  namespace: lilac
1723
  concept_name: source-code
 
1724
  signal_name: concept_score
1725
  - path: text
1726
  signal:
1727
  embedding: gte-small
1728
  namespace: lilac
1729
  concept_name: negative-sentiment
 
1730
  signal_name: concept_score
1731
  - path: text
1732
  signal:
1733
  embedding: gte-small
1734
  namespace: lilac
1735
  concept_name: profanity
 
1736
  signal_name: concept_score
1737
  - path: text
1738
  signal:
@@ -1741,11 +2023,18 @@ datasets:
1741
  ui:
1742
  media_paths:
1743
  - text
 
1744
  preferred_embedding: gte-small
1745
  - namespace: lilac
1746
  name: science-qa-derek-thomas
 
1747
  source:
1748
  dataset_name: derek-thomas/ScienceQA
 
 
 
 
 
1749
  source_name: huggingface
1750
  embeddings:
1751
  - path: lecture
@@ -1753,12 +2042,14 @@ datasets:
1753
  signals:
1754
  - path: lecture
1755
  signal:
 
1756
  signal_name: near_dup
1757
  - path: lecture
1758
  signal:
1759
  signal_name: pii
1760
  - path: lecture
1761
  signal:
 
1762
  signal_name: lang_detection
1763
  - path: lecture
1764
  signal:
@@ -1768,60 +2059,73 @@ datasets:
1768
  embedding: gte-small
1769
  namespace: lilac
1770
  concept_name: legal-termination
 
1771
  signal_name: concept_score
1772
  - path: lecture
1773
  signal:
1774
  embedding: gte-small
1775
  namespace: lilac
1776
  concept_name: negative-sentiment
 
1777
  signal_name: concept_score
1778
  - path: lecture
1779
  signal:
1780
  embedding: gte-small
1781
  namespace: lilac
1782
  concept_name: non-english
 
1783
  signal_name: concept_score
1784
  - path: lecture
1785
  signal:
1786
  embedding: gte-small
1787
  namespace: lilac
1788
  concept_name: positive-sentiment
 
1789
  signal_name: concept_score
1790
  - path: lecture
1791
  signal:
1792
  embedding: gte-small
1793
  namespace: lilac
1794
  concept_name: profanity
 
1795
  signal_name: concept_score
1796
  - path: lecture
1797
  signal:
1798
  embedding: gte-small
1799
  namespace: lilac
1800
  concept_name: question
 
1801
  signal_name: concept_score
1802
  - path: lecture
1803
  signal:
1804
  embedding: gte-small
1805
  namespace: lilac
1806
  concept_name: source-code
 
1807
  signal_name: concept_score
1808
  - path: lecture
1809
  signal:
1810
  embedding: gte-small
1811
  namespace: lilac
1812
  concept_name: toxicity
 
1813
  signal_name: concept_score
1814
  settings:
1815
  ui:
1816
  media_paths:
1817
  - lecture
 
1818
  preferred_embedding: gte-small
1819
  - namespace: lilac
1820
  name: enron-emails
 
1821
  source:
1822
  dataset_name: EleutherAI/pile
1823
  config_name: enron_emails
 
1824
  sample_size: 100000
 
 
1825
  source_name: huggingface
1826
  embeddings:
1827
  - path: text
@@ -1829,60 +2133,70 @@ datasets:
1829
  signals:
1830
  - path: text
1831
  signal:
 
1832
  signal_name: near_dup
1833
  - path: text
1834
  signal:
1835
  signal_name: pii
1836
  - path: text
1837
  signal:
 
1838
  signal_name: lang_detection
1839
  - path: text
1840
  signal:
1841
  embedding: gte-small
1842
  namespace: lilac
1843
  concept_name: positive-sentiment
 
1844
  signal_name: concept_score
1845
  - path: text
1846
  signal:
1847
  embedding: gte-small
1848
  namespace: lilac
1849
  concept_name: non-english
 
1850
  signal_name: concept_score
1851
  - path: text
1852
  signal:
1853
  embedding: gte-small
1854
  namespace: lilac
1855
  concept_name: toxicity
 
1856
  signal_name: concept_score
1857
  - path: text
1858
  signal:
1859
  embedding: gte-small
1860
  namespace: lilac
1861
  concept_name: question
 
1862
  signal_name: concept_score
1863
  - path: text
1864
  signal:
1865
  embedding: gte-small
1866
  namespace: lilac
1867
  concept_name: legal-termination
 
1868
  signal_name: concept_score
1869
  - path: text
1870
  signal:
1871
  embedding: gte-small
1872
  namespace: lilac
1873
  concept_name: source-code
 
1874
  signal_name: concept_score
1875
  - path: text
1876
  signal:
1877
  embedding: gte-small
1878
  namespace: lilac
1879
  concept_name: negative-sentiment
 
1880
  signal_name: concept_score
1881
  - path: text
1882
  signal:
1883
  embedding: gte-small
1884
  namespace: lilac
1885
  concept_name: profanity
 
1886
  signal_name: concept_score
1887
  - path: text
1888
  signal:
@@ -1891,12 +2205,17 @@ datasets:
1891
  ui:
1892
  media_paths:
1893
  - text
 
1894
  preferred_embedding: gte-small
1895
  - namespace: lilac
1896
  name: the_movies_dataset
 
1897
  source:
1898
  filepaths:
1899
  - https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv
 
 
 
1900
  source_name: csv
1901
  embeddings:
1902
  - path: overview
@@ -1904,12 +2223,14 @@ datasets:
1904
  signals:
1905
  - path: overview
1906
  signal:
 
1907
  signal_name: near_dup
1908
  - path: overview
1909
  signal:
1910
  signal_name: pii
1911
  - path: overview
1912
  signal:
 
1913
  signal_name: lang_detection
1914
  - path: overview
1915
  signal:
@@ -1919,51 +2240,62 @@ datasets:
1919
  embedding: gte-small
1920
  namespace: lilac
1921
  concept_name: legal-termination
 
1922
  signal_name: concept_score
1923
  - path: overview
1924
  signal:
1925
  embedding: gte-small
1926
  namespace: lilac
1927
  concept_name: negative-sentiment
 
1928
  signal_name: concept_score
1929
  - path: overview
1930
  signal:
1931
  embedding: gte-small
1932
  namespace: lilac
1933
  concept_name: non-english
 
1934
  signal_name: concept_score
1935
  - path: overview
1936
  signal:
1937
  embedding: gte-small
1938
  namespace: lilac
1939
  concept_name: positive-sentiment
 
1940
  signal_name: concept_score
1941
  - path: overview
1942
  signal:
1943
  embedding: gte-small
1944
  namespace: lilac
1945
  concept_name: profanity
 
1946
  signal_name: concept_score
1947
  - path: overview
1948
  signal:
1949
  embedding: gte-small
1950
  namespace: lilac
1951
  concept_name: question
 
1952
  signal_name: concept_score
1953
  - path: overview
1954
  signal:
1955
  embedding: gte-small
1956
  namespace: lilac
1957
  concept_name: source-code
 
1958
  signal_name: concept_score
1959
  - path: overview
1960
  signal:
1961
  embedding: gte-small
1962
  namespace: lilac
1963
  concept_name: toxicity
 
1964
  signal_name: concept_score
1965
  settings:
1966
  ui:
1967
  media_paths:
1968
  - overview
 
1969
  preferred_embedding: gte-small
 
 
 
 
 
 
1
  datasets:
2
  - namespace: lilac
3
  name: imdb
4
+ tags: []
5
  source:
6
  dataset_name: imdb
7
+ config_name: null
8
+ split: null
9
+ sample_size: null
10
+ revision: null
11
+ load_from_disk: false
12
  source_name: huggingface
13
  embeddings:
14
  - path: text
 
16
  signals:
17
  - path: text
18
  signal:
19
+ threshold: 0.85
20
  signal_name: near_dup
21
  - path: text
22
  signal:
23
  signal_name: pii
24
  - path: text
25
  signal:
26
+ split_by_paragraph: false
27
  signal_name: lang_detection
28
  - path: text
29
  signal:
30
  embedding: gte-small
31
  namespace: lilac
32
  concept_name: positive-sentiment
33
+ draft: main
34
  signal_name: concept_score
35
  - path: text
36
  signal:
37
  embedding: gte-small
38
  namespace: lilac
39
  concept_name: non-english
40
+ draft: main
41
  signal_name: concept_score
42
  - path: text
43
  signal:
44
  embedding: gte-small
45
  namespace: lilac
46
  concept_name: toxicity
47
+ draft: main
48
  signal_name: concept_score
49
  - path: text
50
  signal:
51
  embedding: gte-small
52
  namespace: lilac
53
  concept_name: question
54
+ draft: main
55
  signal_name: concept_score
56
  - path: text
57
  signal:
58
  embedding: gte-small
59
  namespace: lilac
60
  concept_name: legal-termination
61
+ draft: main
62
  signal_name: concept_score
63
  - path: text
64
  signal:
65
  embedding: gte-small
66
  namespace: lilac
67
  concept_name: source-code
68
+ draft: main
69
  signal_name: concept_score
70
  - path: text
71
  signal:
72
  embedding: gte-small
73
  namespace: lilac
74
  concept_name: negative-sentiment
75
+ draft: main
76
  signal_name: concept_score
77
  - path: text
78
  signal:
79
  embedding: gte-small
80
  namespace: lilac
81
  concept_name: profanity
82
+ draft: main
83
  signal_name: concept_score
84
  - path: text
85
  signal:
 
88
  ui:
89
  media_paths:
90
  - text
91
+ markdown_paths: []
92
  preferred_embedding: gte-small
93
  - namespace: lilac
94
  name: open-asssistant-conversations
95
+ tags: []
96
  source:
97
  dataset_name: OpenAssistant/oasst1
98
+ config_name: null
99
+ split: null
100
+ sample_size: null
101
+ revision: null
102
+ load_from_disk: false
103
  source_name: huggingface
104
  embeddings:
105
  - path: text
 
107
  signals:
108
  - path: text
109
  signal:
110
+ threshold: 0.85
111
  signal_name: near_dup
112
  - path: text
113
  signal:
114
  signal_name: pii
115
  - path: text
116
  signal:
117
+ split_by_paragraph: false
118
  signal_name: lang_detection
119
  - path: text
120
  signal:
121
  embedding: gte-small
122
  namespace: lilac
123
  concept_name: positive-sentiment
124
+ draft: main
125
  signal_name: concept_score
126
  - path: text
127
  signal:
128
  embedding: gte-small
129
  namespace: lilac
130
  concept_name: non-english
131
+ draft: main
132
  signal_name: concept_score
133
  - path: text
134
  signal:
135
  embedding: gte-small
136
  namespace: lilac
137
  concept_name: toxicity
138
+ draft: main
139
  signal_name: concept_score
140
  - path: text
141
  signal:
142
  embedding: gte-small
143
  namespace: lilac
144
  concept_name: question
145
+ draft: main
146
  signal_name: concept_score
147
  - path: text
148
  signal:
149
  embedding: gte-small
150
  namespace: lilac
151
  concept_name: legal-termination
152
+ draft: main
153
  signal_name: concept_score
154
  - path: text
155
  signal:
156
  embedding: gte-small
157
  namespace: lilac
158
  concept_name: source-code
159
+ draft: main
160
  signal_name: concept_score
161
  - path: text
162
  signal:
163
  embedding: gte-small
164
  namespace: lilac
165
  concept_name: negative-sentiment
166
+ draft: main
167
  signal_name: concept_score
168
  - path: text
169
  signal:
170
  embedding: gte-small
171
  namespace: lilac
172
  concept_name: negative-sentiment
173
+ draft: main
174
  signal_name: concept_score
175
  - path: text
176
  signal:
177
  embedding: gte-small
178
  namespace: lilac
179
  concept_name: profanity
180
+ draft: main
181
  signal_name: concept_score
182
  - path: text
183
  signal:
 
186
  ui:
187
  media_paths:
188
  - text
189
+ markdown_paths: []
190
  preferred_embedding: gte-small
191
  - namespace: lilac
192
  name: wikitext-2-raw-v1
193
+ tags: []
194
  source:
195
  dataset_name: wikitext
196
  config_name: wikitext-2-raw-v1
197
+ split: null
198
+ sample_size: null
199
+ revision: null
200
+ load_from_disk: false
201
  source_name: huggingface
202
  embeddings:
203
  - path: text
 
205
  signals:
206
  - path: text
207
  signal:
208
+ threshold: 0.85
209
  signal_name: near_dup
210
  - path: text
211
  signal:
212
  signal_name: pii
213
  - path: text
214
  signal:
215
+ split_by_paragraph: false
216
  signal_name: lang_detection
217
  - path: text
218
  signal:
 
222
  embedding: gte-small
223
  namespace: lilac
224
  concept_name: legal-termination
225
+ draft: main
226
  signal_name: concept_score
227
  - path: text
228
  signal:
229
  embedding: gte-small
230
  namespace: lilac
231
  concept_name: negative-sentiment
232
+ draft: main
233
  signal_name: concept_score
234
  - path: text
235
  signal:
236
  embedding: gte-small
237
  namespace: lilac
238
  concept_name: non-english
239
+ draft: main
240
  signal_name: concept_score
241
  - path: text
242
  signal:
243
  embedding: gte-small
244
  namespace: lilac
245
  concept_name: positive-sentiment
246
+ draft: main
247
  signal_name: concept_score
248
  - path: text
249
  signal:
250
  embedding: gte-small
251
  namespace: lilac
252
  concept_name: profanity
253
+ draft: main
254
  signal_name: concept_score
255
  - path: text
256
  signal:
257
  embedding: gte-small
258
  namespace: lilac
259
  concept_name: question
260
+ draft: main
261
  signal_name: concept_score
262
  - path: text
263
  signal:
264
  embedding: gte-small
265
  namespace: lilac
266
  concept_name: source-code
267
+ draft: main
268
  signal_name: concept_score
269
  - path: text
270
  signal:
271
  embedding: gte-small
272
  namespace: lilac
273
  concept_name: toxicity
274
+ draft: main
275
  signal_name: concept_score
276
  settings:
277
  ui:
278
  media_paths:
279
  - text
280
+ markdown_paths: []
281
  preferred_embedding: gte-small
282
  - namespace: lilac
283
  name: squad_v2
284
+ tags: []
285
  source:
286
  dataset_name: squad_v2
287
+ config_name: null
288
+ split: null
289
+ sample_size: null
290
+ revision: null
291
+ load_from_disk: false
292
  source_name: huggingface
293
  embeddings:
294
  - path: context
 
296
  signals:
297
  - path: context
298
  signal:
299
+ threshold: 0.85
300
  signal_name: near_dup
301
  - path: context
302
  signal:
303
  signal_name: pii
304
  - path: context
305
  signal:
306
+ split_by_paragraph: false
307
  signal_name: lang_detection
308
  - path: context
309
  signal:
310
  embedding: gte-small
311
  namespace: lilac
312
  concept_name: positive-sentiment
313
+ draft: main
314
  signal_name: concept_score
315
  - path: context
316
  signal:
317
  embedding: gte-small
318
  namespace: lilac
319
  concept_name: non-english
320
+ draft: main
321
  signal_name: concept_score
322
  - path: context
323
  signal:
324
  embedding: gte-small
325
  namespace: lilac
326
  concept_name: toxicity
327
+ draft: main
328
  signal_name: concept_score
329
  - path: context
330
  signal:
331
  embedding: gte-small
332
  namespace: lilac
333
  concept_name: question
334
+ draft: main
335
  signal_name: concept_score
336
  - path: context
337
  signal:
338
  embedding: gte-small
339
  namespace: lilac
340
  concept_name: legal-termination
341
+ draft: main
342
  signal_name: concept_score
343
  - path: context
344
  signal:
345
  embedding: gte-small
346
  namespace: lilac
347
  concept_name: source-code
348
+ draft: main
349
  signal_name: concept_score
350
  - path: context
351
  signal:
352
  embedding: gte-small
353
  namespace: lilac
354
  concept_name: negative-sentiment
355
+ draft: main
356
  signal_name: concept_score
357
  - path: context
358
  signal:
359
  embedding: gte-small
360
  namespace: lilac
361
  concept_name: profanity
362
+ draft: main
363
  signal_name: concept_score
364
  - path: context
365
  signal:
366
  signal_name: text_statistics
367
  - path: question
368
  signal:
369
+ threshold: 0.85
370
  signal_name: near_dup
371
  - path: question
372
  signal:
373
  signal_name: pii
374
  - path: question
375
  signal:
376
+ split_by_paragraph: false
377
  signal_name: lang_detection
378
  - path: question
379
  signal:
 
383
  - text
384
  - '*'
385
  signal:
386
+ threshold: 0.85
387
  signal_name: near_dup
388
  - path:
389
  - answers
 
396
  - text
397
  - '*'
398
  signal:
399
+ split_by_paragraph: false
400
  signal_name: lang_detection
401
  - path:
402
  - answers
 
409
  embedding: gte-small
410
  namespace: lilac
411
  concept_name: legal-termination
412
+ draft: main
413
  signal_name: concept_score
414
  - path: question
415
  signal:
416
  embedding: gte-small
417
  namespace: lilac
418
  concept_name: negative-sentiment
419
+ draft: main
420
  signal_name: concept_score
421
  - path: question
422
  signal:
423
  embedding: gte-small
424
  namespace: lilac
425
  concept_name: non-english
426
+ draft: main
427
  signal_name: concept_score
428
  - path: question
429
  signal:
430
  embedding: gte-small
431
  namespace: lilac
432
  concept_name: positive-sentiment
433
+ draft: main
434
  signal_name: concept_score
435
  - path: question
436
  signal:
437
  embedding: gte-small
438
  namespace: lilac
439
  concept_name: profanity
440
+ draft: main
441
  signal_name: concept_score
442
  - path: question
443
  signal:
444
  embedding: gte-small
445
  namespace: lilac
446
  concept_name: question
447
+ draft: main
448
  signal_name: concept_score
449
  - path: question
450
  signal:
451
  embedding: gte-small
452
  namespace: lilac
453
  concept_name: source-code
454
+ draft: main
455
  signal_name: concept_score
456
  - path: question
457
  signal:
458
  embedding: gte-small
459
  namespace: lilac
460
  concept_name: toxicity
461
+ draft: main
462
  signal_name: concept_score
463
  - path:
464
  - answers
 
468
  embedding: gte-small
469
  namespace: lilac
470
  concept_name: legal-termination
471
+ draft: main
472
  signal_name: concept_score
473
  - path:
474
  - answers
 
478
  embedding: gte-small
479
  namespace: lilac
480
  concept_name: negative-sentiment
481
+ draft: main
482
  signal_name: concept_score
483
  - path:
484
  - answers
 
488
  embedding: gte-small
489
  namespace: lilac
490
  concept_name: non-english
491
+ draft: main
492
  signal_name: concept_score
493
  - path:
494
  - answers
 
498
  embedding: gte-small
499
  namespace: lilac
500
  concept_name: positive-sentiment
501
+ draft: main
502
  signal_name: concept_score
503
  - path:
504
  - answers
 
508
  embedding: gte-small
509
  namespace: lilac
510
  concept_name: profanity
511
+ draft: main
512
  signal_name: concept_score
513
  - path:
514
  - answers
 
518
  embedding: gte-small
519
  namespace: lilac
520
  concept_name: question
521
+ draft: main
522
  signal_name: concept_score
523
  - path:
524
  - answers
 
528
  embedding: gte-small
529
  namespace: lilac
530
  concept_name: source-code
531
+ draft: main
532
  signal_name: concept_score
533
  - path:
534
  - answers
 
538
  embedding: gte-small
539
  namespace: lilac
540
  concept_name: toxicity
541
+ draft: main
542
  signal_name: concept_score
543
  settings:
544
  ui:
 
548
  - - answers
549
  - text
550
  - '*'
551
+ markdown_paths: []
552
  preferred_embedding: gte-small
553
  - namespace: lilac
554
  name: databricks-dolly-15k-curated-en
555
+ tags: []
556
  source:
557
  dataset_name: argilla/databricks-dolly-15k-curated-en
558
+ config_name: null
559
+ split: null
560
+ sample_size: null
561
+ revision: null
562
+ load_from_disk: false
563
  source_name: huggingface
564
  embeddings:
565
  - path: original-context
 
572
  signals:
573
  - path: original-instruction
574
  signal:
575
+ threshold: 0.85
576
  signal_name: near_dup
577
  - path: original-instruction
578
  signal:
579
  signal_name: pii
580
  - path: original-instruction
581
  signal:
582
+ split_by_paragraph: false
583
  signal_name: lang_detection
584
  - path: original-instruction
585
  signal:
586
  signal_name: text_statistics
587
  - path: original-context
588
  signal:
589
+ threshold: 0.85
590
  signal_name: near_dup
591
  - path: original-context
592
  signal:
593
  signal_name: pii
594
  - path: original-context
595
  signal:
596
+ split_by_paragraph: false
597
  signal_name: lang_detection
598
  - path: original-context
599
  signal:
600
  signal_name: text_statistics
601
  - path: original-response
602
  signal:
603
+ threshold: 0.85
604
  signal_name: near_dup
605
  - path: original-response
606
  signal:
607
  signal_name: pii
608
  - path: original-response
609
  signal:
610
+ split_by_paragraph: false
611
  signal_name: lang_detection
612
  - path: original-response
613
  signal:
 
617
  - value
618
  - '*'
619
  signal:
620
+ threshold: 0.85
621
  signal_name: near_dup
622
  - path:
623
  - new-instruction
 
630
  - value
631
  - '*'
632
  signal:
633
+ split_by_paragraph: false
634
  signal_name: lang_detection
635
  - path:
636
  - new-instruction
 
643
  - value
644
  - '*'
645
  signal:
646
+ threshold: 0.85
647
  signal_name: near_dup
648
  - path:
649
  - new-context
 
656
  - value
657
  - '*'
658
  signal:
659
+ split_by_paragraph: false
660
  signal_name: lang_detection
661
  - path:
662
  - new-context
 
669
  - value
670
  - '*'
671
  signal:
672
+ threshold: 0.85
673
  signal_name: near_dup
674
  - path:
675
  - new-response
 
682
  - value
683
  - '*'
684
  signal:
685
+ split_by_paragraph: false
686
  signal_name: lang_detection
687
  - path:
688
  - new-response
 
695
  embedding: gte-small
696
  namespace: lilac
697
  concept_name: legal-termination
698
+ draft: main
699
  signal_name: concept_score
700
  - path: original-instruction
701
  signal:
702
  embedding: gte-small
703
  namespace: lilac
704
  concept_name: negative-sentiment
705
+ draft: main
706
  signal_name: concept_score
707
  - path: original-instruction
708
  signal:
709
  embedding: gte-small
710
  namespace: lilac
711
  concept_name: non-english
712
+ draft: main
713
  signal_name: concept_score
714
  - path: original-instruction
715
  signal:
716
  embedding: gte-small
717
  namespace: lilac
718
  concept_name: positive-sentiment
719
+ draft: main
720
  signal_name: concept_score
721
  - path: original-instruction
722
  signal:
723
  embedding: gte-small
724
  namespace: lilac
725
  concept_name: profanity
726
+ draft: main
727
  signal_name: concept_score
728
  - path: original-instruction
729
  signal:
730
  embedding: gte-small
731
  namespace: lilac
732
  concept_name: question
733
+ draft: main
734
  signal_name: concept_score
735
  - path: original-instruction
736
  signal:
737
  embedding: gte-small
738
  namespace: lilac
739
  concept_name: source-code
740
+ draft: main
741
  signal_name: concept_score
742
  - path: original-instruction
743
  signal:
744
  embedding: gte-small
745
  namespace: lilac
746
  concept_name: toxicity
747
+ draft: main
748
  signal_name: concept_score
749
  - path: original-context
750
  signal:
751
  embedding: gte-small
752
  namespace: lilac
753
  concept_name: legal-termination
754
+ draft: main
755
  signal_name: concept_score
756
  - path: original-context
757
  signal:
758
  embedding: gte-small
759
  namespace: lilac
760
  concept_name: negative-sentiment
761
+ draft: main
762
  signal_name: concept_score
763
  - path: original-context
764
  signal:
765
  embedding: gte-small
766
  namespace: lilac
767
  concept_name: non-english
768
+ draft: main
769
  signal_name: concept_score
770
  - path: original-context
771
  signal:
772
  embedding: gte-small
773
  namespace: lilac
774
  concept_name: positive-sentiment
775
+ draft: main
776
  signal_name: concept_score
777
  - path: original-context
778
  signal:
779
  embedding: gte-small
780
  namespace: lilac
781
  concept_name: profanity
782
+ draft: main
783
  signal_name: concept_score
784
  - path: original-context
785
  signal:
786
  embedding: gte-small
787
  namespace: lilac
788
  concept_name: question
789
+ draft: main
790
  signal_name: concept_score
791
  - path: original-context
792
  signal:
793
  embedding: gte-small
794
  namespace: lilac
795
  concept_name: source-code
796
+ draft: main
797
  signal_name: concept_score
798
  - path: original-context
799
  signal:
800
  embedding: gte-small
801
  namespace: lilac
802
  concept_name: toxicity
803
+ draft: main
804
  signal_name: concept_score
805
  - path: original-response
806
  signal:
807
  embedding: gte-small
808
  namespace: lilac
809
  concept_name: legal-termination
810
+ draft: main
811
  signal_name: concept_score
812
  - path: original-response
813
  signal:
814
  embedding: gte-small
815
  namespace: lilac
816
  concept_name: negative-sentiment
817
+ draft: main
818
  signal_name: concept_score
819
  - path: original-response
820
  signal:
821
  embedding: gte-small
822
  namespace: lilac
823
  concept_name: non-english
824
+ draft: main
825
  signal_name: concept_score
826
  - path: original-response
827
  signal:
828
  embedding: gte-small
829
  namespace: lilac
830
  concept_name: positive-sentiment
831
+ draft: main
832
  signal_name: concept_score
833
  - path: original-response
834
  signal:
835
  embedding: gte-small
836
  namespace: lilac
837
  concept_name: profanity
838
+ draft: main
839
  signal_name: concept_score
840
  - path: original-response
841
  signal:
842
  embedding: gte-small
843
  namespace: lilac
844
  concept_name: question
845
+ draft: main
846
  signal_name: concept_score
847
  - path: original-response
848
  signal:
849
  embedding: gte-small
850
  namespace: lilac
851
  concept_name: source-code
852
+ draft: main
853
  signal_name: concept_score
854
  - path: original-response
855
  signal:
856
  embedding: gte-small
857
  namespace: lilac
858
  concept_name: toxicity
859
+ draft: main
860
  signal_name: concept_score
861
  - path:
862
  - new-instruction
 
866
  embedding: gte-small
867
  namespace: lilac
868
  concept_name: legal-termination
869
+ draft: main
870
  signal_name: concept_score
871
  - path:
872
  - new-instruction
 
876
  embedding: gte-small
877
  namespace: lilac
878
  concept_name: negative-sentiment
879
+ draft: main
880
  signal_name: concept_score
881
  - path:
882
  - new-instruction
 
886
  embedding: gte-small
887
  namespace: lilac
888
  concept_name: non-english
889
+ draft: main
890
  signal_name: concept_score
891
  - path:
892
  - new-instruction
 
896
  embedding: gte-small
897
  namespace: lilac
898
  concept_name: positive-sentiment
899
+ draft: main
900
  signal_name: concept_score
901
  - path:
902
  - new-instruction
 
906
  embedding: gte-small
907
  namespace: lilac
908
  concept_name: profanity
909
+ draft: main
910
  signal_name: concept_score
911
  - path:
912
  - new-instruction
 
916
  embedding: gte-small
917
  namespace: lilac
918
  concept_name: question
919
+ draft: main
920
  signal_name: concept_score
921
  - path:
922
  - new-instruction
 
926
  embedding: gte-small
927
  namespace: lilac
928
  concept_name: source-code
929
+ draft: main
930
  signal_name: concept_score
931
  - path:
932
  - new-instruction
 
936
  embedding: gte-small
937
  namespace: lilac
938
  concept_name: toxicity
939
+ draft: main
940
  signal_name: concept_score
941
  - path:
942
  - new-context
 
946
  embedding: gte-small
947
  namespace: lilac
948
  concept_name: legal-termination
949
+ draft: main
950
  signal_name: concept_score
951
  - path:
952
  - new-context
 
956
  embedding: gte-small
957
  namespace: lilac
958
  concept_name: negative-sentiment
959
+ draft: main
960
  signal_name: concept_score
961
  - path:
962
  - new-context
 
966
  embedding: gte-small
967
  namespace: lilac
968
  concept_name: non-english
969
+ draft: main
970
  signal_name: concept_score
971
  - path:
972
  - new-context
 
976
  embedding: gte-small
977
  namespace: lilac
978
  concept_name: positive-sentiment
979
+ draft: main
980
  signal_name: concept_score
981
  - path:
982
  - new-context
 
986
  embedding: gte-small
987
  namespace: lilac
988
  concept_name: profanity
989
+ draft: main
990
  signal_name: concept_score
991
  - path:
992
  - new-context
 
996
  embedding: gte-small
997
  namespace: lilac
998
  concept_name: question
999
+ draft: main
1000
  signal_name: concept_score
1001
  - path:
1002
  - new-context
 
1006
  embedding: gte-small
1007
  namespace: lilac
1008
  concept_name: source-code
1009
+ draft: main
1010
  signal_name: concept_score
1011
  - path:
1012
  - new-context
 
1016
  embedding: gte-small
1017
  namespace: lilac
1018
  concept_name: toxicity
1019
+ draft: main
1020
  signal_name: concept_score
1021
  - path:
1022
  - new-response
 
1026
  embedding: gte-small
1027
  namespace: lilac
1028
  concept_name: legal-termination
1029
+ draft: main
1030
  signal_name: concept_score
1031
  - path:
1032
  - new-response
 
1036
  embedding: gte-small
1037
  namespace: lilac
1038
  concept_name: negative-sentiment
1039
+ draft: main
1040
  signal_name: concept_score
1041
  - path:
1042
  - new-response
 
1046
  embedding: gte-small
1047
  namespace: lilac
1048
  concept_name: non-english
1049
+ draft: main
1050
  signal_name: concept_score
1051
  - path:
1052
  - new-response
 
1056
  embedding: gte-small
1057
  namespace: lilac
1058
  concept_name: positive-sentiment
1059
+ draft: main
1060
  signal_name: concept_score
1061
  - path:
1062
  - new-response
 
1066
  embedding: gte-small
1067
  namespace: lilac
1068
  concept_name: profanity
1069
+ draft: main
1070
  signal_name: concept_score
1071
  - path:
1072
  - new-response
 
1076
  embedding: gte-small
1077
  namespace: lilac
1078
  concept_name: question
1079
+ draft: main
1080
  signal_name: concept_score
1081
  - path:
1082
  - new-response
 
1086
  embedding: gte-small
1087
  namespace: lilac
1088
  concept_name: source-code
1089
+ draft: main
1090
  signal_name: concept_score
1091
  - path:
1092
  - new-response
 
1096
  embedding: gte-small
1097
  namespace: lilac
1098
  concept_name: toxicity
1099
+ draft: main
1100
  signal_name: concept_score
1101
  settings:
1102
  ui:
 
1113
  - - new-response
1114
  - value
1115
  - '*'
1116
+ markdown_paths: []
1117
  preferred_embedding: gte-small
1118
  - namespace: lilac
1119
  name: piqa
1120
+ tags: []
1121
  source:
1122
  dataset_name: piqa
1123
+ config_name: null
1124
+ split: null
1125
+ sample_size: null
1126
+ revision: null
1127
+ load_from_disk: false
1128
  source_name: huggingface
1129
  embeddings:
1130
  - path: goal
 
1136
  signals:
1137
  - path: goal
1138
  signal:
1139
+ threshold: 0.85
1140
  signal_name: near_dup
1141
  - path: goal
1142
  signal:
1143
  signal_name: pii
1144
  - path: goal
1145
  signal:
1146
+ split_by_paragraph: false
1147
  signal_name: lang_detection
1148
  - path: goal
1149
  signal:
1150
  embedding: gte-small
1151
  namespace: lilac
1152
  concept_name: positive-sentiment
1153
+ draft: main
1154
  signal_name: concept_score
1155
  - path: goal
1156
  signal:
1157
  embedding: gte-small
1158
  namespace: lilac
1159
  concept_name: non-english
1160
+ draft: main
1161
  signal_name: concept_score
1162
  - path: goal
1163
  signal:
1164
  embedding: gte-small
1165
  namespace: lilac
1166
  concept_name: toxicity
1167
+ draft: main
1168
  signal_name: concept_score
1169
  - path: goal
1170
  signal:
1171
  embedding: gte-small
1172
  namespace: lilac
1173
  concept_name: question
1174
+ draft: main
1175
  signal_name: concept_score
1176
  - path: goal
1177
  signal:
1178
  embedding: gte-small
1179
  namespace: lilac
1180
  concept_name: legal-termination
1181
+ draft: main
1182
  signal_name: concept_score
1183
  - path: goal
1184
  signal:
1185
  embedding: gte-small
1186
  namespace: lilac
1187
  concept_name: source-code
1188
+ draft: main
1189
  signal_name: concept_score
1190
  - path: goal
1191
  signal:
1192
  embedding: gte-small
1193
  namespace: lilac
1194
  concept_name: negative-sentiment
1195
+ draft: main
1196
  signal_name: concept_score
1197
  - path: goal
1198
  signal:
1199
  embedding: gte-small
1200
  namespace: lilac
1201
  concept_name: profanity
1202
+ draft: main
1203
  signal_name: concept_score
1204
  - path: goal
1205
  signal:
1206
  signal_name: text_statistics
1207
  - path: sol1
1208
  signal:
1209
+ threshold: 0.85
1210
  signal_name: near_dup
1211
  - path: sol1
1212
  signal:
1213
  signal_name: pii
1214
  - path: sol1
1215
  signal:
1216
+ split_by_paragraph: false
1217
  signal_name: lang_detection
1218
  - path: sol1
1219
  signal:
1220
  embedding: gte-small
1221
  namespace: lilac
1222
  concept_name: positive-sentiment
1223
+ draft: main
1224
  signal_name: concept_score
1225
  - path: sol1
1226
  signal:
1227
  embedding: gte-small
1228
  namespace: lilac
1229
  concept_name: non-english
1230
+ draft: main
1231
  signal_name: concept_score
1232
  - path: sol1
1233
  signal:
1234
  embedding: gte-small
1235
  namespace: lilac
1236
  concept_name: toxicity
1237
+ draft: main
1238
  signal_name: concept_score
1239
  - path: sol1
1240
  signal:
1241
  embedding: gte-small
1242
  namespace: lilac
1243
  concept_name: question
1244
+ draft: main
1245
  signal_name: concept_score
1246
  - path: sol1
1247
  signal:
1248
  embedding: gte-small
1249
  namespace: lilac
1250
  concept_name: legal-termination
1251
+ draft: main
1252
  signal_name: concept_score
1253
  - path: sol1
1254
  signal:
1255
  embedding: gte-small
1256
  namespace: lilac
1257
  concept_name: source-code
1258
+ draft: main
1259
  signal_name: concept_score
1260
  - path: sol1
1261
  signal:
1262
  embedding: gte-small
1263
  namespace: lilac
1264
  concept_name: negative-sentiment
1265
+ draft: main
1266
  signal_name: concept_score
1267
  - path: sol1
1268
  signal:
1269
  embedding: gte-small
1270
  namespace: lilac
1271
  concept_name: profanity
1272
+ draft: main
1273
  signal_name: concept_score
1274
  - path: sol1
1275
  signal:
1276
  signal_name: text_statistics
1277
  - path: sol2
1278
  signal:
1279
+ threshold: 0.85
1280
  signal_name: near_dup
1281
  - path: sol2
1282
  signal:
1283
  signal_name: pii
1284
  - path: sol2
1285
  signal:
1286
+ split_by_paragraph: false
1287
  signal_name: lang_detection
1288
  - path: sol2
1289
  signal:
1290
  embedding: gte-small
1291
  namespace: lilac
1292
  concept_name: positive-sentiment
1293
+ draft: main
1294
  signal_name: concept_score
1295
  - path: sol2
1296
  signal:
1297
  embedding: gte-small
1298
  namespace: lilac
1299
  concept_name: non-english
1300
+ draft: main
1301
  signal_name: concept_score
1302
  - path: sol2
1303
  signal:
1304
  embedding: gte-small
1305
  namespace: lilac
1306
  concept_name: toxicity
1307
+ draft: main
1308
  signal_name: concept_score
1309
  - path: sol2
1310
  signal:
1311
  embedding: gte-small
1312
  namespace: lilac
1313
  concept_name: question
1314
+ draft: main
1315
  signal_name: concept_score
1316
  - path: sol2
1317
  signal:
1318
  embedding: gte-small
1319
  namespace: lilac
1320
  concept_name: legal-termination
1321
+ draft: main
1322
  signal_name: concept_score
1323
  - path: sol2
1324
  signal:
1325
  embedding: gte-small
1326
  namespace: lilac
1327
  concept_name: source-code
1328
+ draft: main
1329
  signal_name: concept_score
1330
  - path: sol2
1331
  signal:
1332
  embedding: gte-small
1333
  namespace: lilac
1334
  concept_name: negative-sentiment
1335
+ draft: main
1336
  signal_name: concept_score
1337
  - path: sol2
1338
  signal:
1339
  embedding: gte-small
1340
  namespace: lilac
1341
  concept_name: profanity
1342
+ draft: main
1343
  signal_name: concept_score
1344
  - path: sol2
1345
  signal:
 
1350
  - sol1
1351
  - sol2
1352
  - goal
1353
+ markdown_paths: []
1354
  preferred_embedding: gte-small
1355
  - namespace: lilac
1356
  name: OpenOrca-100k
1357
+ tags: []
1358
  source:
1359
  dataset_name: Open-Orca/OpenOrca
1360
+ config_name: null
1361
+ split: null
1362
  sample_size: 100000
1363
+ revision: null
1364
+ load_from_disk: false
1365
  source_name: huggingface
1366
  embeddings:
1367
  - path: question
 
1371
  signals:
1372
  - path: question
1373
  signal:
1374
+ threshold: 0.85
1375
  signal_name: near_dup
1376
  - path: question
1377
  signal:
1378
  signal_name: pii
1379
  - path: question
1380
  signal:
1381
+ split_by_paragraph: false
1382
  signal_name: lang_detection
1383
  - path: question
1384
  signal:
1385
  embedding: gte-small
1386
  namespace: lilac
1387
  concept_name: positive-sentiment
1388
+ draft: main
1389
  signal_name: concept_score
1390
  - path: question
1391
  signal:
1392
  embedding: gte-small
1393
  namespace: lilac
1394
  concept_name: non-english
1395
+ draft: main
1396
  signal_name: concept_score
1397
  - path: question
1398
  signal:
1399
  embedding: gte-small
1400
  namespace: lilac
1401
  concept_name: toxicity
1402
+ draft: main
1403
  signal_name: concept_score
1404
  - path: question
1405
  signal:
1406
  embedding: gte-small
1407
  namespace: lilac
1408
  concept_name: question
1409
+ draft: main
1410
  signal_name: concept_score
1411
  - path: question
1412
  signal:
1413
  embedding: gte-small
1414
  namespace: lilac
1415
  concept_name: legal-termination
1416
+ draft: main
1417
  signal_name: concept_score
1418
  - path: question
1419
  signal:
1420
  embedding: gte-small
1421
  namespace: lilac
1422
  concept_name: source-code
1423
+ draft: main
1424
  signal_name: concept_score
1425
  - path: question
1426
  signal:
1427
  embedding: gte-small
1428
  namespace: lilac
1429
  concept_name: negative-sentiment
1430
+ draft: main
1431
  signal_name: concept_score
1432
  - path: question
1433
  signal:
1434
  embedding: gte-small
1435
  namespace: lilac
1436
  concept_name: profanity
1437
+ draft: main
1438
  signal_name: concept_score
1439
  - path: question
1440
  signal:
1441
  signal_name: text_statistics
1442
  - path: response
1443
  signal:
1444
+ threshold: 0.85
1445
  signal_name: near_dup
1446
  - path: response
1447
  signal:
1448
  signal_name: pii
1449
  - path: response
1450
  signal:
1451
+ split_by_paragraph: false
1452
  signal_name: lang_detection
1453
  - path: response
1454
  signal:
1455
  embedding: gte-small
1456
  namespace: lilac
1457
  concept_name: positive-sentiment
1458
+ draft: main
1459
  signal_name: concept_score
1460
  - path: response
1461
  signal:
1462
  embedding: gte-small
1463
  namespace: lilac
1464
  concept_name: non-english
1465
+ draft: main
1466
  signal_name: concept_score
1467
  - path: response
1468
  signal:
1469
  embedding: gte-small
1470
  namespace: lilac
1471
  concept_name: toxicity
1472
+ draft: main
1473
  signal_name: concept_score
1474
  - path: response
1475
  signal:
1476
  embedding: gte-small
1477
  namespace: lilac
1478
  concept_name: question
1479
+ draft: main
1480
  signal_name: concept_score
1481
  - path: response
1482
  signal:
1483
  embedding: gte-small
1484
  namespace: lilac
1485
  concept_name: legal-termination
1486
+ draft: main
1487
  signal_name: concept_score
1488
  - path: response
1489
  signal:
1490
  embedding: gte-small
1491
  namespace: lilac
1492
  concept_name: source-code
1493
+ draft: main
1494
  signal_name: concept_score
1495
  - path: response
1496
  signal:
1497
  embedding: gte-small
1498
  namespace: lilac
1499
  concept_name: negative-sentiment
1500
+ draft: main
1501
  signal_name: concept_score
1502
  - path: response
1503
  signal:
1504
  embedding: gte-small
1505
  namespace: lilac
1506
  concept_name: profanity
1507
+ draft: main
1508
  signal_name: concept_score
1509
  - path: response
1510
  signal:
 
1514
  media_paths:
1515
  - question
1516
  - response
1517
+ markdown_paths: []
1518
  preferred_embedding: gte-small
1519
  - namespace: lilac
1520
  name: opus100-en-es-validation
1521
+ tags: []
1522
  source:
1523
  dataset_name: opus100
1524
  config_name: en-es
1525
  split: validation
1526
+ sample_size: null
1527
+ revision: null
1528
+ load_from_disk: false
1529
  source_name: huggingface
1530
  embeddings:
1531
  - path:
 
1541
  - translation
1542
  - en
1543
  signal:
1544
+ threshold: 0.85
1545
  signal_name: near_dup
1546
  - path:
1547
  - translation
 
1552
  - translation
1553
  - en
1554
  signal:
1555
+ split_by_paragraph: false
1556
  signal_name: lang_detection
1557
  - path:
1558
  - translation
 
1561
  embedding: gte-small
1562
  namespace: lilac
1563
  concept_name: positive-sentiment
1564
+ draft: main
1565
  signal_name: concept_score
1566
  - path:
1567
  - translation
 
1570
  embedding: gte-small
1571
  namespace: lilac
1572
  concept_name: non-english
1573
+ draft: main
1574
  signal_name: concept_score
1575
  - path:
1576
  - translation
 
1579
  embedding: gte-small
1580
  namespace: lilac
1581
  concept_name: toxicity
1582
+ draft: main
1583
  signal_name: concept_score
1584
  - path:
1585
  - translation
 
1588
  embedding: gte-small
1589
  namespace: lilac
1590
  concept_name: question
1591
+ draft: main
1592
  signal_name: concept_score
1593
  - path:
1594
  - translation
 
1597
  embedding: gte-small
1598
  namespace: lilac
1599
  concept_name: legal-termination
1600
+ draft: main
1601
  signal_name: concept_score
1602
  - path:
1603
  - translation
 
1606
  embedding: gte-small
1607
  namespace: lilac
1608
  concept_name: source-code
1609
+ draft: main
1610
  signal_name: concept_score
1611
  - path:
1612
  - translation
 
1615
  embedding: gte-small
1616
  namespace: lilac
1617
  concept_name: negative-sentiment
1618
+ draft: main
1619
  signal_name: concept_score
1620
  - path:
1621
  - translation
 
1624
  embedding: gte-small
1625
  namespace: lilac
1626
  concept_name: profanity
1627
+ draft: main
1628
  signal_name: concept_score
1629
  - path:
1630
  - translation
 
1635
  - translation
1636
  - es
1637
  signal:
1638
+ threshold: 0.85
1639
  signal_name: near_dup
1640
  - path:
1641
  - translation
 
1646
  - translation
1647
  - es
1648
  signal:
1649
+ split_by_paragraph: false
1650
  signal_name: lang_detection
1651
  - path:
1652
  - translation
 
1655
  embedding: gte-small
1656
  namespace: lilac
1657
  concept_name: positive-sentiment
1658
+ draft: main
1659
  signal_name: concept_score
1660
  - path:
1661
  - translation
 
1664
  embedding: gte-small
1665
  namespace: lilac
1666
  concept_name: non-english
1667
+ draft: main
1668
  signal_name: concept_score
1669
  - path:
1670
  - translation
 
1673
  embedding: gte-small
1674
  namespace: lilac
1675
  concept_name: toxicity
1676
+ draft: main
1677
  signal_name: concept_score
1678
  - path:
1679
  - translation
 
1682
  embedding: gte-small
1683
  namespace: lilac
1684
  concept_name: question
1685
+ draft: main
1686
  signal_name: concept_score
1687
  - path:
1688
  - translation
 
1691
  embedding: gte-small
1692
  namespace: lilac
1693
  concept_name: legal-termination
1694
+ draft: main
1695
  signal_name: concept_score
1696
  - path:
1697
  - translation
 
1700
  embedding: gte-small
1701
  namespace: lilac
1702
  concept_name: source-code
1703
+ draft: main
1704
  signal_name: concept_score
1705
  - path:
1706
  - translation
 
1709
  embedding: gte-small
1710
  namespace: lilac
1711
  concept_name: negative-sentiment
1712
+ draft: main
1713
  signal_name: concept_score
1714
  - path:
1715
  - translation
 
1718
  embedding: gte-small
1719
  namespace: lilac
1720
  concept_name: profanity
1721
+ draft: main
1722
  signal_name: concept_score
1723
  - path:
1724
  - translation
 
1732
  - es
1733
  - - translation
1734
  - en
1735
+ markdown_paths: []
1736
  preferred_embedding: gte-small
1737
  - namespace: lilac
1738
  name: mmlu_professional_law
1739
+ tags: []
1740
  source:
1741
  dataset_name: cais/mmlu
1742
  config_name: professional_law
1743
+ split: null
1744
+ sample_size: null
1745
+ revision: null
1746
+ load_from_disk: false
1747
  source_name: huggingface
1748
  embeddings:
1749
  - path: question
 
1755
  signals:
1756
  - path: question
1757
  signal:
1758
+ threshold: 0.85
1759
  signal_name: near_dup
1760
  - path: question
1761
  signal:
1762
  signal_name: pii
1763
  - path: question
1764
  signal:
1765
+ split_by_paragraph: false
1766
  signal_name: lang_detection
1767
  - path: question
1768
  signal:
1769
  embedding: gte-small
1770
  namespace: lilac
1771
  concept_name: positive-sentiment
1772
+ draft: main
1773
  signal_name: concept_score
1774
  - path: question
1775
  signal:
1776
  embedding: gte-small
1777
  namespace: lilac
1778
  concept_name: non-english
1779
+ draft: main
1780
  signal_name: concept_score
1781
  - path: question
1782
  signal:
1783
  embedding: gte-small
1784
  namespace: lilac
1785
  concept_name: toxicity
1786
+ draft: main
1787
  signal_name: concept_score
1788
  - path: question
1789
  signal:
1790
  embedding: gte-small
1791
  namespace: lilac
1792
  concept_name: question
1793
+ draft: main
1794
  signal_name: concept_score
1795
  - path: question
1796
  signal:
1797
  embedding: gte-small
1798
  namespace: lilac
1799
  concept_name: legal-termination
1800
+ draft: main
1801
  signal_name: concept_score
1802
  - path: question
1803
  signal:
1804
  embedding: gte-small
1805
  namespace: lilac
1806
  concept_name: source-code
1807
+ draft: main
1808
  signal_name: concept_score
1809
  - path: question
1810
  signal:
1811
  embedding: gte-small
1812
  namespace: lilac
1813
  concept_name: negative-sentiment
1814
+ draft: main
1815
  signal_name: concept_score
1816
  - path: question
1817
  signal:
1818
  embedding: gte-small
1819
  namespace: lilac
1820
  concept_name: profanity
1821
+ draft: main
1822
  signal_name: concept_score
1823
  - path: question
1824
  signal:
 
1827
  - choices
1828
  - '*'
1829
  signal:
1830
+ threshold: 0.85
1831
  signal_name: near_dup
1832
  - path:
1833
  - choices
 
1838
  - choices
1839
  - '*'
1840
  signal:
1841
+ split_by_paragraph: false
1842
  signal_name: lang_detection
1843
  - path:
1844
  - choices
 
1847
  embedding: gte-small
1848
  namespace: lilac
1849
  concept_name: positive-sentiment
1850
+ draft: main
1851
  signal_name: concept_score
1852
  - path:
1853
  - choices
 
1856
  embedding: gte-small
1857
  namespace: lilac
1858
  concept_name: non-english
1859
+ draft: main
1860
  signal_name: concept_score
1861
  - path:
1862
  - choices
 
1865
  embedding: gte-small
1866
  namespace: lilac
1867
  concept_name: toxicity
1868
+ draft: main
1869
  signal_name: concept_score
1870
  - path:
1871
  - choices
 
1874
  embedding: gte-small
1875
  namespace: lilac
1876
  concept_name: question
1877
+ draft: main
1878
  signal_name: concept_score
1879
  - path:
1880
  - choices
 
1883
  embedding: gte-small
1884
  namespace: lilac
1885
  concept_name: legal-termination
1886
+ draft: main
1887
  signal_name: concept_score
1888
  - path:
1889
  - choices
 
1892
  embedding: gte-small
1893
  namespace: lilac
1894
  concept_name: source-code
1895
+ draft: main
1896
  signal_name: concept_score
1897
  - path:
1898
  - choices
 
1901
  embedding: gte-small
1902
  namespace: lilac
1903
  concept_name: negative-sentiment
1904
+ draft: main
1905
  signal_name: concept_score
1906
  - path:
1907
  - choices
 
1910
  embedding: gte-small
1911
  namespace: lilac
1912
  concept_name: negative-sentiment
1913
+ draft: main
1914
  signal_name: concept_score
1915
  - path:
1916
  - choices
 
1919
  embedding: gte-small
1920
  namespace: lilac
1921
  concept_name: profanity
1922
+ draft: main
1923
  signal_name: concept_score
1924
  - path:
1925
  - choices
 
1932
  - question
1933
  - - choices
1934
  - '*'
1935
+ markdown_paths: []
1936
  preferred_embedding: gte-small
1937
  - namespace: lilac
1938
  name: pile-of-law-r-legaladvice
1939
+ tags: []
1940
  source:
1941
  dataset_name: pile-of-law/pile-of-law
1942
  config_name: r_legaladvice
1943
+ split: null
1944
+ sample_size: null
1945
+ revision: null
1946
+ load_from_disk: false
1947
  source_name: huggingface
1948
  embeddings:
1949
  - path: text
 
1951
  signals:
1952
  - path: text
1953
  signal:
1954
+ threshold: 0.85
1955
  signal_name: near_dup
1956
  - path: text
1957
  signal:
1958
  signal_name: pii
1959
  - path: text
1960
  signal:
1961
+ split_by_paragraph: false
1962
  signal_name: lang_detection
1963
  - path: text
1964
  signal:
1965
  embedding: gte-small
1966
  namespace: lilac
1967
  concept_name: positive-sentiment
1968
+ draft: main
1969
  signal_name: concept_score
1970
  - path: text
1971
  signal:
1972
  embedding: gte-small
1973
  namespace: lilac
1974
  concept_name: non-english
1975
+ draft: main
1976
  signal_name: concept_score
1977
  - path: text
1978
  signal:
1979
  embedding: gte-small
1980
  namespace: lilac
1981
  concept_name: toxicity
1982
+ draft: main
1983
  signal_name: concept_score
1984
  - path: text
1985
  signal:
1986
  embedding: gte-small
1987
  namespace: lilac
1988
  concept_name: question
1989
+ draft: main
1990
  signal_name: concept_score
1991
  - path: text
1992
  signal:
1993
  embedding: gte-small
1994
  namespace: lilac
1995
  concept_name: legal-termination
1996
+ draft: main
1997
  signal_name: concept_score
1998
  - path: text
1999
  signal:
2000
  embedding: gte-small
2001
  namespace: lilac
2002
  concept_name: source-code
2003
+ draft: main
2004
  signal_name: concept_score
2005
  - path: text
2006
  signal:
2007
  embedding: gte-small
2008
  namespace: lilac
2009
  concept_name: negative-sentiment
2010
+ draft: main
2011
  signal_name: concept_score
2012
  - path: text
2013
  signal:
2014
  embedding: gte-small
2015
  namespace: lilac
2016
  concept_name: profanity
2017
+ draft: main
2018
  signal_name: concept_score
2019
  - path: text
2020
  signal:
 
2023
  ui:
2024
  media_paths:
2025
  - text
2026
+ markdown_paths: []
2027
  preferred_embedding: gte-small
2028
  - namespace: lilac
2029
  name: science-qa-derek-thomas
2030
+ tags: []
2031
  source:
2032
  dataset_name: derek-thomas/ScienceQA
2033
+ config_name: null
2034
+ split: null
2035
+ sample_size: null
2036
+ revision: null
2037
+ load_from_disk: false
2038
  source_name: huggingface
2039
  embeddings:
2040
  - path: lecture
 
2042
  signals:
2043
  - path: lecture
2044
  signal:
2045
+ threshold: 0.85
2046
  signal_name: near_dup
2047
  - path: lecture
2048
  signal:
2049
  signal_name: pii
2050
  - path: lecture
2051
  signal:
2052
+ split_by_paragraph: false
2053
  signal_name: lang_detection
2054
  - path: lecture
2055
  signal:
 
2059
  embedding: gte-small
2060
  namespace: lilac
2061
  concept_name: legal-termination
2062
+ draft: main
2063
  signal_name: concept_score
2064
  - path: lecture
2065
  signal:
2066
  embedding: gte-small
2067
  namespace: lilac
2068
  concept_name: negative-sentiment
2069
+ draft: main
2070
  signal_name: concept_score
2071
  - path: lecture
2072
  signal:
2073
  embedding: gte-small
2074
  namespace: lilac
2075
  concept_name: non-english
2076
+ draft: main
2077
  signal_name: concept_score
2078
  - path: lecture
2079
  signal:
2080
  embedding: gte-small
2081
  namespace: lilac
2082
  concept_name: positive-sentiment
2083
+ draft: main
2084
  signal_name: concept_score
2085
  - path: lecture
2086
  signal:
2087
  embedding: gte-small
2088
  namespace: lilac
2089
  concept_name: profanity
2090
+ draft: main
2091
  signal_name: concept_score
2092
  - path: lecture
2093
  signal:
2094
  embedding: gte-small
2095
  namespace: lilac
2096
  concept_name: question
2097
+ draft: main
2098
  signal_name: concept_score
2099
  - path: lecture
2100
  signal:
2101
  embedding: gte-small
2102
  namespace: lilac
2103
  concept_name: source-code
2104
+ draft: main
2105
  signal_name: concept_score
2106
  - path: lecture
2107
  signal:
2108
  embedding: gte-small
2109
  namespace: lilac
2110
  concept_name: toxicity
2111
+ draft: main
2112
  signal_name: concept_score
2113
  settings:
2114
  ui:
2115
  media_paths:
2116
  - lecture
2117
+ markdown_paths: []
2118
  preferred_embedding: gte-small
2119
  - namespace: lilac
2120
  name: enron-emails
2121
+ tags: []
2122
  source:
2123
  dataset_name: EleutherAI/pile
2124
  config_name: enron_emails
2125
+ split: null
2126
  sample_size: 100000
2127
+ revision: null
2128
+ load_from_disk: false
2129
  source_name: huggingface
2130
  embeddings:
2131
  - path: text
 
2133
  signals:
2134
  - path: text
2135
  signal:
2136
+ threshold: 0.85
2137
  signal_name: near_dup
2138
  - path: text
2139
  signal:
2140
  signal_name: pii
2141
  - path: text
2142
  signal:
2143
+ split_by_paragraph: false
2144
  signal_name: lang_detection
2145
  - path: text
2146
  signal:
2147
  embedding: gte-small
2148
  namespace: lilac
2149
  concept_name: positive-sentiment
2150
+ draft: main
2151
  signal_name: concept_score
2152
  - path: text
2153
  signal:
2154
  embedding: gte-small
2155
  namespace: lilac
2156
  concept_name: non-english
2157
+ draft: main
2158
  signal_name: concept_score
2159
  - path: text
2160
  signal:
2161
  embedding: gte-small
2162
  namespace: lilac
2163
  concept_name: toxicity
2164
+ draft: main
2165
  signal_name: concept_score
2166
  - path: text
2167
  signal:
2168
  embedding: gte-small
2169
  namespace: lilac
2170
  concept_name: question
2171
+ draft: main
2172
  signal_name: concept_score
2173
  - path: text
2174
  signal:
2175
  embedding: gte-small
2176
  namespace: lilac
2177
  concept_name: legal-termination
2178
+ draft: main
2179
  signal_name: concept_score
2180
  - path: text
2181
  signal:
2182
  embedding: gte-small
2183
  namespace: lilac
2184
  concept_name: source-code
2185
+ draft: main
2186
  signal_name: concept_score
2187
  - path: text
2188
  signal:
2189
  embedding: gte-small
2190
  namespace: lilac
2191
  concept_name: negative-sentiment
2192
+ draft: main
2193
  signal_name: concept_score
2194
  - path: text
2195
  signal:
2196
  embedding: gte-small
2197
  namespace: lilac
2198
  concept_name: profanity
2199
+ draft: main
2200
  signal_name: concept_score
2201
  - path: text
2202
  signal:
 
2205
  ui:
2206
  media_paths:
2207
  - text
2208
+ markdown_paths: []
2209
  preferred_embedding: gte-small
2210
  - namespace: lilac
2211
  name: the_movies_dataset
2212
+ tags: []
2213
  source:
2214
  filepaths:
2215
  - https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv
2216
+ delim: ','
2217
+ header: true
2218
+ names: null
2219
  source_name: csv
2220
  embeddings:
2221
  - path: overview
 
2223
  signals:
2224
  - path: overview
2225
  signal:
2226
+ threshold: 0.85
2227
  signal_name: near_dup
2228
  - path: overview
2229
  signal:
2230
  signal_name: pii
2231
  - path: overview
2232
  signal:
2233
+ split_by_paragraph: false
2234
  signal_name: lang_detection
2235
  - path: overview
2236
  signal:
 
2240
  embedding: gte-small
2241
  namespace: lilac
2242
  concept_name: legal-termination
2243
+ draft: main
2244
  signal_name: concept_score
2245
  - path: overview
2246
  signal:
2247
  embedding: gte-small
2248
  namespace: lilac
2249
  concept_name: negative-sentiment
2250
+ draft: main
2251
  signal_name: concept_score
2252
  - path: overview
2253
  signal:
2254
  embedding: gte-small
2255
  namespace: lilac
2256
  concept_name: non-english
2257
+ draft: main
2258
  signal_name: concept_score
2259
  - path: overview
2260
  signal:
2261
  embedding: gte-small
2262
  namespace: lilac
2263
  concept_name: positive-sentiment
2264
+ draft: main
2265
  signal_name: concept_score
2266
  - path: overview
2267
  signal:
2268
  embedding: gte-small
2269
  namespace: lilac
2270
  concept_name: profanity
2271
+ draft: main
2272
  signal_name: concept_score
2273
  - path: overview
2274
  signal:
2275
  embedding: gte-small
2276
  namespace: lilac
2277
  concept_name: question
2278
+ draft: main
2279
  signal_name: concept_score
2280
  - path: overview
2281
  signal:
2282
  embedding: gte-small
2283
  namespace: lilac
2284
  concept_name: source-code
2285
+ draft: main
2286
  signal_name: concept_score
2287
  - path: overview
2288
  signal:
2289
  embedding: gte-small
2290
  namespace: lilac
2291
  concept_name: toxicity
2292
+ draft: main
2293
  signal_name: concept_score
2294
  settings:
2295
  ui:
2296
  media_paths:
2297
  - overview
2298
+ markdown_paths: []
2299
  preferred_embedding: gte-small
2300
+ signals: []
2301
+ concept_model_cache_embeddings: []