Spaces:
Running
Running
Commit
·
2e63f1e
1
Parent(s):
c4bc2a0
Add C4 index
Browse files - constants.py (+1 -0)
constants.py
CHANGED
@@ -4,6 +4,7 @@ import os
|
|
4 |
CORPUS_BY_DESC = {
|
5 |
'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v4_rpj_llama_s4',
|
6 |
'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
|
|
|
7 |
'Pile-val (LLaMA tokenizer), 390M tokens': 'v4_pileval_llama',
|
8 |
'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
|
9 |
'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
|
|
|
4 |
CORPUS_BY_DESC = {
|
5 |
'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v4_rpj_llama_s4',
|
6 |
'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
|
7 |
+
'C4-train (LLaMA tokenizer), 200B tokens': 'v4_c4train_llama',
|
8 |
'Pile-val (LLaMA tokenizer), 390M tokens': 'v4_pileval_llama',
|
9 |
'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
|
10 |
'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
|