pretrain dataset
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -128,7 +128,7 @@ datasets_configs = [
|
|
128 |
# math
|
129 |
#
|
130 |
# 2.87 GB, 552,000 - images/text - we use only the LaTeX text, first 10% of the train split
|
131 |
-
{'path': 'OleehyO/latex-formulas', '
|
132 |
# 12.2 MB, 500,000
|
133 |
{'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
|
134 |
# 125 MB, 1,000,000
|
|
|
128 |
# math
|
129 |
#
|
130 |
# 2.87 GB, 552,000 - images/text - we use only the LaTeX text, first 10% of the train split
|
131 |
+
{'path': 'OleehyO/latex-formulas', 'data_dir': 'cleaned_formulas', 'split': 'train[:10%]', 'format': lambda n: n['latex_formula']},
|
132 |
# 12.2 MB, 500,000
|
133 |
{'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
|
134 |
# 125 MB, 1,000,000
|