mtasic85 committed on
Commit
64dc4aa
1 Parent(s): 6527e94

pretrain dataset

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -128,7 +128,7 @@ datasets_configs = [
128
  # math
129
  #
130
  # 2.87 GB, 552,000 - images/text - we use only latex text, top 10%
131
- {'path': 'OleehyO/latex-formulas', 'name': 'cleaned_formulas', 'split': 'train[:10%]', 'format': lambda n: n['latex_formula']},
132
  # 12.2 MB, 500,000
133
  {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
134
  # 125 MB, 1,000,000
 
128
  # math
129
  #
130
  # 2.87 GB, 552,000 - images/text - we use only latex text, top 10%
131
+ {'path': 'OleehyO/latex-formulas', 'data_dir': 'cleaned_formulas', 'split': 'train[:10%]', 'format': lambda n: n['latex_formula']},
132
  # 12.2 MB, 500,000
133
  {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
134
  # 125 MB, 1,000,000