mtasic85 committed on
Commit
e5c29b5
1 Parent(s): 636fa84

pretrain dataset

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -96,7 +96,7 @@ datasets_configs = [
96
  # ~3 GB, 4,976,850
97
  {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
98
  for name in [
99
- # 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
100
  'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
101
  ]
102
  ],
@@ -109,11 +109,11 @@ datasets_configs = [
109
  # {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
110
  # for i in range(0, 100, 20)
111
  # ],
112
- # ~17.6 GB, ~6.41M rows
113
- [
114
- {'path': 'wikimedia/wikipedia', 'name': '20231101.en', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
115
- for i in range(0, 100, 20)
116
- ],
117
  # 65.1 MB, 7,819
118
  {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
119
 
 
96
  # ~3 GB, 4,976,850
97
  {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
98
  for name in [
99
+ 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
100
  'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
101
  ]
102
  ],
 
109
  # {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
110
  # for i in range(0, 100, 20)
111
  # ],
112
+ ## ~17.6 GB, ~6.41M rows
113
+ # [
114
+ # {'path': 'wikimedia/wikipedia', 'name': '20231101.en', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
115
+ # for i in range(0, 100, 20)
116
+ # ],
117
  # 65.1 MB, 7,819
118
  {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
119