pretrain dataset
scripts/prepare_pretrain_dataset.py
@@ -96,7 +96,7 @@ datasets_configs = [
         # ~3 GB, 4,976,850
         {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
         for name in [
-
+            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
             'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
         ]
     ],
@@ -109,11 +109,11 @@ datasets_configs = [
     #     {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
     #     for i in range(0, 100, 20)
     # ],
-
-    [
-
-
-    ],
+    ## ~17.6 GB, ~6.41M rows
+    # [
+    #     {'path': 'wikimedia/wikipedia', 'name': '20231101.en', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
+    #     for i in range(0, 100, 20)
+    # ],
     # 65.1 MB, 7,819
     {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
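Each entry above follows the same shape: Hugging Face datasets arguments ('path', plus optional 'name', 'data_dir', and 'split', where percent slices such as train[0%:20%] select a shard of a split) paired with a 'format' field that is either a str.format template over a row's columns or a lambda applied to the raw row. A minimal sketch of how one entry might be consumed; the helper name iter_formatted and the default-split handling are assumptions for illustration, not this script's actual loader:

# Minimal sketch, not the script's actual loader: consume one
# datasets_configs entry and yield formatted training text.
from datasets import load_dataset

def iter_formatted(config):
    config = dict(config)                # keep the shared config dict untouched
    fmt = config.pop('format')
    config.setdefault('split', 'train')  # assumption: entries without 'split' mean 'train'
    rows = load_dataset(**config)        # 'path', 'name', 'data_dir', 'split' pass through
    for row in rows:
        # A string is treated as a str.format template over the row's columns;
        # a callable is applied to the raw row. Both styles appear above.
        yield fmt(row) if callable(fmt) else fmt.format(**row)

for text in iter_formatted({'path': 'Sketched33/Cities_Wikipedia_Information',
                            'format': lambda n: n['wikipedia_content']}):
    print(text[:100])
    break

The percent-sliced wikipedia entries would expand, via the range(0, 100, 20) comprehension, into five configs covering train[0%:20%] through train[80%:100%], so the ~17.6 GB corpus can be processed in shards rather than in one pass.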