pretrain: fixed bigcode/the-stack-smol-xl dataset config
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -164,10 +164,12 @@ datasets_configs = [
|
|
164 |
#
|
165 |
[
|
166 |
# 102 MB, 8,700
|
167 |
-
{'path': 'bigcode/the-stack-smol-xl', '
|
168 |
for name in [
|
|
|
|
|
169 |
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
170 |
-
'augeas', 'awk', '
|
171 |
'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
|
172 |
'css', 'cuda', 'dart', 'dockerfile', 'elixir',
|
173 |
'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
|
@@ -175,7 +177,7 @@ datasets_configs = [
|
|
175 |
'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
|
176 |
'literate-agda', 'literate-coffeescript', 'literate-haskell',
|
177 |
'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
|
178 |
-
'ocaml', 'pascal', 'perl', 'php', '
|
179 |
'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
|
180 |
'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
|
181 |
'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
|
|
|
164 |
#
|
165 |
[
|
166 |
# 102 MB, 8,700
|
167 |
+
{'path': 'bigcode/the-stack-smol-xl', 'data_dir': f'data/{name}', 'format': lambda n: n['content']}
|
168 |
for name in [
|
169 |
+
# 'batchfile' - excluded: unsafe
|
170 |
+
# 'powershell' - excluded: unsafe
|
171 |
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
172 |
+
'augeas', 'awk', 'bison', 'bluespec', 'c',
|
173 |
'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
|
174 |
'css', 'cuda', 'dart', 'dockerfile', 'elixir',
|
175 |
'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
|
|
|
177 |
'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
|
178 |
'literate-agda', 'literate-coffeescript', 'literate-haskell',
|
179 |
'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
|
180 |
+
'ocaml', 'pascal', 'perl', 'php', 'prolog',
|
181 |
'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
|
182 |
'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
|
183 |
'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
|