mtasic85 committed on
Commit
df39d2c
1 Parent(s): 804c80b

pretrain fixed bigcode/the-stack-smol-xl dataset

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -164,10 +164,12 @@ datasets_configs = [
164
  #
165
  [
166
  # 102 MB, 8,700
167
- {'path': 'bigcode/the-stack-smol-xl', 'name': name, 'format': lambda n: n['content']}
168
  for name in [
 
 
169
  'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
170
- 'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
171
  'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
172
  'css', 'cuda', 'dart', 'dockerfile', 'elixir',
173
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
@@ -175,7 +177,7 @@ datasets_configs = [
175
  'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
176
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
177
  'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
178
- 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
179
  'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
180
  'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
181
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
 
164
  #
165
  [
166
  # 102 MB, 8,700
167
+ {'path': 'bigcode/the-stack-smol-xl', 'data_dir': f'data/{name}', 'format': lambda n: n['content']}
168
  for name in [
169
+ # 'batchfile' - unsafe
170
+ # 'powershell' - unsafe
171
  'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
172
+ 'augeas', 'awk', 'bison', 'bluespec', 'c',
173
  'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
174
  'css', 'cuda', 'dart', 'dockerfile', 'elixir',
175
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
 
177
  'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
178
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
179
  'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
180
+ 'ocaml', 'pascal', 'perl', 'php', 'prolog',
181
  'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
182
  'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
183
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',