Upload actual_config.yaml with huggingface_hub
Browse files- actual_config.yaml +104 -0
actual_config.yaml
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data_config:
|
2 |
+
streaming: true
|
3 |
+
validation_size_max: 1024
|
4 |
+
metadata_config:
|
5 |
+
random_sample_metadata: true
|
6 |
+
random_sample_metadata_calculate_size: 16384
|
7 |
+
random_sample_metadata_weights:
|
8 |
+
html: 0.5
|
9 |
+
timestamp: 11.56111563110182
|
10 |
+
website_desc: 11.033764368362439
|
11 |
+
title: 1.0644297987874418
|
12 |
+
generation_datasource: 1.0
|
13 |
+
entity_paragraph: 11.077104653627899
|
14 |
+
metadata_list:
|
15 |
+
- html
|
16 |
+
- timestamp
|
17 |
+
- website_description
|
18 |
+
- title
|
19 |
+
- url
|
20 |
+
- datasource
|
21 |
+
- length
|
22 |
+
- entity_paragraph
|
23 |
+
metadata_column_list:
|
24 |
+
- html
|
25 |
+
- timestamp
|
26 |
+
- website_desc
|
27 |
+
- title
|
28 |
+
- generation_datasource
|
29 |
+
- entity_paragraph
|
30 |
+
local_metadata_special_tokens:
|
31 |
+
entity_paragraph: entity
|
32 |
+
metadata_sep: ' | '
|
33 |
+
metadata_key_value_sep: ': '
|
34 |
+
metadata_probability: 0.5
|
35 |
+
treat_local_metadata_as_regular_text: true
|
36 |
+
add_local_metadata_special_tokens_in_prefix: true
|
37 |
+
metadata_prefix_sep: ' |||'
|
38 |
+
metadata_prefix_start_seq: ''
|
39 |
+
max_seq_len: 1024
|
40 |
+
html_parser_config:
|
41 |
+
all_tags_rules:
|
42 |
+
attributes_to_keep:
|
43 |
+
- class
|
44 |
+
- id
|
45 |
+
txt_max_chr_len: 0
|
46 |
+
txt_min_chr_len: -.inf
|
47 |
+
tags_exceptions_to_txt_max_min_chr_len:
|
48 |
+
- table
|
49 |
+
- tr
|
50 |
+
- th
|
51 |
+
- td
|
52 |
+
- colgroup
|
53 |
+
- thead
|
54 |
+
- tfoot
|
55 |
+
- tbody
|
56 |
+
tags_to_remove_alone_tag_name:
|
57 |
+
- body
|
58 |
+
tags_to_remove_alone_txt_max_chr_len:
|
59 |
+
- .inf
|
60 |
+
tags_to_remove_alone_txt_min_chr_len:
|
61 |
+
- 0.0
|
62 |
+
local_metadata_special_token_start:
|
63 |
+
entity_paragraph: <ENTITY_CHAIN>
|
64 |
+
local_metadata_special_token_end:
|
65 |
+
entity_paragraph: ' </ENTITY_CHAIN> '
|
66 |
+
experiment: with_metadata_datasetv2
|
67 |
+
per_device_eval_batch_size: 32
|
68 |
+
per_device_train_batch_size: 32
|
69 |
+
dataset_name: bs-modeling-metadata/c4-en-html-with-metadata
|
70 |
+
dataset_config_name: null
|
71 |
+
train_file: '*.jsonl.gz'
|
72 |
+
validation_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
|
73 |
+
overwrite_cache: false
|
74 |
+
cache_dir: null
|
75 |
+
extension: null
|
76 |
+
preprocessing_num_workers: 48
|
77 |
+
validation_split_percentage: 5
|
78 |
+
block_size: null
|
79 |
+
map_batch_size: 1
|
80 |
+
weight_decay: 0.01
|
81 |
+
learning_rate: 1.0e-05
|
82 |
+
num_train_epochs: 1
|
83 |
+
max_train_steps: 100000
|
84 |
+
lr_scheduler_type: linear
|
85 |
+
num_warmup_steps: 6000
|
86 |
+
seed: 42
|
87 |
+
out_dir: /mnt/ssd-1/bigscience-metadata/lower-lr-2-lower-html-weight
|
88 |
+
model_name: gpt2-xl
|
89 |
+
project_name: metadata_lm
|
90 |
+
jobid: ''
|
91 |
+
start_with_eval: false
|
92 |
+
extra_steps_to_eval_save_at:
|
93 |
+
- 2
|
94 |
+
evaluation_strategy: STEPS
|
95 |
+
eval_num_per_epoch: 3
|
96 |
+
eval_steps: 2000
|
97 |
+
save_strategy: STEPS
|
98 |
+
save_num_per_epoch: 3
|
99 |
+
save_steps: 2000
|
100 |
+
do_train: true
|
101 |
+
do_eval: true
|
102 |
+
gradient_checkpointing: true
|
103 |
+
resume_from_checkpoint_dir: null
|
104 |
+
gradient_accumulation_steps: 1
|