Remove auth token
Browse files
- run_cat.py +0 -2
- run_mlm_local.py +1 -8
run_cat.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
# ## 4. Pre-train BERT on processed dataset
|
4 |
|
5 |
import os
|
6 |
-
from huggingface_hub import HfFolder
|
7 |
|
8 |
# hyperparameters
|
9 |
hyperparameters = {
|
@@ -11,7 +10,6 @@ hyperparameters = {
|
|
11 |
"dataset_id": "chaoyan/processed_bert_dataset",
|
12 |
"tokenizer_id": "cat_tokenizer",
|
13 |
"repository_id": "bert-base-uncased-cat",
|
14 |
-
"hf_hub_token": HfFolder.get_token(), # need to be login in with `huggingface-cli login`
|
15 |
"max_steps": 100_000,
|
16 |
"per_device_train_batch_size": 16,
|
17 |
"learning_rate": 5e-5,
|
|
|
3 |
# ## 4. Pre-train BERT on processed dataset
|
4 |
|
5 |
import os
|
|
|
6 |
|
7 |
# hyperparameters
|
8 |
hyperparameters = {
|
|
|
10 |
"dataset_id": "chaoyan/processed_bert_dataset",
|
11 |
"tokenizer_id": "cat_tokenizer",
|
12 |
"repository_id": "bert-base-uncased-cat",
|
|
|
13 |
"max_steps": 100_000,
|
14 |
"per_device_train_batch_size": 16,
|
15 |
"learning_rate": 5e-5,
|
run_mlm_local.py
CHANGED
@@ -42,10 +42,6 @@ class ScriptArguments:
|
|
42 |
default=None,
|
43 |
metadata={"help": "The repository id where the model will be saved or loaded from for further pre-training."},
|
44 |
)
|
45 |
-
hf_hub_token: str = field(
|
46 |
-
default=False,
|
47 |
-
metadata={"help": "The Token used to push models, metrics and logs to the Hub."},
|
48 |
-
)
|
49 |
model_config_id: Optional[str] = field(
|
50 |
default="bert-base-uncased", metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
51 |
)
|
@@ -76,7 +72,7 @@ def run_mlm():
|
|
76 |
# load processed dataset
|
77 |
train_dataset = load_dataset(script_args.dataset_id, split="train")
|
78 |
# load trained tokenizer
|
79 |
-
tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id
|
80 |
|
81 |
# load model from config (for training from scratch)
|
82 |
logger.info("Training new model from scratch")
|
@@ -106,9 +102,6 @@ def run_mlm():
|
|
106 |
save_steps=5_000,
|
107 |
save_total_limit=2,
|
108 |
report_to="tensorboard",
|
109 |
-
# push to hub parameters
|
110 |
-
# hub_strategy="every_save",
|
111 |
-
# hub_model_id=script_args.repository_id,
|
112 |
# pretraining
|
113 |
ddp_find_unused_parameters=True,
|
114 |
# throughput_warmup_steps=2, # !!! ?
|
|
|
42 |
default=None,
|
43 |
metadata={"help": "The repository id where the model will be saved or loaded from for further pre-training."},
|
44 |
)
|
|
|
|
|
|
|
|
|
45 |
model_config_id: Optional[str] = field(
|
46 |
default="bert-base-uncased", metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
47 |
)
|
|
|
72 |
# load processed dataset
|
73 |
train_dataset = load_dataset(script_args.dataset_id, split="train")
|
74 |
# load trained tokenizer
|
75 |
+
tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id)
|
76 |
|
77 |
# load model from config (for training from scratch)
|
78 |
logger.info("Training new model from scratch")
|
|
|
102 |
save_steps=5_000,
|
103 |
save_total_limit=2,
|
104 |
report_to="tensorboard",
|
|
|
|
|
|
|
105 |
# pretraining
|
106 |
ddp_find_unused_parameters=True,
|
107 |
# throughput_warmup_steps=2, # !!! ?
|