Remove auth token
Browse files
- run_cat.py +0 -2
- run_mlm_local.py +1 -8
run_cat.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
# ## 4. Pre-train BERT on processed dataset
|
4 |
|
5 |
import os
|
6 |
-
from huggingface_hub import HfFolder
|
7 |
|
8 |
# hyperparameters
|
9 |
hyperparameters = {
|
@@ -11,7 +10,6 @@ hyperparameters = {
|
|
11 |
"dataset_id": "chaoyan/processed_bert_dataset",
|
12 |
"tokenizer_id": "cat_tokenizer",
|
13 |
"repository_id": "bert-base-uncased-cat",
|
14 |
-
"hf_hub_token": HfFolder.get_token(), # need to be login in with `huggingface-cli login`
|
15 |
"max_steps": 100_000,
|
16 |
"per_device_train_batch_size": 16,
|
17 |
"learning_rate": 5e-5,
|
|
|
3 |
# ## 4. Pre-train BERT on processed dataset
|
4 |
|
5 |
import os
|
|
|
6 |
|
7 |
# hyperparameters
|
8 |
hyperparameters = {
|
|
|
10 |
"dataset_id": "chaoyan/processed_bert_dataset",
|
11 |
"tokenizer_id": "cat_tokenizer",
|
12 |
"repository_id": "bert-base-uncased-cat",
|
|
|
13 |
"max_steps": 100_000,
|
14 |
"per_device_train_batch_size": 16,
|
15 |
"learning_rate": 5e-5,
|
run_mlm_local.py
CHANGED
@@ -42,10 +42,6 @@ class ScriptArguments:
|
|
42 |
default=None,
|
43 |
metadata={"help": "The repository id where the model will be saved or loaded from for further pre-training."},
|
44 |
)
|
45 |
-
hf_hub_token: str = field(
|
46 |
-
default=False,
|
47 |
-
metadata={"help": "The Token used to push models, metrics and logs to the Hub."},
|
48 |
-
)
|
49 |
model_config_id: Optional[str] = field(
|
50 |
default="bert-base-uncased", metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
51 |
)
|
@@ -76,7 +72,7 @@ def run_mlm():
|
|
76 |
# load processed dataset
|
77 |
train_dataset = load_dataset(script_args.dataset_id, split="train")
|
78 |
# load trained tokenizer
|
79 |
-
tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id
|
80 |
|
81 |
# load model from config (for training from scratch)
|
82 |
logger.info("Training new model from scratch")
|
@@ -106,9 +102,6 @@ def run_mlm():
|
|
106 |
save_steps=5_000,
|
107 |
save_total_limit=2,
|
108 |
report_to="tensorboard",
|
109 |
-
# push to hub parameters
|
110 |
-
# hub_strategy="every_save",
|
111 |
-
# hub_model_id=script_args.repository_id,
|
112 |
# pretraining
|
113 |
ddp_find_unused_parameters=True,
|
114 |
# throughput_warmup_steps=2, # !!! ?
|
|
|
42 |
default=None,
|
43 |
metadata={"help": "The repository id where the model will be saved or loaded from for further pre-training."},
|
44 |
)
|
|
|
|
|
|
|
|
|
45 |
model_config_id: Optional[str] = field(
|
46 |
default="bert-base-uncased", metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
47 |
)
|
|
|
72 |
# load processed dataset
|
73 |
train_dataset = load_dataset(script_args.dataset_id, split="train")
|
74 |
# load trained tokenizer
|
75 |
+
tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id)
|
76 |
|
77 |
# load model from config (for training from scratch)
|
78 |
logger.info("Training new model from scratch")
|
|
|
102 |
save_steps=5_000,
|
103 |
save_total_limit=2,
|
104 |
report_to="tensorboard",
|
|
|
|
|
|
|
105 |
# pretraining
|
106 |
ddp_find_unused_parameters=True,
|
107 |
# throughput_warmup_steps=2, # !!! ?
|