chaoyan committed
Commit fcc13f9 · Parent(s): e65e319

Remove auth token
Files changed (2)
  1. run_cat.py +0 -2
  2. run_mlm_local.py +1 -8
run_cat.py CHANGED
@@ -3,7 +3,6 @@
 # ## 4. Pre-train BERT on processed dataset
 
 import os
-from huggingface_hub import HfFolder
 
 # hyperparameters
 hyperparameters = {
@@ -11,7 +10,6 @@ hyperparameters = {
     "dataset_id": "chaoyan/processed_bert_dataset",
     "tokenizer_id": "cat_tokenizer",
     "repository_id": "bert-base-uncased-cat",
-    "hf_hub_token": HfFolder.get_token(),  # need to be login in with `huggingface-cli login`
     "max_steps": 100_000,
     "per_device_train_batch_size": 16,
     "learning_rate": 5e-5,
run_mlm_local.py CHANGED
@@ -42,10 +42,6 @@ class ScriptArguments:
         default=None,
         metadata={"help": "The repository id where the model will be saved or loaded from for futher pre-training."},
     )
-    hf_hub_token: str = field(
-        default=False,
-        metadata={"help": "The Token used to push models, metrics and logs to the Hub."},
-    )
     model_config_id: Optional[str] = field(
         default="bert-base-uncased", metadata={"help": "Pretrained config name or path if not the same as model_name"}
     )
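
The removed `hf_hub_token` field also carried a type mismatch: it was annotated `str` but defaulted to `False`. Purely as a hypothetical sketch, if such a field were ever reinstated, an `Optional[str]` defaulting to `None` would match the annotation:

# Hypothetical sketch, not part of this commit: a type-consistent token field.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ScriptArguments:
    hf_hub_token: Optional[str] = field(
        default=None,  # None (not False) is the idiomatic "unset" for Optional[str]
        metadata={"help": "Token used to push models, metrics and logs to the Hub."},
    )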
@@ -76,7 +72,7 @@ def run_mlm():
     # load processed dataset
     train_dataset = load_dataset(script_args.dataset_id, split="train")
     # load trained tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id, use_auth_token=script_args.hf_hub_token)
+    tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id)
 
     # load model from config (for training from scratch)
     logger.info("Training new model from scratch")
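
With the hunk above, the tokenizer is loaded without an explicit `use_auth_token`; `transformers` falls back to the token cached by `huggingface-cli login` when the repo is private. A hedged sketch of the resulting call, assuming `cat_tokenizer` (the `tokenizer_id` from `run_cat.py`) is reachable locally or on the Hub:

# Sketch only: no token kwarg is needed; transformers picks up the cached
# login token automatically for private repos.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cat_tokenizer")
print(tokenizer("a quick smoke test")["input_ids"])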
@@ -106,9 +102,6 @@ def run_mlm():
         save_steps=5_000,
         save_total_limit=2,
         report_to="tensorboard",
-        # push to hub parameters
-        # hub_strategy="every_save",
-        # hub_model_id=script_args.repository_id,
         # pretraining
         ddp_find_unused_parameters=True,
         # throughput_warmup_steps=2,  # !!! ?
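
The deleted lines here were already commented out; `hub_strategy` and `hub_model_id` belong to the push-to-hub options of `TrainingArguments`. If pushing were re-enabled later, a sketch of the relevant arguments (parameter names come from the `transformers` API; the values echo this script's hyperparameters and are otherwise assumptions):

# Sketch of re-enabling Hub pushes; not part of this commit.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="bert-base-uncased-cat",    # repository_id from the hyperparameters
    save_steps=5_000,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,                      # requires a cached login or HF_TOKEN env var
    hub_model_id="bert-base-uncased-cat",  # was script_args.repository_id
    hub_strategy="every_save",             # push a commit at every checkpoint save
)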