Reyad-Ahmmed committed
Commit d14cc37 · verified · 1 Parent(s): 3206374

Update app.py

Files changed (1)
  1. app.py +0 -31
app.py CHANGED
@@ -50,20 +50,6 @@ if num_args == 6:
     batch_size_for_trainer = int(arg4) # batch sizes to send to trainer
     should_produce_eval_matrix = int(arg5) # should produce matrix?
     path_to_save_trained_model_to = arg6
-
-    print(f"should train model? : {arg1}")
-    print(f"file to train on : {arg2}")
-    print(f"file to evaluate on : {arg3}")
-    print(f"batch size : {arg4}")
-    print(f"should produce eval matrix : {arg5}")
-    print(f"path to save trained model : {arg6}")
-
-    print(f"should train model? : {should_train_model}")
-    print(f"file to train on : {train_file}")
-    print(f"file to evaluate on : {test_file}")
-    print(f"batch size : {batch_size_for_trainer}")
-    print(f"should produce eval matrix : {should_produce_eval_matrix}")
-    print(f"path to save trained model : {path_to_save_trained_model_to}")
 
 else:
     print(f"Only {num_args-1} arguments after filename were passed out of 6")
@@ -101,8 +87,6 @@ if (should_train_model=='1'): #train model
 
     repo_name = "Reyad-Ahmmed/hf-data-timeframe"
 
-    # Tokenization - get Tokenizer for roberta-base (must match model - also roberta-base)
-    # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
     tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
     # I made sure to add all the ones in the training and eval data to this list
     # since we are training using data that only contains the left tag - we don't need right tags added to this list
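The comments kept in this hunk refer to a list of custom tag tokens added to the tokenizer. A minimal sketch of that step, using the repo and subfolder names from the diff; the actual tag list is not shown, so new_tags below is a hypothetical placeholder:

from transformers import BertTokenizer

repo_name = "Reyad-Ahmmed/hf-data-timeframe"
tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")

# Hypothetical tag tokens; the real list lives elsewhere in app.py.
new_tags = ["<left_tag>"]
tokenizer.add_tokens(new_tags)   # grows the vocabulary so each tag stays a single token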
@@ -112,8 +96,6 @@ if (should_train_model=='1'): #train model
 
     # Model
     model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-    # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-
 
     # Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
     model.resize_token_embeddings(len(tokenizer))
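This hunk keeps the classification head sized from label_mapping and resizes the embeddings to match the enlarged tokenizer. A minimal sketch, where label_mapping is a hypothetical stand-in (the real mapping is defined earlier in app.py) and the id2label/label2id fields are assumptions not present in the diff:

from transformers import BertForSequenceClassification

label_mapping = {"left": 0, "none": 1}   # hypothetical example mapping

# repo_name and tokenizer as loaded in the previous step.
model = BertForSequenceClassification.from_pretrained(
    repo_name,
    subfolder="bert_embeddings_finetune",
    num_labels=len(label_mapping),
    id2label={v: k for k, v in label_mapping.items()},   # assumption, not in the diff
    label2id=label_mapping,                               # assumption, not in the diff
    output_attentions=True,
    output_hidden_states=True,
).to("cpu")

# Required after tokenizer.add_tokens(...): the embedding matrix must match the new vocab size.
model.resize_token_embeddings(len(tokenizer))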
@@ -153,8 +135,6 @@ if (should_train_model=='1'): #train model
     emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
     emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
 
-
-
     # Step 4: Split dataset into train and validation
     # Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
     # and one for "validation" with test dataset)
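This hunk only removes blank lines around the dict-to-Dataset conversion. For context, a minimal sketch of the shape those dicts need, with hypothetical rows (the real emotions_dict_train/test are built earlier in app.py; the "text" column matches the tokenize() call below):

from datasets import Dataset

emotions_dict_train = {"text": ["example sentence"], "label": [0]}   # hypothetical rows
emotions_dict_test = {"text": ["another sentence"], "label": [1]}

emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
emotions_dataset_test = Dataset.from_dict(emotions_dict_test)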
@@ -163,12 +143,10 @@ if (should_train_model=='1'): #train model
         'validation': emotions_dataset_test
     })
 
-
     # Define the tokenize function
     def tokenize(batch):
         return tokenizer(batch["text"], padding=True, truncation=True)
 
-
     # Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
     # this will add the "input_id" and "attention_mask" columns
     emotions_encoded = emotions_encoded.map(tokenize, batched=True)
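A minimal sketch of the split-and-tokenize flow this hunk tidies, assuming the DatasetDict is assigned to emotions_encoded, which the later emotions_encoded.map(...) call suggests; emotions_dataset_train/test and tokenizer come from the earlier steps:

from datasets import DatasetDict

emotions_encoded = DatasetDict({
    "train": emotions_dataset_train,
    "validation": emotions_dataset_test,
})

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

# Adds the "input_ids" and "attention_mask" columns to both splits.
emotions_encoded = emotions_encoded.map(tokenize, batched=True)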
@@ -223,15 +201,6 @@ if (should_train_model=='1'): #train model
 
             return (loss, outputs) if return_outputs else loss
 
-
-    # trainer = CustomTrainer(
-    # model=model,
-    # compute_metrics=compute_metrics,
-    # args=training_args,
-    # train_dataset=emotions_encoded["train"],
-    # eval_dataset=emotions_encoded["validation"],
-    # tokenizer=tokenizer )
-
     trainer = Trainer(
         model=model,
         args=training_args,
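The removed lines were a commented-out CustomTrainer; the commit keeps the plain Trainer. A minimal sketch of the full call, assuming the remaining keyword arguments mirror the deleted comment (only model= and args= are visible as context in the hunk); compute_metrics and training_args are defined elsewhere in app.py:

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
)
trainer.train()   # assumed next step; not shown in this hunk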
 