Update app.py
app.py CHANGED
@@ -50,20 +50,6 @@ if num_args == 6:
    batch_size_for_trainer = int(arg4) # batch sizes to send to trainer
    should_produce_eval_matrix = int(arg5) # should produce matrix?
    path_to_save_trained_model_to = arg6
-
-    print(f"should train model? : {arg1}")
-    print (f"file to train on : {arg2}")
-    print (f"file to evaluate on : {arg3}")
-    print (f"batch size : {arg4}")
-    print (f"should produce eval matrix : {arg5}")
-    print (f"path to save trained model : {arg6}")
-
-    print(f"should train model? : {should_train_model}")
-    print (f"file to train on : {train_file}")
-    print (f"file to evaluate on : {test_file}")
-    print (f"batch size : {batch_size_for_trainer}")
-    print (f"should produce eval matrix : {should_produce_eval_matrix}")
-    print (f"path to save trained model : {path_to_save_trained_model_to}")

else:
    print(f"Only {num_args-1} arguments after filename were passed out of 6")
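Note: the deleted lines printed the same six settings twice, once through the raw arg1-arg6 values and again through the named variables assigned from them. If an echo of the settings is still wanted, a single consolidated block along these lines would avoid the duplication (a sketch only, not part of this commit; it reuses the variable names from the surrounding code):

    settings = {
        "should train model?": should_train_model,
        "file to train on": train_file,
        "file to evaluate on": test_file,
        "batch size": batch_size_for_trainer,
        "should produce eval matrix": should_produce_eval_matrix,
        "path to save trained model": path_to_save_trained_model_to,
    }
    for name, value in settings.items():
        print(f"{name} : {value}")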
@@ -101,8 +87,6 @@ if (should_train_model=='1'): #train model

repo_name = "Reyad-Ahmmed/hf-data-timeframe"

- # Tokenization - get Tokenizer for roberta-base (must match model - also roberta-base)
- # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
# I made sure to add all the ones in the training and eval data to this list
# since we are training using data that only contains the left tag - we don't need right tags added to this list
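Note: after this change the tokenizer is loaded only from the bert_embeddings_finetune subfolder of the Hub repo, and the comments above refer to registering the data's tag tokens with it. A minimal sketch of that pattern, with placeholder tag strings (the real tag list lives elsewhere in app.py):

    from transformers import BertTokenizer

    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
    tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")

    # hypothetical tags; the script adds the left-hand tags that appear in its training/eval data
    new_tags = ["<left_tag_a>", "<left_tag_b>"]
    tokenizer.add_tokens(new_tags)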
@@ -112,8 +96,6 @@ if (should_train_model=='1'): #train model

# Model
model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
- # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-

# Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
model.resize_token_embeddings(len(tokenizer))
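Note: the kept lines load the classifier from the same subfolder and then call resize_token_embeddings so the embedding matrix covers the tokens added to the tokenizer above. A compact sketch of that ordering (label_mapping here is a placeholder; the real mapping is defined earlier in app.py):

    from transformers import BertForSequenceClassification

    label_mapping = {"no_timeframe": 0, "timeframe": 1}  # placeholder labels
    model = BertForSequenceClassification.from_pretrained(
        repo_name,
        subfolder="bert_embeddings_finetune",
        num_labels=len(label_mapping),
        output_attentions=True,
        output_hidden_states=True,
    ).to("cpu")

    # must run after tokenizer.add_tokens(...), otherwise the new token ids have no embedding rows
    model.resize_token_embeddings(len(tokenizer))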
@@ -153,8 +135,6 @@ if (should_train_model=='1'): #train model
emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

-
-
# Step 4: Split dataset into train and validation
# Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
# and one for "validation" with test dataset)
@@ -163,12 +143,10 @@ if (should_train_model=='1'): #train model
    'validation': emotions_dataset_test
})

-
# Define the tokenize function
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

-
# Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
# this will add the "input_id" and "attention_mask" columns
emotions_encoded = emotions_encoded.map(tokenize, batched=True)
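Note: the unchanged code around these hunks builds Hugging Face Datasets from plain dicts, wraps them in a DatasetDict, and maps the tokenize function over both splits; map with batched=True adds the "input_ids" (plural, despite the comment) and "attention_mask" columns. A self-contained sketch with made-up example rows:

    from datasets import Dataset, DatasetDict

    # placeholder rows; the real emotions_dict_train / emotions_dict_test are built earlier in app.py
    emotions_dict_train = {"text": ["phrase one", "phrase two"], "label": [0, 1]}
    emotions_dict_test = {"text": ["phrase three"], "label": [1]}

    emotions_encoded = DatasetDict({
        "train": Dataset.from_dict(emotions_dict_train),
        "validation": Dataset.from_dict(emotions_dict_test),
    })

    def tokenize(batch):
        return tokenizer(batch["text"], padding=True, truncation=True)

    emotions_encoded = emotions_encoded.map(tokenize, batched=True)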
@@ -223,15 +201,6 @@ if (should_train_model=='1'): #train model

return (loss, outputs) if return_outputs else loss

-
- # trainer = CustomTrainer(
- # model=model,
- # compute_metrics=compute_metrics,
- # args=training_args,
- # train_dataset=emotions_encoded["train"],
- # eval_dataset=emotions_encoded["validation"],
- # tokenizer=tokenizer )
-
trainer = Trainer(
    model=model,
    args=training_args,
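Note: this commit drops the commented-out CustomTrainer(...) call and keeps the stock Trainer. For reference, a minimal stock-Trainer setup built from the same pieces would look roughly like this (a sketch; the TrainingArguments values are placeholders, and compute_metrics is the function the deleted comment referred to):

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./results",                              # placeholder path
        num_train_epochs=3,                                  # placeholder value
        per_device_train_batch_size=batch_size_for_trainer,
        per_device_eval_batch_size=batch_size_for_trainer,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=emotions_encoded["train"],
        eval_dataset=emotions_encoded["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()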