Alaaeldin committed on
Commit
93bf619
·
verified ·
1 Parent(s): 7e18680

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +227 -46
train.py CHANGED
@@ -1,53 +1,234 @@
1
# File 1: Model Repo Code (train.py)
# This file contains steps 1 to 4

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Step 1: Load the Dataset
dataset = load_dataset("squad")

# Step 2: Preprocess the Dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span labels.

    Long contexts are split into overlapping windows (``stride=128``), so one
    input example may yield several features. For every feature the answer's
    character span is mapped to token indices; when the answer is not fully
    contained in the feature's context window both labels are set to 0.

    Args:
        examples: A batch from the dataset with "question", "context" and
            "answers" columns.

    Returns:
        The tokenizer output extended with "start_positions" and
        "end_positions" lists, one entry per feature.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        # Only the context may be truncated; the question must stay intact.
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two helper columns are only needed to build the labels.
    sample_mapping = encoded.pop("overflow_to_sample_mapping")
    offset_mapping = encoded.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for feature_idx, offsets in enumerate(offset_mapping):
        answers = examples["answers"][sample_mapping[feature_idx]]
        sequence_ids = encoded.sequence_ids(feature_idx)
        start_position = end_position = 0

        # Tokens whose sequence id is 1 belong to the context.
        context_tokens = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if context_tokens and len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            first, last = context_tokens[0], context_tokens[-1]

            # Label the span only if this window contains the whole answer.
            if offsets[first][0] <= start_char and offsets[last][1] >= end_char:
                idx = first
                while idx <= last and offsets[idx][0] <= start_char:
                    idx += 1
                start_position = idx - 1

                idx = last
                while idx >= first and offsets[idx][1] >= end_char:
                    idx -= 1
                end_position = idx + 1

        start_positions.append(start_position)
        end_positions.append(end_position)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded


# The original columns must be dropped: overflow windows change the row count,
# so they can no longer be aligned with the tokenized features.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Step 3: Train the Model
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # Automatically push to the Hugging Face Hub
    hub_model_id="username/qa_model_repo",  # Replace with your username and model repo name
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()

# Step 4: Push the Model and Tokenizer to Hugging Face Hub
model.push_to_hub("username/qa_model_repo")
tokenizer.push_to_hub("username/qa_model_repo")

print("Model and tokenizer pushed to Hugging Face Hub successfully!")
 
 
 
1
import logging
import os

# The Hugging Face libraries resolve their cache locations from HF_HOME when
# they are imported, so the variable has to be set BEFORE those imports.
os.environ["HF_HOME"] = "/tmp/cache"
os.makedirs("/tmp/cache", exist_ok=True)

import numpy as np  # noqa: E402
import torch  # noqa: E402
from datasets import load_dataset, load_metric  # noqa: E402
from tqdm.auto import tqdm  # noqa: E402
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer  # noqa: E402

# Set up logging: every record goes both to training.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Get Hugging Face token from the environment; fail fast when it is missing
# because the pipeline pushes to the Hub.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")

MODEL_HUB_ID = "Alaaeldin/example-model"  # Replace with your Hugging Face username
BASE_MODEL = "deepset/roberta-base-squad2"
31
+
32
class ModelTrainer:
    """Fine-tune ``BASE_MODEL`` on a small SQuAD subset and push it to the Hub.

    The pipeline (see :meth:`train`) loads the data, tokenizes it into
    fixed-length features with answer-span labels, trains with the
    ``transformers`` ``Trainer``, runs a smoke test on the trained model and
    uploads model and tokenizer to ``MODEL_HUB_ID``.
    """

    # Label assigned to both positions when a feature does not contain the
    # full answer (index 0 is the first token of every feature).
    NO_ANSWER_INDEX = 0

    def __init__(self):
        # The SQuAD metric scores decoded answer text keyed by example id.
        # compute_metrics only sees token positions, so it computes positional
        # accuracy with NumPy instead and no metric script is loaded here.
        # The attribute is kept so existing attribute access does not break.
        self.metric = None
        self.tokenizer = None
        self.model = None

    def load_tokenizer_and_model(self):
        """Load the tokenizer and QA model for ``BASE_MODEL``.

        Returns:
            True once both objects are stored on the instance.

        Raises:
            Exception: Whatever ``from_pretrained`` raises; it is logged with
                its traceback and re-raised unchanged.
        """
        try:
            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
            return True
        except Exception:
            logger.exception("Error loading tokenizer and model")
            raise

    @staticmethod
    def _locate_answer_span(offsets, sequence_ids, start_char, end_char):
        """Map an answer's character span to token indices within one feature.

        Args:
            offsets: Per-token ``(char_start, char_end)`` pairs.
            sequence_ids: Per-token sequence id; ``1`` marks context tokens,
                anything else (question, special tokens, padding) is skipped.
            start_char: Index of the answer's first character in the context.
            end_char: Index one past the answer's last character.

        Returns:
            ``(start_token, end_token)``, or ``(0, 0)`` when the feature has
            no context tokens or its context window does not cover the whole
            answer.
        """
        no_answer = (ModelTrainer.NO_ANSWER_INDEX, ModelTrainer.NO_ANSWER_INDEX)

        context_tokens = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_tokens:
            return no_answer
        first, last = context_tokens[0], context_tokens[-1]

        # The answer must lie completely inside this window of the context.
        if offsets[first][0] > start_char or offsets[last][1] < end_char:
            return no_answer

        # Last context token that starts at or before the answer start.
        idx = first
        while idx <= last and offsets[idx][0] <= start_char:
            idx += 1
        start_token = idx - 1

        # First context token that ends at or after the answer end.
        idx = last
        while idx >= first and offsets[idx][1] >= end_char:
            idx -= 1
        end_token = idx + 1

        return start_token, end_token

    @staticmethod
    def _build_features(examples, tokenizer):
        """Tokenize a batch and attach ``start_positions``/``end_positions``.

        Kept free of ``self`` so that ``Dataset.map`` worker processes only
        need the tokenizer, not the whole trainer (which also holds the model).
        """
        encoded = tokenizer(
            examples["question"],
            examples["context"],
            # Only the context may be truncated; the question stays intact.
            truncation="only_second",
            max_length=384,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # One example can produce several overlapping features; this maps each
        # feature back to the example it came from.
        sample_mapping = encoded["overflow_to_sample_mapping"]
        start_positions = []
        end_positions = []

        for feature_idx, offsets in enumerate(encoded["offset_mapping"]):
            answers = examples["answers"][sample_mapping[feature_idx]]
            span = (ModelTrainer.NO_ANSWER_INDEX, ModelTrainer.NO_ANSWER_INDEX)

            if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])
                span = ModelTrainer._locate_answer_span(
                    offsets,
                    encoded.sequence_ids(feature_idx),
                    start_char,
                    end_char,
                )

            start_positions.append(span[0])
            end_positions.append(span[1])

        encoded["start_positions"] = start_positions
        encoded["end_positions"] = end_positions
        return encoded

    def preprocess_function(self, examples):
        """Preprocess a batch of dataset examples with the loaded tokenizer.

        Args:
            examples: Batch with "question", "context" and "answers" columns.

        Returns:
            The tokenizer output extended with "start_positions" and
            "end_positions" (one entry per produced feature).

        Raises:
            Exception: Any preprocessing error, logged and re-raised.
        """
        try:
            return self._build_features(examples, self.tokenizer)
        except Exception:
            logger.exception("Error in preprocessing")
            raise

    def compute_metrics(self, eval_pred):
        """Compute positional accuracy of the predicted answer span.

        Args:
            eval_pred: ``(predictions, labels)`` where ``predictions`` holds
                the start and end logits and ``labels`` holds the gold start
                and end positions.

        Returns:
            Dict with ``start_accuracy``, ``end_accuracy`` and ``exact_match``
            (both positions correct), each a float in ``[0, 1]``.
        """
        predictions, labels = eval_pred
        start_logits, end_logits = predictions[0], predictions[1]

        start_hits = np.argmax(start_logits, axis=-1) == np.asarray(labels[0])
        end_hits = np.argmax(end_logits, axis=-1) == np.asarray(labels[1])

        return {
            "start_accuracy": float(np.mean(start_hits)),
            "end_accuracy": float(np.mean(end_hits)),
            "exact_match": float(np.mean(start_hits & end_hits)),
        }

    def validate_model_outputs(self, model, tokenizer):
        """Smoke-test the model on one hard-coded question/context pair.

        Args:
            model: Question-answering model to check.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            True when the forward pass yields tensor start and end logits.

        Raises:
            ValueError: If either logits output is not a ``torch.Tensor``.
            Exception: Any other failure, logged and re-raised.
        """
        logger.info("Validating model outputs...")
        was_training = model.training
        try:
            test_question = "What is the capital of France?"
            test_context = "Paris is the capital of France."

            inputs = tokenizer(
                test_question,
                test_context,
                return_tensors="pt",
                truncation=True,
                max_length=384,
                padding="max_length",
            )
            # After training the model may live on an accelerator; the inputs
            # have to be on the same device.
            inputs = inputs.to(model.device)

            # Inference only: disable dropout and gradient tracking.
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)

            if not (isinstance(outputs.start_logits, torch.Tensor) and
                    isinstance(outputs.end_logits, torch.Tensor)):
                raise ValueError("Model outputs validation failed")

            logger.info("Model validation successful!")
            return True
        except Exception:
            logger.exception("Model validation failed")
            raise
        finally:
            # Leave the model in the mode the caller had it in.
            if was_training:
                model.train()

    def train(self):
        """Run the full pipeline: load, preprocess, train, validate, push.

        Raises:
            Exception: Any failure in the pipeline, logged and re-raised.
        """
        try:
            logger.info("Starting training pipeline...")

            # Load dataset with a smaller subset
            logger.info("Loading SQuAD dataset...")
            dataset = load_dataset("squad", split={
                'train': 'train[:1000]',
                'validation': 'validation[:100]',
            })

            # Load tokenizer and model
            self.load_tokenizer_and_model()

            # Preprocess dataset
            logger.info("Preprocessing dataset...")
            tokenized_dataset = dataset.map(
                self._build_features,
                fn_kwargs={"tokenizer": self.tokenizer},
                batched=True,
                remove_columns=dataset["train"].column_names,
                num_proc=2,  # Reduced for Spaces
            )

            # Set up training arguments
            output_dir = "/tmp/results"
            os.makedirs(output_dir, exist_ok=True)

            # With about 1000 features, batch size 4 and 4 accumulation steps,
            # one epoch is only ~60 optimizer steps. Step-based settings must
            # stay well below that or evaluation, checkpointing and the end of
            # warmup would never be reached.
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="steps",
                eval_steps=20,
                save_strategy="steps",
                save_steps=20,
                learning_rate=3e-5,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=1,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                push_to_hub=True,
                hub_model_id=MODEL_HUB_ID,
                hub_token=HF_TOKEN,
                report_to=["tensorboard"],
                logging_dir="./logs",
                logging_steps=10,
                gradient_accumulation_steps=4,
                warmup_ratio=0.1,
            )

            # Initialize trainer
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["validation"],
                compute_metrics=self.compute_metrics,
            )

            # Train the model
            logger.info("Starting training...")
            trainer.train()

            # Validate model
            self.validate_model_outputs(self.model, self.tokenizer)

            # Save and push to hub
            logger.info("Saving and pushing model to Hugging Face Hub...")
            trainer.save_model()
            self.model.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
            self.tokenizer.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)

            logger.info("Training pipeline completed successfully!")

        except Exception:
            logger.exception("Training pipeline failed")
            raise
231
 
232
if __name__ == "__main__":
    # Script entry point: build the trainer and run the whole pipeline.
    ModelTrainer().train()