Alaaeldin committed on
Commit
cfcf87a
·
verified ·
1 Parent(s): 93bf619

Delete train.py

Browse files
Files changed (1) hide show
  1. train.py +0 -234
train.py DELETED
@@ -1,234 +0,0 @@
1
import logging
import os

# The Hugging Face libraries resolve their cache location from HF_HOME when
# they are first imported, so the cache directory must be configured *before*
# `datasets` / `transformers` are imported below. Setting it afterwards has no
# effect on where models and datasets are cached.
os.environ["HF_HOME"] = "/tmp/cache"
os.makedirs("/tmp/cache", exist_ok=True)

import numpy as np  # noqa: E402
import torch  # noqa: E402
from datasets import load_dataset, load_metric  # noqa: E402
from tqdm.auto import tqdm  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Set up logging: every record goes both to training.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Get the Hugging Face token from the environment; it is never hard-coded.
# An empty value is treated the same as a missing one, since it cannot
# authenticate either.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")

MODEL_HUB_ID = "Alaaeldin/example-model"  # Replace with your Hugging Face username
BASE_MODEL = "deepset/roberta-base-squad2"
31
-
32
class ModelTrainer:
    """Fine-tune an extractive question-answering model on a small SQuAD subset.

    The pipeline loads ``BASE_MODEL``, tokenizes the data into overlapping
    windows, trains with the Hugging Face ``Trainer``, runs a smoke test on
    the trained model and pushes the result to ``MODEL_HUB_ID``.
    """

    def __init__(self):
        # The "squad" metric scores answer *text*, not token indices, so it is
        # not used by compute_metrics(); the attribute is kept for callers
        # that post-process predictions into text themselves.
        self.metric = load_metric("squad")
        self.tokenizer = None
        self.model = None

    def load_tokenizer_and_model(self):
        """Load the tokenizer and model from ``BASE_MODEL``.

        Returns:
            True once both objects are stored on the instance.

        Raises:
            Exception: whatever ``from_pretrained`` raised, after logging it.
        """
        try:
            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
            return True
        except Exception as e:
            logger.error(f"Error loading tokenizer and model: {e}")
            raise

    @staticmethod
    def _locate_answer_span(offsets, sequence_ids, start_char, end_char):
        """Map a character span of the context onto token indices of one feature.

        Args:
            offsets: per-token ``(char_start, char_end)`` pairs of the feature.
            sequence_ids: per-token sequence id (``None`` for special/padding
                tokens, ``0`` for the question, ``1`` for the context).
            start_char: first character of the answer inside the context.
            end_char: one past the last character of the answer.

        Returns:
            ``(start_token, end_token)``, or ``(0, 0)`` when the answer is not
            fully contained in this feature's context window.
        """
        # Only context tokens carry offsets relative to the context string;
        # question offsets are relative to the question, and special/padding
        # tokens report (0, 0), so both must be excluded from the search.
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_tokens:
            return 0, 0
        context_start, context_end = context_tokens[0], context_tokens[-1]

        # The answer lies (partly) outside this window: label as "no answer".
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            return 0, 0

        # Last context token that starts at or before the answer start.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= start_char:
            token_start += 1
        token_start -= 1

        # First context token that ends at or after the answer end.
        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= end_char:
            token_end -= 1
        token_end += 1

        if token_start > token_end:
            return 0, 0
        return token_start, token_end

    def preprocess_function(self, examples):
        """Tokenize a batch of examples and attach answer token positions.

        Long contexts are split into overlapping windows (features). For each
        feature the first reference answer is located among the context
        tokens; features without an answer, or whose window does not contain
        the answer, are labelled ``(0, 0)``.

        Args:
            examples: batch with ``"question"``, ``"context"`` and
                ``"answers"`` columns (each answer holding ``"answer_start"``
                and ``"text"`` lists).

        Returns:
            The tokenizer output extended with ``"start_positions"`` and
            ``"end_positions"`` lists, one entry per feature.

        Raises:
            Exception: any tokenization/lookup error, after logging it.
        """
        try:
            tokenized_examples = self.tokenizer(
                examples["question"],
                examples["context"],
                # Never truncate the question; only the context is windowed.
                truncation="only_second",
                max_length=384,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )

            # One entry per feature: index of the example it was cut from.
            sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
            start_positions = []
            end_positions = []

            for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
                answers = examples["answers"][sample_mapping[i]]

                # Default label for unanswerable / out-of-window features.
                span = (0, 0)
                if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
                    start_char = answers["answer_start"][0]
                    end_char = start_char + len(answers["text"][0])
                    span = self._locate_answer_span(
                        offsets,
                        tokenized_examples.sequence_ids(i),
                        start_char,
                        end_char,
                    )

                start_positions.append(span[0])
                end_positions.append(span[1])

            tokenized_examples["start_positions"] = start_positions
            tokenized_examples["end_positions"] = end_positions
            return tokenized_examples
        except Exception as e:
            logger.error(f"Error in preprocessing: {e}")
            raise

    def compute_metrics(self, eval_pred):
        """Compute token-position accuracy for an evaluation pass.

        Args:
            eval_pred: pair ``(predictions, labels)`` where ``predictions``
                starts with ``(start_logits, end_logits)`` and ``labels`` is
                ``(start_positions, end_positions)``.

        Returns:
            Dict with ``"start_accuracy"``, ``"end_accuracy"`` and
            ``"exact_match"`` (both positions correct), each a float in
            ``[0, 1]``.
        """
        predictions, labels = eval_pred
        start_logits, end_logits = predictions[0], predictions[1]
        start_labels = np.asarray(labels[0])
        end_labels = np.asarray(labels[1])

        start_hits = np.argmax(start_logits, axis=-1) == start_labels
        end_hits = np.argmax(end_logits, axis=-1) == end_labels

        if start_hits.size == 0:
            # Avoid NaN (and a numpy warning) on an empty evaluation set.
            return {"start_accuracy": 0.0, "end_accuracy": 0.0, "exact_match": 0.0}

        return {
            "start_accuracy": float(np.mean(start_hits)),
            "end_accuracy": float(np.mean(end_hits)),
            "exact_match": float(np.mean(start_hits & end_hits)),
        }

    def validate_model_outputs(self, model, tokenizer):
        """Run one fixed question through the model as a smoke test.

        Args:
            model: question-answering model to check.
            tokenizer: tokenizer matching ``model``.

        Returns:
            True when the model yields tensor start/end logits.

        Raises:
            ValueError: if either logits output is not a ``torch.Tensor``.
            Exception: any other failure of the forward pass, after logging.
        """
        logger.info("Validating model outputs...")
        try:
            test_question = "What is the capital of France?"
            test_context = "Paris is the capital of France."

            inputs = tokenizer(
                test_question,
                test_context,
                return_tensors="pt",
                truncation=True,
                max_length=384,
                padding="max_length",
            )

            # After training the model may live on an accelerator while the
            # tokenizer always produces CPU tensors; align them first.
            first_param = next(model.parameters(), None)
            if first_param is not None:
                inputs = {
                    name: tensor.to(first_param.device)
                    for name, tensor in inputs.items()
                }

            # Inference only: no need to build an autograd graph.
            with torch.no_grad():
                outputs = model(**inputs)

            if not (isinstance(outputs.start_logits, torch.Tensor) and
                    isinstance(outputs.end_logits, torch.Tensor)):
                raise ValueError("Model outputs validation failed")

            logger.info("Model validation successful!")
            return True
        except Exception as e:
            logger.error(f"Model validation failed: {e}")
            raise

    def train(self):
        """Run the whole pipeline: load, preprocess, train, validate, push.

        Raises:
            Exception: any failure of a pipeline stage, after logging it.
        """
        try:
            logger.info("Starting training pipeline...")

            # Load dataset with a smaller subset
            logger.info("Loading SQuAD dataset...")
            dataset = load_dataset("squad", split={
                'train': 'train[:1000]',
                'validation': 'validation[:100]',
            })

            # Load tokenizer and model
            self.load_tokenizer_and_model()

            # Preprocess dataset; the raw columns are dropped because one
            # example can expand into several tokenized features.
            logger.info("Preprocessing dataset...")
            tokenized_dataset = dataset.map(
                self.preprocess_function,
                batched=True,
                remove_columns=dataset["train"].column_names,
                num_proc=2,  # Reduced for Spaces
            )

            # Set up training arguments
            output_dir = "/tmp/results"
            os.makedirs(output_dir, exist_ok=True)

            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="steps",
                eval_steps=100,
                save_strategy="steps",
                save_steps=100,
                learning_rate=3e-5,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=1,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                push_to_hub=True,
                hub_model_id=MODEL_HUB_ID,
                hub_token=HF_TOKEN,
                report_to=["tensorboard"],
                logging_dir="./logs",
                logging_steps=50,
                gradient_accumulation_steps=4,
                warmup_steps=100,
            )

            # Initialize trainer
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["validation"],
                compute_metrics=self.compute_metrics,
            )

            # Train the model
            logger.info("Starting training...")
            trainer.train()

            # Validate model
            self.validate_model_outputs(self.model, self.tokenizer)

            # Save and push to hub
            logger.info("Saving and pushing model to Hugging Face Hub...")
            trainer.save_model()
            self.model.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
            self.tokenizer.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)

            logger.info("Training pipeline completed successfully!")

        except Exception as e:
            logger.error(f"Training pipeline failed: {e}")
            raise
231
-
232
if __name__ == "__main__":
    # Script entry point: build the trainer and run the full fine-tuning
    # pipeline (load, preprocess, train, validate, push to the Hub).
    trainer = ModelTrainer()
    trainer.train()