import logging
import os

# Point the Hugging Face cache at /tmp *before* importing datasets/transformers:
# both libraries resolve HF_HOME when they are first imported, so setting it
# afterwards has no effect.
os.environ["HF_HOME"] = "/tmp/cache"
os.makedirs("/tmp/cache", exist_ok=True)

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")

MODEL_HUB_ID = "Alaaeldin/example-model"
BASE_MODEL = "deepset/roberta-base-squad2"
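
# Note: pushing requires write access to MODEL_HUB_ID; if you are reproducing
# this script, point it at a repo under your own namespace.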

class ModelTrainer:
    def __init__(self):
        # The tokenizer and model are loaded lazily by load_tokenizer_and_model().
        # (datasets.load_metric was removed upstream; metrics are computed
        # directly in compute_metrics below.)
        self.tokenizer = None
        self.model = None

    def load_tokenizer_and_model(self):
        """Load the tokenizer and model with error handling."""
        try:
            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
            return True
        except Exception as e:
            logger.error(f"Error loading tokenizer and model: {e}")
            raise

    def preprocess_function(self, examples):
        """Tokenize question/context pairs and label answer token positions."""
        try:
            tokenized_examples = self.tokenizer(
                examples["question"],
                examples["context"],
                truncation="only_second",  # truncate only the context, never the question
                max_length=384,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )

            # Long contexts are split into several overlapping features;
            # sample_mapping maps each feature back to its source example.
            sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
            offset_mapping = tokenized_examples.pop("offset_mapping")
            tokenized_examples["start_positions"] = []
            tokenized_examples["end_positions"] = []

            for i, offsets in enumerate(offset_mapping):
                sample_idx = sample_mapping[i]
                answers = examples["answers"][sample_idx]
                # sequence_ids distinguishes question tokens (0) from context
                # tokens (1); character offsets alone cannot, since both
                # sequences start at character 0.
                sequence_ids = tokenized_examples.sequence_ids(i)

                # Default label: the (0, 0) "answer not in this feature" span.
                start_position = 0
                end_position = 0

                if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
                    start_char = answers["answer_start"][0]
                    end_char = start_char + len(answers["text"][0])

                    # Locate the first and last context tokens of this feature.
                    context_start = 0
                    while sequence_ids[context_start] != 1:
                        context_start += 1
                    context_end = len(sequence_ids) - 1
                    while sequence_ids[context_end] != 1:
                        context_end -= 1

                    # Label the span only if the answer lies entirely inside
                    # this feature's context window.
                    if (offsets[context_start][0] <= start_char
                            and offsets[context_end][1] >= end_char):
                        token_start_index = context_start
                        while (token_start_index <= context_end
                               and offsets[token_start_index][0] <= start_char):
                            token_start_index += 1
                        start_position = token_start_index - 1

                        token_end_index = context_end
                        while (token_end_index >= context_start
                               and offsets[token_end_index][1] >= end_char):
                            token_end_index -= 1
                        end_position = token_end_index + 1

                tokenized_examples["start_positions"].append(start_position)
                tokenized_examples["end_positions"].append(end_position)

            return tokenized_examples
        except Exception as e:
            logger.error(f"Error in preprocessing: {e}")
            raise

    def compute_metrics(self, eval_pred):
        """Compute token-level evaluation metrics.

        The official SQuAD EM/F1 metric compares decoded answer *strings*,
        which requires mapping logits back to character spans; the
        position-level accuracies below are a cheap training-time proxy
        (see text_level_squad_metrics later in this file for a sketch of
        text-level scoring).
        """
        predictions, labels = eval_pred
        start_logits, end_logits = predictions

        start_predictions = np.argmax(start_logits, axis=-1)
        end_predictions = np.argmax(end_logits, axis=-1)

        start_match = start_predictions == labels[0]
        end_match = end_predictions == labels[1]
        return {
            "start_accuracy": float(np.mean(start_match)),
            "end_accuracy": float(np.mean(end_match)),
            "position_exact_match": float(np.mean(start_match & end_match)),
        }

    def validate_model_outputs(self, model, tokenizer):
        """Validate model outputs with a test example."""
        logger.info("Validating model outputs...")
        try:
            test_question = "What is the capital of France?"
            test_context = "Paris is the capital of France."

            inputs = tokenizer(
                test_question,
                test_context,
                return_tensors="pt",
                truncation=True,
                max_length=384,
                padding="max_length"
            )

            # Run the smoke test on whichever device the model ended up on
            # after training, and skip gradient tracking.
            device = next(model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)

            if not (isinstance(outputs.start_logits, torch.Tensor) and
                    isinstance(outputs.end_logits, torch.Tensor)):
                raise ValueError("Model outputs validation failed")

            logger.info("Model validation successful!")
            return True
        except Exception as e:
            logger.error(f"Model validation failed: {e}")
            raise
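
    # For reference, the predicted span from the smoke test above can be
    # decoded back to text (a minimal sketch; `inputs`/`outputs` as in the
    # method above):
    #
    #   start = int(outputs.start_logits.argmax())
    #   end = int(outputs.end_logits.argmax())
    #   answer = tokenizer.decode(inputs["input_ids"][0][start : end + 1])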

    def train(self):
        """Main training function."""
        try:
            logger.info("Starting training pipeline...")

            logger.info("Loading SQuAD dataset...")
            # load_dataset's `split` argument takes a string (or list of
            # strings), not a dict, so build the DatasetDict explicitly.
            # The slices keep this run small; drop them to train on the
            # full dataset.
            dataset = DatasetDict({
                "train": load_dataset("squad", split="train[:1000]"),
                "validation": load_dataset("squad", split="validation[:100]"),
            })

            self.load_tokenizer_and_model()

            logger.info("Preprocessing dataset...")
            # Single-process map: num_proc > 1 would have to pickle this
            # ModelTrainer (including the loaded model) into every worker.
            tokenized_dataset = dataset.map(
                self.preprocess_function,
                batched=True,
                remove_columns=dataset["train"].column_names,
            )

            output_dir = "/tmp/results"
            os.makedirs(output_dir, exist_ok=True)

            # With the 1,000-example slice there are only ~60 optimizer steps
            # (batch 4 x gradient accumulation 4), so evaluate/save every 50
            # steps; at 100, load_best_model_at_end would never see a checkpoint.
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="steps",
                eval_steps=50,
                save_strategy="steps",
                save_steps=50,
                learning_rate=3e-5,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=1,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                push_to_hub=True,
                hub_model_id=MODEL_HUB_ID,
                hub_token=HF_TOKEN,
                report_to=["tensorboard"],
                logging_dir="./logs",
                logging_steps=50,
                gradient_accumulation_steps=4,
                warmup_steps=100,
            )

            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["validation"],
                tokenizer=self.tokenizer,
                compute_metrics=self.compute_metrics,
            )

            logger.info("Starting training...")
            trainer.train()

            self.validate_model_outputs(self.model, self.tokenizer)

            logger.info("Saving and pushing model to Hugging Face Hub...")
            trainer.save_model()
            # `use_auth_token` is deprecated; pass the token via `token`.
            self.model.push_to_hub(MODEL_HUB_ID, token=HF_TOKEN)
            self.tokenizer.push_to_hub(MODEL_HUB_ID, token=HF_TOKEN)

            logger.info("Training pipeline completed successfully!")
        except Exception as e:
            logger.error(f"Training pipeline failed: {e}")
            raise
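

# --- Optional: text-level SQuAD metrics -------------------------------------
# The position-level scores in compute_metrics are a proxy; the official SQuAD
# EM/F1 numbers compare decoded answer strings. The helper below is a minimal
# sketch of that, not the full n-best search of the official run_qa.py example.
# It assumes the `evaluate` package is installed and that `features` come from
# a *validation-time* preprocessing pass that kept "offset_mapping" and
# "overflow_to_sample_mapping" (the training preprocessing above pops them).
def text_level_squad_metrics(examples, features, start_logits, end_logits):
    """Decode the best span per feature and score with the SQuAD metric."""
    import collections

    import evaluate  # successor to the removed datasets.load_metric

    squad_metric = evaluate.load("squad")
    best = collections.defaultdict(lambda: (float("-inf"), ""))

    for i, sample_idx in enumerate(features["overflow_to_sample_mapping"]):
        offsets = features["offset_mapping"][i]
        start = int(np.argmax(start_logits[i]))
        end = int(np.argmax(end_logits[i]))
        # Skip degenerate (reversed) spans.
        if start > end:
            continue
        score = float(start_logits[i][start] + end_logits[i][end])
        context = examples["context"][sample_idx]
        text = context[offsets[start][0] : offsets[end][1]]
        # Keep only the highest-scoring answer per source example.
        if score > best[sample_idx][0]:
            best[sample_idx] = (score, text)

    predictions = [
        {"id": examples["id"][idx], "prediction_text": text}
        for idx, (_, text) in best.items()
    ]
    references = [
        {"id": examples["id"][idx], "answers": examples["answers"][idx]}
        for idx in best
    ]
    return squad_metric.compute(predictions=predictions, references=references)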

if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()
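
# Example invocation (assuming this file is saved as train_qa.py):
#   HF_TOKEN=<your write token> python train_qa.py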