example-model / train.py
Alaaeldin's picture
Update train.py
93bf619 verified
raw
history blame
8.54 kB
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import os
import logging
import numpy as np
import torch
from tqdm.auto import tqdm
# Logging: every record at INFO or above goes both to ``training.log`` and to
# the console stream, using a timestamped "time - level - message" layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Cache location for Hugging Face downloads; the directory is created up front.
# NOTE(review): this assignment runs after `datasets` / `transformers` were
# imported at the top of the file. If those libraries resolve their cache path
# at import time, this override may not take effect - confirm, and move it
# above the imports if so.
os.makedirs("/tmp/cache", exist_ok=True)
os.environ["HF_HOME"] = "/tmp/cache"

# The Hub access token is read from the environment only (never hard-coded);
# fail fast at import time when it is absent.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")

# Checkpoint that fine-tuning starts from, and the Hub repository the result
# is pushed to (replace the namespace with your own Hugging Face username).
BASE_MODEL = "deepset/roberta-base-squad2"
MODEL_HUB_ID = "Alaaeldin/example-model"
class ModelTrainer:
    """Fine-tune an extractive question-answering model on a SQuAD subset.

    The pipeline (see :meth:`train`) loads ``BASE_MODEL``, tokenizes a small
    slice of SQuAD into fixed-length windows with start/end token labels,
    trains with the Hugging Face ``Trainer``, runs a smoke test on the
    resulting model, and pushes model and tokenizer to ``MODEL_HUB_ID``.

    Attributes:
        metric: The ``squad`` metric object loaded in ``__init__``.
        tokenizer: Tokenizer for ``BASE_MODEL``; ``None`` until
            :meth:`load_tokenizer_and_model` has run.
        model: Question-answering model; ``None`` until
            :meth:`load_tokenizer_and_model` has run.
    """

    # Label used when the answer is missing or not contained in a window.
    # The original code used index 0 for this purpose.
    # NOTE(review): presumably the leading special token of each window for
    # this tokenizer family - confirm if BASE_MODEL changes.
    _NO_ANSWER_INDEX = 0

    def __init__(self):
        """Load the SQuAD metric and create empty tokenizer/model slots."""
        # Kept so the public ``metric`` attribute stays available. It is not
        # used by ``compute_metrics``: the text-level SQuAD metric needs
        # decoded answer strings and example ids, which are not available
        # from raw logits and token-index labels.
        self.metric = load_metric("squad")
        self.tokenizer = None
        self.model = None

    def load_tokenizer_and_model(self) -> bool:
        """Load the tokenizer and model for ``BASE_MODEL``.

        Returns:
            ``True`` once both objects are stored on the instance.

        Raises:
            Exception: Any loading error is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
            return True
        except Exception as e:
            logger.error(f"Error loading tokenizer and model: {e}")
            raise

    @staticmethod
    def _find_answer_span(offsets, sequence_ids, start_char, end_char, default_index=0) -> tuple:
        """Map a character-level answer span to token indices within one window.

        Only tokens whose sequence id is ``1`` (the second sequence of the
        pair, i.e. the context) are considered. Question tokens carry offsets
        relative to the question string, and special/padding tokens carry
        ``(0, 0)``, so scanning the whole window yields wrong labels.

        Args:
            offsets: Per-token ``(char_start, char_end)`` pairs for the window.
            sequence_ids: Per-token sequence id (``None``, ``0`` or ``1``).
            start_char: Inclusive character index where the answer begins.
            end_char: Exclusive character index where the answer ends.
            default_index: Token index returned for both ends when the answer
                is not fully contained in this window.

        Returns:
            ``(start_token, end_token)``, both inclusive token indices.
        """
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_tokens:
            return default_index, default_index
        ctx_first, ctx_last = context_tokens[0], context_tokens[-1]

        # The answer must lie entirely inside the context slice of this window.
        if offsets[ctx_first][0] > start_char or offsets[ctx_last][1] < end_char:
            return default_index, default_index

        # Last context token that starts at or before the answer start.
        token_start = ctx_first
        while token_start <= ctx_last and offsets[token_start][0] <= start_char:
            token_start += 1
        token_start -= 1

        # First context token that ends at or after the answer end.
        token_end = ctx_last
        while token_end >= ctx_first and offsets[token_end][1] >= end_char:
            token_end -= 1
        token_end += 1

        if token_start > token_end:
            return default_index, default_index
        return token_start, token_end

    def preprocess_function(self, examples):
        """Tokenize a batch of SQuAD examples and attach answer token labels.

        Each question/context pair is encoded into one or more windows of 384
        tokens (stride 128, padded to full length). For every window the first
        listed answer is located among the context tokens; windows without a
        fully contained answer are labelled with ``_NO_ANSWER_INDEX`` for both
        positions.

        Args:
            examples: Batch mapping with ``"question"``, ``"context"`` and
                ``"answers"`` columns (``answers`` entries expose
                ``"answer_start"`` and ``"text"`` lists).

        Returns:
            The tokenizer output with ``"start_positions"`` and
            ``"end_positions"`` lists added, one entry per window.

        Raises:
            Exception: Any preprocessing error is logged and re-raised.
        """
        try:
            tokenized_examples = self.tokenizer(
                examples["question"],
                examples["context"],
                truncation=True,
                max_length=384,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )
            # One long context can yield several windows; this maps each
            # window back to the index of the example it came from.
            sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
            start_positions = []
            end_positions = []
            for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
                answers = examples["answers"][sample_mapping[i]]
                start_position = self._NO_ANSWER_INDEX
                end_position = self._NO_ANSWER_INDEX
                if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
                    start_char = answers["answer_start"][0]
                    end_char = start_char + len(answers["text"][0])
                    start_position, end_position = self._find_answer_span(
                        offsets,
                        tokenized_examples.sequence_ids(i),
                        start_char,
                        end_char,
                        self._NO_ANSWER_INDEX,
                    )
                start_positions.append(start_position)
                end_positions.append(end_position)
            tokenized_examples["start_positions"] = start_positions
            tokenized_examples["end_positions"] = end_positions
            return tokenized_examples
        except Exception as e:
            logger.error(f"Error in preprocessing: {e}")
            raise

    def compute_metrics(self, eval_pred) -> dict:
        """Compute token-position accuracy from evaluation logits.

        Args:
            eval_pred: Pair ``(predictions, labels)`` where ``predictions``
                holds ``(start_logits, end_logits)`` and ``labels`` holds
                ``(start_positions, end_positions)``.

        Returns:
            Dict with ``"start_accuracy"``, ``"end_accuracy"`` and
            ``"exact_match"`` (both positions correct), each a float in
            ``[0, 1]``.
        """
        predictions, labels = eval_pred
        start_logits, end_logits = predictions[0], predictions[1]
        start_predictions = np.argmax(start_logits, axis=-1)
        end_predictions = np.argmax(end_logits, axis=-1)
        start_hits = start_predictions == np.asarray(labels[0])
        end_hits = end_predictions == np.asarray(labels[1])
        return {
            "start_accuracy": float(np.mean(start_hits)),
            "end_accuracy": float(np.mean(end_hits)),
            "exact_match": float(np.mean(start_hits & end_hits)),
        }

    def validate_model_outputs(self, model, tokenizer) -> bool:
        """Smoke-test the model on one hard-coded question/context pair.

        Args:
            model: Question-answering model to call.
            tokenizer: Tokenizer used to encode the test pair.

        Returns:
            ``True`` when both ``start_logits`` and ``end_logits`` are tensors.

        Raises:
            ValueError: If either output is not a ``torch.Tensor``.
            Exception: Any other failure is logged and re-raised.
        """
        logger.info("Validating model outputs...")
        try:
            test_question = "What is the capital of France?"
            test_context = "Paris is the capital of France."
            inputs = tokenizer(
                test_question,
                test_context,
                return_tensors="pt",
                truncation=True,
                max_length=384,
                padding="max_length"
            )
            # After training the model may live on an accelerator; inputs
            # must be on the same device or the forward pass fails.
            device = next(model.parameters()).device
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            # Only the output types are inspected, so no gradients are needed.
            with torch.no_grad():
                outputs = model(**inputs)
            if not (isinstance(outputs.start_logits, torch.Tensor) and
                    isinstance(outputs.end_logits, torch.Tensor)):
                raise ValueError("Model outputs validation failed")
            logger.info("Model validation successful!")
            return True
        except Exception as e:
            logger.error(f"Model validation failed: {e}")
            raise

    def train(self) -> None:
        """Run the full pipeline: load data, preprocess, train, validate, push.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            logger.info("Starting training pipeline...")
            # Load dataset with a smaller subset
            logger.info("Loading SQuAD dataset...")
            dataset = load_dataset("squad", split={
                'train': 'train[:1000]',
                'validation': 'validation[:100]'
            })
            # Load tokenizer and model
            self.load_tokenizer_and_model()
            # Preprocess dataset; the raw text columns are dropped because
            # one example may expand into several tokenized windows.
            logger.info("Preprocessing dataset...")
            tokenized_dataset = dataset.map(
                self.preprocess_function,
                batched=True,
                remove_columns=dataset["train"].column_names,
                num_proc=2  # Reduced for Spaces
            )
            # Set up training arguments
            output_dir = "/tmp/results"
            os.makedirs(output_dir, exist_ok=True)
            # NOTE(review): with ~1000 training examples, batch size 4 and
            # gradient accumulation 4, one epoch is on the order of tens of
            # optimizer steps, so eval_steps / save_steps / warmup_steps of
            # 100 may never be reached - confirm these are intended.
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="steps",
                eval_steps=100,
                save_strategy="steps",
                save_steps=100,
                learning_rate=3e-5,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=1,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                push_to_hub=True,
                hub_model_id=MODEL_HUB_ID,
                hub_token=HF_TOKEN,
                report_to=["tensorboard"],
                logging_dir="./logs",
                logging_steps=50,
                gradient_accumulation_steps=4,
                warmup_steps=100,
            )
            # Initialize trainer
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["validation"],
                compute_metrics=self.compute_metrics,
            )
            # Train the model
            logger.info("Starting training...")
            trainer.train()
            # Validate model
            self.validate_model_outputs(self.model, self.tokenizer)
            # Save and push to hub
            logger.info("Saving and pushing model to Hugging Face Hub...")
            trainer.save_model()
            self.model.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
            self.tokenizer.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
            logger.info("Training pipeline completed successfully!")
        except Exception as e:
            logger.error(f"Training pipeline failed: {e}")
            raise
if __name__ == "__main__":
    # Script entry point: build the trainer and run the whole pipeline once.
    ModelTrainer().train()