import logging
import os

# Point the Hugging Face cache at /tmp *before* importing datasets/transformers:
# both libraries resolve HF_HOME when they are first imported, so setting it
# afterwards has no effect.
os.environ["HF_HOME"] = "/tmp/cache"
os.makedirs("/tmp/cache", exist_ok=True)

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")

MODEL_HUB_ID = "Alaaeldin/example-model"
BASE_MODEL = "deepset/roberta-base-squad2"
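
# Note: pushing requires write access to MODEL_HUB_ID; if you are reproducing
# this script, point it at a repo under your own namespace.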

class ModelTrainer:
    def __init__(self):
        # The tokenizer and model are loaded lazily by load_tokenizer_and_model().
        # (datasets.load_metric was removed upstream; metrics are computed
        # directly in compute_metrics below.)
        self.tokenizer = None
        self.model = None

    def load_tokenizer_and_model(self):
        """Load the tokenizer and model with error handling."""
        try:
            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
            return True
        except Exception as e:
            logger.error(f"Error loading tokenizer and model: {e}")
            raise

    def preprocess_function(self, examples):
        """Tokenize question/context pairs and label answer token positions."""
        try:
            tokenized_examples = self.tokenizer(
                examples["question"],
                examples["context"],
                truncation="only_second",  # truncate only the context, never the question
                max_length=384,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )

            # Long contexts are split into several overlapping features;
            # sample_mapping maps each feature back to its source example.
            sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
            offset_mapping = tokenized_examples.pop("offset_mapping")
            tokenized_examples["start_positions"] = []
            tokenized_examples["end_positions"] = []

            for i, offsets in enumerate(offset_mapping):
                sample_idx = sample_mapping[i]
                answers = examples["answers"][sample_idx]
                # sequence_ids distinguishes question tokens (0) from context
                # tokens (1); character offsets alone cannot, since both
                # sequences start at character 0.
                sequence_ids = tokenized_examples.sequence_ids(i)

                # Default label: the (0, 0) "answer not in this feature" span.
                start_position = 0
                end_position = 0

                if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
                    start_char = answers["answer_start"][0]
                    end_char = start_char + len(answers["text"][0])

                    # Locate the first and last context tokens of this feature.
                    context_start = 0
                    while sequence_ids[context_start] != 1:
                        context_start += 1
                    context_end = len(sequence_ids) - 1
                    while sequence_ids[context_end] != 1:
                        context_end -= 1

                    # Label the span only if the answer lies entirely inside
                    # this feature's context window.
                    if (offsets[context_start][0] <= start_char
                            and offsets[context_end][1] >= end_char):
                        token_start_index = context_start
                        while (token_start_index <= context_end
                               and offsets[token_start_index][0] <= start_char):
                            token_start_index += 1
                        start_position = token_start_index - 1

                        token_end_index = context_end
                        while (token_end_index >= context_start
                               and offsets[token_end_index][1] >= end_char):
                            token_end_index -= 1
                        end_position = token_end_index + 1

                tokenized_examples["start_positions"].append(start_position)
                tokenized_examples["end_positions"].append(end_position)

            return tokenized_examples
        except Exception as e:
            logger.error(f"Error in preprocessing: {e}")
            raise

    def compute_metrics(self, eval_pred):
        """Compute token-level evaluation metrics.

        The official SQuAD EM/F1 metric compares decoded answer *strings*,
        which requires mapping logits back to character spans; the
        position-level accuracies below are a cheap training-time proxy
        (see text_level_squad_metrics later in this file for a sketch of
        text-level scoring).
        """
        predictions, labels = eval_pred
        start_logits, end_logits = predictions

        start_predictions = np.argmax(start_logits, axis=-1)
        end_predictions = np.argmax(end_logits, axis=-1)

        start_match = start_predictions == labels[0]
        end_match = end_predictions == labels[1]
        return {
            "start_accuracy": float(np.mean(start_match)),
            "end_accuracy": float(np.mean(end_match)),
            "position_exact_match": float(np.mean(start_match & end_match)),
        }

    def validate_model_outputs(self, model, tokenizer):
        """Validate model outputs with a test example."""
        logger.info("Validating model outputs...")
        try:
            test_question = "What is the capital of France?"
            test_context = "Paris is the capital of France."

            inputs = tokenizer(
                test_question,
                test_context,
                return_tensors="pt",
                truncation=True,
                max_length=384,
                padding="max_length"
            )

            # Run the smoke test on whichever device the model ended up on
            # after training, and skip gradient tracking.
            device = next(model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)

            if not (isinstance(outputs.start_logits, torch.Tensor) and
                    isinstance(outputs.end_logits, torch.Tensor)):
                raise ValueError("Model outputs validation failed")

            logger.info("Model validation successful!")
            return True
        except Exception as e:
            logger.error(f"Model validation failed: {e}")
            raise
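
    # For reference, the predicted span from the smoke test above can be
    # decoded back to text (a minimal sketch; `inputs`/`outputs` as in the
    # method above):
    #
    #   start = int(outputs.start_logits.argmax())
    #   end = int(outputs.end_logits.argmax())
    #   answer = tokenizer.decode(inputs["input_ids"][0][start : end + 1])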

    def train(self):
        """Main training function."""
        try:
            logger.info("Starting training pipeline...")

            logger.info("Loading SQuAD dataset...")
            # load_dataset's `split` argument takes a string (or list of
            # strings), not a dict, so build the DatasetDict explicitly.
            # The slices keep this run small; drop them to train on the
            # full dataset.
            dataset = DatasetDict({
                "train": load_dataset("squad", split="train[:1000]"),
                "validation": load_dataset("squad", split="validation[:100]"),
            })

            self.load_tokenizer_and_model()

            logger.info("Preprocessing dataset...")
            # Single-process map: num_proc > 1 would have to pickle this
            # ModelTrainer (including the loaded model) into every worker.
            tokenized_dataset = dataset.map(
                self.preprocess_function,
                batched=True,
                remove_columns=dataset["train"].column_names,
            )

            output_dir = "/tmp/results"
            os.makedirs(output_dir, exist_ok=True)

            # With the 1,000-example slice there are only ~60 optimizer steps
            # (batch 4 x gradient accumulation 4), so evaluate/save every 50
            # steps; at 100, load_best_model_at_end would never see a checkpoint.
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="steps",
                eval_steps=50,
                save_strategy="steps",
                save_steps=50,
                learning_rate=3e-5,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=1,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                push_to_hub=True,
                hub_model_id=MODEL_HUB_ID,
                hub_token=HF_TOKEN,
                report_to=["tensorboard"],
                logging_dir="./logs",
                logging_steps=50,
                gradient_accumulation_steps=4,
                warmup_steps=100,
            )

            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["validation"],
                tokenizer=self.tokenizer,
                compute_metrics=self.compute_metrics,
            )

            logger.info("Starting training...")
            trainer.train()

            self.validate_model_outputs(self.model, self.tokenizer)

            logger.info("Saving and pushing model to Hugging Face Hub...")
            trainer.save_model()
            # `use_auth_token` is deprecated; pass the token via `token`.
            self.model.push_to_hub(MODEL_HUB_ID, token=HF_TOKEN)
            self.tokenizer.push_to_hub(MODEL_HUB_ID, token=HF_TOKEN)

            logger.info("Training pipeline completed successfully!")
        except Exception as e:
            logger.error(f"Training pipeline failed: {e}")
            raise
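

# --- Optional: text-level SQuAD metrics -------------------------------------
# The position-level scores in compute_metrics are a proxy; the official SQuAD
# EM/F1 numbers compare decoded answer strings. The helper below is a minimal
# sketch of that, not the full n-best search of the official run_qa.py example.
# It assumes the `evaluate` package is installed and that `features` come from
# a *validation-time* preprocessing pass that kept "offset_mapping" and
# "overflow_to_sample_mapping" (the training preprocessing above pops them).
def text_level_squad_metrics(examples, features, start_logits, end_logits):
    """Decode the best span per feature and score with the SQuAD metric."""
    import collections

    import evaluate  # successor to the removed datasets.load_metric

    squad_metric = evaluate.load("squad")
    best = collections.defaultdict(lambda: (float("-inf"), ""))

    for i, sample_idx in enumerate(features["overflow_to_sample_mapping"]):
        offsets = features["offset_mapping"][i]
        start = int(np.argmax(start_logits[i]))
        end = int(np.argmax(end_logits[i]))
        # Skip degenerate (reversed) spans.
        if start > end:
            continue
        score = float(start_logits[i][start] + end_logits[i][end])
        context = examples["context"][sample_idx]
        text = context[offsets[start][0] : offsets[end][1]]
        # Keep only the highest-scoring answer per source example.
        if score > best[sample_idx][0]:
            best[sample_idx] = (score, text)

    predictions = [
        {"id": examples["id"][idx], "prediction_text": text}
        for idx, (_, text) in best.items()
    ]
    references = [
        {"id": examples["id"][idx], "answers": examples["answers"][idx]}
        for idx in best
    ]
    return squad_metric.compute(predictions=predictions, references=references)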

if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()
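
# Example invocation (assuming this file is saved as train_qa.py):
#   HF_TOKEN=<your write token> python train_qa.py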