import torch
from transformers import Trainer, TrainingArguments
from app.model.model import NigerianLanguageModel
from app.model.config import ModelConfig


def train_model(model: NigerianLanguageModel, train_dataset, eval_dataset=None):
    """Fine-tune the wrapped model with a Hugging Face ``Trainer``.

    Epoch count, per-device batch size and learning rate are read from
    ``model.config``; checkpoints go to the ``outputs`` directory every
    500 steps.

    Args:
        model: Wrapper exposing the trainable network as ``model.model`` and
            its hyper-parameters as ``model.config``.
        train_dataset: Dataset passed to the ``Trainer`` for training.
        eval_dataset: Optional dataset passed to the ``Trainer`` for
            evaluation.

    Returns:
        None. Training runs for its side effects (checkpoints on disk and
        updated weights on ``model.model``).
    """
    training_args = TrainingArguments(
        output_dir="outputs",
        num_train_epochs=model.config.num_train_epochs,
        per_device_train_batch_size=model.config.batch_size,
        learning_rate=model.config.learning_rate,
        save_steps=500,
    )
    trainer = Trainer(
        model=model.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()


# scripts/preprocess.py
from app.utils.data_preprocessing import load_language_data, preprocess_text
import os


def main():
    """Preprocess the raw corpus of each supported language.

    For each of ``yoruba``, ``igbo`` and ``hausa``, the raw texts are loaded
    from ``data/raw``, passed one by one through ``preprocess_text`` and
    written to ``data/processed/<lang>/processed_texts.txt``, one text per
    line.
    """
    languages = ["yoruba", "igbo", "hausa"]
    for lang in languages:
        data = load_language_data("data/raw", lang)
        # assumes preprocess_text returns a str for each input -- TODO confirm
        processed_data = [preprocess_text(text) for text in data]

        output_dir = f"data/processed/{lang}"
        os.makedirs(output_dir, exist_ok=True)

        with open(f"{output_dir}/processed_texts.txt", 'w', encoding='utf-8') as f:
            # writelines() adds no separators of its own, so terminate every
            # record explicitly; texts that already end with a newline are
            # left untouched to avoid introducing blank lines.
            f.writelines(
                text if text.endswith("\n") else f"{text}\n"
                for text in processed_data
            )