# Gabriel Okiri
# Initial commit
# 4bb9d41
import torch
from transformers import Trainer, TrainingArguments
from app.model.model import NigerianLanguageModel
from app.model.config import ModelConfig
def train_model(model: NigerianLanguageModel, train_dataset, eval_dataset=None):
    """Train the wrapped network with a Hugging Face ``Trainer``.

    The epoch count, per-device train batch size and learning rate are read
    from ``model.config``. The training arguments use ``output_dir="outputs"``
    and ``save_steps=500``.

    Args:
        model: Wrapper exposing the trainable network as ``model.model`` and
            its hyperparameters as ``model.config``.
        train_dataset: Dataset forwarded to the trainer for training.
        eval_dataset: Optional dataset forwarded to the trainer. This
            function does not configure an evaluation schedule itself.

    Returns:
        None. The result of ``Trainer.train()`` is discarded.
    """
    hyperparams = model.config

    # Collect the keyword arguments first so the settings read as one table.
    argument_values = {
        "output_dir": "outputs",
        "num_train_epochs": hyperparams.num_train_epochs,
        "per_device_train_batch_size": hyperparams.batch_size,
        "learning_rate": hyperparams.learning_rate,
        "save_steps": 500,
    }
    run_arguments = TrainingArguments(**argument_values)

    runner = Trainer(
        model=model.model,
        args=run_arguments,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    runner.train()
# scripts/preprocess.py
from app.utils.data_preprocessing import load_language_data, preprocess_text
import os
def main():
    """Preprocess the raw texts of each language and write them to disk.

    For each of ``"yoruba"``, ``"igbo"`` and ``"hausa"``, the raw texts are
    loaded with ``load_language_data("data/raw", lang)``, each one is passed
    through ``preprocess_text``, and the results are written as UTF-8 to
    ``data/processed/<lang>/processed_texts.txt``. Every written text is
    terminated by a newline so consecutive texts stay separable.
    """
    languages = ["yoruba", "igbo", "hausa"]
    for lang in languages:
        data = load_language_data("data/raw", lang)
        # Process everything before touching the output file so a failure in
        # preprocessing does not leave a truncated file behind.
        processed_data = [preprocess_text(text) for text in data]

        output_dir = f"data/processed/{lang}"
        os.makedirs(output_dir, exist_ok=True)
        with open(f"{output_dir}/processed_texts.txt", 'w', encoding='utf-8') as f:
            # writelines() does not add line separators; without this, texts
            # lacking a trailing newline would be fused into one long line.
            f.writelines(
                text if text.endswith("\n") else text + "\n"
                for text in processed_data
            )