# Spaces: Sleeping
import torch | |
from transformers import Trainer, TrainingArguments | |
from app.model.model import NigerianLanguageModel | |
from app.model.config import ModelConfig | |
def train_model(model: NigerianLanguageModel, train_dataset, eval_dataset=None) -> Trainer:
    """Fine-tune the wrapped model with the Hugging Face ``Trainer``.

    Hyperparameters (epoch count, per-device batch size, learning rate) are
    read from ``model.config``; checkpoints are written to ``outputs`` every
    500 steps.

    Args:
        model: Wrapper whose ``model`` attribute is the network to train and
            whose ``config`` attribute carries ``num_train_epochs``,
            ``batch_size`` and ``learning_rate``.
        train_dataset: Dataset handed to the trainer for training.
        eval_dataset: Optional dataset handed to the trainer for evaluation.

    Returns:
        The ``Trainer`` after ``train()`` has completed, so that callers can
        run ``trainer.evaluate()`` or ``trainer.save_model()`` themselves.
    """
    training_args = TrainingArguments(
        output_dir="outputs",
        num_train_epochs=model.config.num_train_epochs,
        per_device_train_batch_size=model.config.batch_size,
        learning_rate=model.config.learning_rate,
        save_steps=500,
    )
    trainer = Trainer(
        model=model.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()
    # NOTE(review): no evaluation schedule is configured here, so eval_dataset
    # is presumably only used if the caller invokes trainer.evaluate() --
    # returning the trainer makes that possible. Confirm against the
    # TrainingArguments defaults of the installed transformers version.
    return trainer
# scripts/preprocess.py
from app.utils.data_preprocessing import load_language_data, preprocess_text | |
import os | |
def main():
    """Preprocess the raw texts of each language and write them to disk.

    For every language in the hard-coded list, the raw texts are loaded from
    ``data/raw`` with ``load_language_data``, each text is passed through
    ``preprocess_text``, and the results are written as UTF-8 to
    ``data/processed/<lang>/processed_texts.txt``, one text per line.
    """
    languages = ["yoruba", "igbo", "hausa"]
    for lang in languages:
        data = load_language_data("data/raw", lang)
        processed_data = [preprocess_text(text) for text in data]

        output_dir = f"data/processed/{lang}"
        os.makedirs(output_dir, exist_ok=True)
        with open(f"{output_dir}/processed_texts.txt", "w", encoding="utf-8") as f:
            # writelines() inserts no separators, so texts lacking a trailing
            # newline would be fused into one line. Terminate each text with
            # exactly one newline; texts that already end in one are written
            # unchanged.
            f.writelines(
                text if text.endswith("\n") else f"{text}\n"
                for text in processed_data
            )