metadata
base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
datasets:
- generator
library_name: peft
license: apache-2.0
tags:
- trl
- sft
- generated_from_trainer
- african-languages
benchmark_visualization: assets/Benchmarks_(1).pdf
model-index:
- name: llama-8b-south-africa
results:
- task:
type: text-generation
name: African Language Evaluation
dataset:
name: afrimgsm_direct_xho
type: text-classification
split: test
metrics:
- name: Accuracy
type: accuracy
value: 0.02
- task:
type: text-generation
name: African Language Evaluation
dataset:
name: afrimgsm_direct_zul
type: text-classification
split: test
metrics:
- name: Accuracy
type: accuracy
value: 0.045
- task:
type: text-generation
name: African Language Evaluation
dataset:
name: afrimmlu_direct_xho
type: text-classification
split: test
metrics:
- name: Accuracy
type: accuracy
value: 0.29
- task:
type: text-generation
name: African Language Evaluation
dataset:
name: afrimmlu_direct_zul
type: text-classification
split: test
metrics:
- name: Accuracy
type: accuracy
value: 0.29
- task:
type: text-generation
name: African Language Evaluation
dataset:
name: afrixnli_en_direct_xho
type: text-classification
split: test
metrics:
- name: Accuracy
type: accuracy
value: 0.44
- task:
type: text-generation
name: African Language Evaluation
dataset:
name: afrixnli_en_direct_zul
type: text-classification
split: test
metrics:
- name: Accuracy
type: accuracy
value: 0.43
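evaluation_example:
  note: >
    A sketch of how the scores above could be reproduced with
    lm-evaluation-harness. It assumes the afrimgsm/afrimmlu/afrixnli task names
    are registered in your harness installation (they come from the
    African-language evaluation suite used here and may not ship with the
    upstream package); the adapter repo id is a placeholder.
  code: |
    import lm_eval

    results = lm_eval.simple_evaluate(
        model="hf",
        # The peft= argument layers the LoRA adapter on top of the base model;
        # "your-username/llama-8b-south-africa" is a placeholder repo id.
        model_args="pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct,peft=your-username/llama-8b-south-africa",
        tasks=[
            "afrimgsm_direct_xho", "afrimgsm_direct_zul",
            "afrimmlu_direct_xho", "afrimmlu_direct_zul",
            "afrixnli_en_direct_xho", "afrixnli_en_direct_zul",
        ],
        batch_size=8,
    )
    print(results["results"])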
model_description: >
This model is a fine-tuned version of
[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
on the generator dataset:
[Alpaca Cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)
translated into Xhosa, Zulu, Tswana, Northern Sotho and Afrikaans using
machine translation.
The model could only be evaluated in Xhosa and Zulu, as these are the only
target languages currently available in Iroko. Its aim is to show that
cross-lingual transfer can be achieved at low cost: translation cost roughly
$370 per language, and training cost roughly $15 on an Akash Compute Network
GPU.
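usage_example:
  note: >
    A minimal inference sketch, not taken from the original card: it assumes
    the published weights are a peft LoRA adapter loaded on top of the base
    model, and the adapter repo id below is a placeholder.
  code: |
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    base_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    adapter_id = "your-username/llama-8b-south-africa"  # placeholder adapter repo id

    tokenizer = AutoTokenizer.from_pretrained(base_id)
    base = AutoModelForCausalLM.from_pretrained(
        base_id, torch_dtype=torch.bfloat16, device_map="auto"
    )
    # Attach the fine-tuned LoRA adapter to the frozen base model.
    model = PeftModel.from_pretrained(base, adapter_id)

    # Illustrative prompt; the adapter targets Xhosa, Zulu, Tswana,
    # Northern Sotho and Afrikaans instructions.
    messages = [{"role": "user", "content": "Translate to isiZulu: Good morning, how are you?"}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))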
training_details:
loss: 1.0571
hyperparameters:
learning_rate: 0.0002
train_batch_size: 4
eval_batch_size: 8
seed: 42
distributed_type: multi-GPU
gradient_accumulation_steps: 2
total_train_batch_size: 8
optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
lr_scheduler_type: cosine
lr_scheduler_warmup_ratio: 0.1
num_epochs: 1
training_results:
final_loss: 1.0959
epochs: 0.9999
steps: 5596
validation_loss: 1.0571
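training_example:
  note: >
    A sketch of the fine-tuning setup using trl's SFTTrainer with the
    hyperparameters listed above. The LoRA settings, dataset file and text
    column are assumptions (the card does not state them), and exact
    SFTTrainer/SFTConfig arguments vary between trl versions.
  code: |
    from datasets import load_dataset
    from peft import LoraConfig
    from trl import SFTConfig, SFTTrainer

    # Assumed local file holding the machine-translated Alpaca Cleaned data,
    # rendered to a single "text" column in the Llama 3.1 chat format.
    dataset = load_dataset("json", data_files="alpaca_translated.jsonl", split="train")

    # LoRA settings are illustrative; the card does not specify them.
    peft_config = LoraConfig(
        task_type="CAUSAL_LM", r=16, lora_alpha=32, lora_dropout=0.05
    )

    args = SFTConfig(
        output_dir="llama-8b-south-africa",
        learning_rate=2e-4,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        seed=42,
        bf16=True,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        dataset_text_field="text",
    )

    trainer = SFTTrainer(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        args=args,
        train_dataset=dataset,
        peft_config=peft_config,
    )
    trainer.train()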
framework_versions:
peft: 0.12.0
transformers: 4.44.2
pytorch: 2.4.1+cu121
datasets: 3.0.0
tokenizers: 0.19.1