File size: 2,882 Bytes
06969bb
 
96284fd
06969bb
 
 
 
 
 
 
 
 
 
 
 
23b1945
c10dbbf
23b1945
 
 
06969bb
23b1945
06969bb
 
 
 
 
 
 
 
 
23b1945
06969bb
23b1945
06969bb
23b1945
06969bb
 
23b1945
06969bb
 
 
 
23b1945
06969bb
23b1945
06969bb
 
 
 
 
 
 
23b1945
06969bb
 
23b1945
 
 
 
06969bb
 
 
 
 
 
23b1945
06969bb
96284fd
 
06969bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96284fd
06969bb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import gradio as gr
from transformers import pipeline
import logging

# Enable detailed logging
logging.basicConfig(level=logging.INFO)

# Load dataset
dataset = load_dataset("mwitiderrick/swahili")

# Print dataset columns for verification
print(f"Dataset columns: {dataset['train'].column_names}")

# Select a subset of the dataset (first 50,000 rows, capped at dataset size)
subset_size = 50000  # Adjust the size as needed
subset_dataset = dataset["train"].select(range(min(subset_size, len(dataset["train"]))))
print(f"Using a subset of {len(subset_dataset)} rows for training.")

# Initialize the tokenizer and model
model_name = "gpt2"  # Use GPT-2 for text generation
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 ships without a dedicated padding token; reuse eos so batching works.
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
# NOTE(review): assigning pad_token above already sets pad_token_id; this line
# is redundant but harmless.
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Args:
        examples: Batch dict from `Dataset.map(batched=True)` with a
            'text' column of raw strings.

    Returns:
        Tokenizer encodings (input_ids, attention_mask) plus a 'labels'
        field where padding positions are set to -100 so the loss
        ignores them (Hugging Face convention).
    """
    encodings = tokenizer(
        examples['text'],  # Use 'text' column from your dataset
        truncation=True,
        padding='max_length',  # Ensure consistent length
        max_length=512
    )
    # Build labels as a copy (not an alias of input_ids) and mask padding.
    # Because pad_token == eos_token here, the attention_mask -- not the
    # token id -- is what distinguishes real eos tokens from padding.
    encodings['labels'] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(encodings['input_ids'], encodings['attention_mask'])
    ]
    return encodings

# Tokenize the dataset
try:
    tokenized_datasets = subset_dataset.map(
        preprocess_function, 
        batched=True
    )
except Exception as e:
    print(f"Error during tokenization: {e}")
    # Re-raise: if tokenization fails, `tokenized_datasets` is undefined and
    # the Trainer below would crash with a confusing NameError. Surface the
    # real failure instead of swallowing it.
    raise

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=500,  # Log every 500 steps
    # No eval_dataset is passed to the Trainer below, so evaluation must stay
    # disabled; evaluation_strategy="steps" without an eval set makes the
    # Trainer raise at construction time.
    evaluation_strategy="no",
    save_steps=10_000,  # Save checkpoint every 10,000 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
)

# Define Trainer
# NOTE(review): only a train_dataset is supplied (no eval_dataset), so this
# Trainer can train but not evaluate. Passing the tokenizer lets the Trainer
# save it alongside model checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# Start training
try:
    trainer.train()
except Exception as e:
    # Best-effort: if training fails we log and fall through, so the Gradio
    # demo below still launches with the base (or partially trained) model.
    print(f"Error during training: {e}")

# Wrap the (fine-tuned) model in a text-generation pipeline for the UI.
nlp = pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_text(prompt):
    """Generate a continuation of *prompt* (up to 50 tokens total).

    Any failure during generation is returned as an error string rather
    than raised, so the Gradio UI displays the message to the user.
    """
    try:
        outputs = nlp(prompt, max_length=50)
        first = outputs[0]
        return first['generated_text']
    except Exception as e:
        return f"Error during text generation: {e}"

# Create and launch the Gradio interface
# "text" for inputs/outputs gives a single textbox each for prompt and result.
iface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Swahili Language Model",
    description="Generate text in Swahili using a pre-trained language model."
)

# Starts a local web server; this call blocks until the server stops.
iface.launch()