Azerbaijani / train.py
BusinessDev's picture
Update train.py
e0cc050 verified
raw
history blame
2.27 kB
from transformers import BertModel, BertTokenizer, TrainingArguments, Trainer
from datasets import Dataset
# Prepare the dataset (simplified)
def prepare_text_dataset(data, chunk_size, tokenizer=None):
    """Split raw text into fixed-size character chunks and tokenize them.

    Args:
        data: Text to split; it is sliced by character position.
        chunk_size: Number of characters per chunk. Must be positive. The
            last chunk may be shorter.
        tokenizer: Callable applied to each batch of chunk strings. When
            omitted, the module-level name ``tokenizer`` is used, so existing
            two-argument callers keep working.

    Returns:
        A ``Dataset`` holding the ``"text"`` column plus whatever columns the
        tokenizer returns, with its output format set to ``"torch"``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        NameError: If no tokenizer is passed and none exists at module scope.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Resolve the tokenizer up front so a missing one fails before any work.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError(
                "name 'tokenizer' is not defined: pass tokenizer=... or define "
                "a module-level tokenizer before calling prepare_text_dataset"
            )

    # Split the text into smaller chunks (consider logical divisions of the
    # Constitution); fixed character windows may cut through words.
    chunks = [data[start:start + chunk_size] for start in range(0, len(data), chunk_size)]

    # Create the dataset from one {"text": chunk} record per chunk
    formatted_dataset = Dataset.from_list([{"text": chunk} for chunk in chunks])

    def tokenize_batch(batch):
        # batched=True hands over a dict of lists, so batch["text"] is a list
        # of chunk strings; every example is truncated/padded to max length.
        return tokenizer(batch["text"], truncation=True, padding="max_length")

    # Tokenize the text with the BERT tokenizer
    formatted_dataset = formatted_dataset.map(tokenize_batch, batched=True)

    # Set the format of the dataset to "torch" for compatibility with the model
    formatted_dataset.set_format("torch")

    # Print a message indicating preparation completion (optional)
    print('Prep done')
    return formatted_dataset
def init():
    """Fine-tune AzerBert on ``constitution.txt`` with a masked-LM objective.

    Loads the tokenizer and model, chunks and tokenizes the text file, trains
    with ``Trainer``, then writes the model and tokenizer to
    ``./fine-tuned_model``. Checkpoints go to ``./results``.

    Raises:
        OSError: If ``constitution.txt`` cannot be opened.
    """
    # Imported here so the file-level import line does not need to change.
    from transformers import BertForMaskedLM, DataCollatorForLanguageModeling

    # prepare_text_dataset is called with (data, chunk_size) only and looks up
    # ``tokenizer`` at module scope, so the tokenizer must be a module global.
    global tokenizer

    # Load the model and tokenizer
    model_name = "language-ml-lab/AzerBert"  # Replace with your model name if different
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # The bare BertModel encoder returns no loss, which Trainer needs in order
    # to train; the masked-LM head variant computes one when given labels.
    model = BertForMaskedLM.from_pretrained(model_name)
    chunk_size = 512

    # Load the plain text (replace with your actual loading logic)
    with open("constitution.txt", "r", encoding="utf-8") as f:
        constitution_text = f.read()

    # Prepare the dataset
    train_dataset = prepare_text_dataset(constitution_text, chunk_size)

    # The dataset has no labels; this collator masks tokens at random in each
    # batch and builds the matching ``labels`` tensor.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",  # Adjust output directory
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjust training epochs
        per_device_train_batch_size=1,  # Adjust batch size based on your GPU memory
        save_steps=500,
        save_total_limit=2,
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("./fine-tuned_model")
    tokenizer.save_pretrained("./fine-tuned_model")