from transformers import (
    BertForMaskedLM,
    BertTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

# Prepare the dataset (simplified)
def prepare_text_dataset(data, chunk_size, tokenizer):
    # Split the text into smaller chunks (consider logical divisions of the Constitution)
    chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
    # Convert chunks to dictionaries with a single feature "text"
    formatted_data = [{"text": chunk} for chunk in chunks]
    # Create the dataset from the list of dictionaries
    formatted_dataset = Dataset.from_list(formatted_data)
    # Tokenize the text using the BERT tokenizer (passed in as a parameter;
    # in the original it was referenced here but only defined inside init(),
    # which raises a NameError)
    formatted_dataset = formatted_dataset.map(
        lambda x: tokenizer(x["text"], truncation=True, padding="max_length"),
        batched=True,
    )
    # Set the format of the dataset to "torch" for compatibility with the model
    formatted_dataset.set_format("torch")
    # Print a message indicating preparation completion (optional)
    print("Prep done")
    return formatted_dataset
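
# Quick sanity check (a hypothetical snippet, not part of the original script):
# each row should carry input_ids / attention_mask tensors padded to the
# tokenizer's max length, with one row per chunk of the input text.
#   tok = BertTokenizer.from_pretrained("language-ml-lab/AzerBert")
#   ds = prepare_text_dataset("Sample text. " * 500, 512, tok)
#   print(len(ds), ds[0]["input_ids"].shape)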

def init():
    # Load the model and tokenizer
    model_name = "language-ml-lab/AzerBert"  # Replace with your model name if different
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Use the masked-LM head: the bare BertModel returns no loss, so Trainer
    # fails with a runtime error as soon as training starts
    model = BertForMaskedLM.from_pretrained(model_name)
    chunk_size = 512

    # Load the plain text (replace with your actual loading logic)
    with open("constitution.txt", "r", encoding="utf-8") as f:
        constitution_text = f.read()

    # Prepare the dataset (pass the tokenizer explicitly; it is not in scope inside the helper)
    train_dataset = prepare_text_dataset(constitution_text, chunk_size, tokenizer)

    # Mask random tokens so the model has labels to compute a loss against
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",  # Adjust output directory
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjust training epochs
        per_device_train_batch_size=1,  # Adjust batch size based on your GPU memory
        save_steps=500,
        save_total_limit=2,
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("./fine-tuned_model")
    tokenizer.save_pretrained("./fine-tuned_model")


if __name__ == "__main__":
    init()
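
Once training finishes, the checkpoint in ./fine-tuned_model can be loaded for masked-token prediction. A minimal sketch, assuming the save path above and the BertForMaskedLM head; the example sentence is a placeholder, not from the original:

from transformers import pipeline

# Load the fine-tuned checkpoint saved by init()
fill_mask = pipeline(
    "fill-mask",
    model="./fine-tuned_model",
    tokenizer="./fine-tuned_model",
)

# BERT tokenizers use [MASK] as the mask token
for prediction in fill_mask("The constitution guarantees [MASK] rights."):
    print(prediction["token_str"], prediction["score"])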