import re
import os
import transformers
import torch
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

# Hub identifier of the pretrained checkpoint that train() fine-tunes.
# Used for both the tokenizer and the model weights.
BASE_MODEL_ID = "malteos/gpt2-uk"

# Report whether PyTorch can see a CUDA device before doing anything heavy.
print(torch.cuda.is_available())


def load_dataset(file_path: str, tokenizer, block_size: int = 128) -> TextDataset:
    """Build a ``TextDataset`` over a plain-text training file.

    Args:
        file_path: Path to the text file to train on.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as its ``block_size``.

    Returns:
        The constructed ``TextDataset``.
    """
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm: bool = False) -> DataCollatorForLanguageModeling:
    """Create the language-modeling data collator used for batching.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded to ``DataCollatorForLanguageModeling``; the default
            ``False`` disables masked-language-modeling.

    Returns:
        The configured ``DataCollatorForLanguageModeling``.
    """
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator


def train(train_file_path: str,
          model_name: str,
          output_dir: str,
          overwrite_output_dir: bool,
          per_device_train_batch_size: int,
          num_train_epochs: int,
          save_steps: int,
          resume_from_checkpoint) -> None:
    """Fine-tune the ``BASE_MODEL_ID`` checkpoint on a text file.

    The tokenizer and the untrained base weights are written to
    ``output_dir`` before training starts, and the final model is saved
    there again once training finishes.

    Args:
        train_file_path: Text file used as the training corpus.
        model_name: Currently unused. The base checkpoint is always
            ``BASE_MODEL_ID``; the parameter is kept so existing callers
            keep working.
        output_dir: Directory for the tokenizer, checkpoints and final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval.
        resume_from_checkpoint: Forwarded to ``Trainer.train``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never passed on, so the
        # caller's checkpoint interval was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    trainer.save_model()


# Run configuration.
train_directory = 'H:/Finetunning/q_and_a'
train_file_path = 'H:/Finetunning/journal.txt'
# Passed as model_name below; train() does not currently read it.
model_name = train_directory
output_dir = 'H:/Finetunning/custom_full_text'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 51
save_steps = 50000

print("Починаємо навчання...")
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    # False for the very first run; True to resume from a saved checkpoint.
    resume_from_checkpoint=True,
)