# This file is meant to be run once, hence sequential code rather than functions.
import pandas as pd
from transformers import AutoTokenizer, set_seed
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

from z_utils import get_dataframe

# Constants
INP_DATASET_CSV = "clean_books_summary.csv"
BASE_CAUSAL_MODEL = "openai-community/gpt2"
# TRAINED_MODEL_OUTPUT_DIR = "gpt2-book-summary-generator"  # same name used for the HF Hub repo
TRAINED_MODEL_OUTPUT_DIR = "content"  # same name used for the HF Hub repo
set_seed(42)
EPOCHS = 1
LR = 2e-5

# Load dataset
books: pd.DataFrame = get_dataframe(INP_DATASET_CSV)

# Create an HF Dataset; easier to perform preprocessing at scale
dataset_books = Dataset.from_pandas(books, split="train")

# Load tokenizer; GPT-2 has no pad token, so reuse EOS for padding
tokenizer = AutoTokenizer.from_pretrained(BASE_CAUSAL_MODEL)
tokenizer.pad_token = tokenizer.eos_token


# Data preprocessing
def preprocess_function(book):
    """Convert one dataset row into prompt form."""
    # The f-string is multiline: keep these lines unindented, otherwise the
    # editor's indentation ends up inside the string.
    text = f'''Genre: {book['categories']}
Book Title: {book['book_name']}
Description: {book['summaries']}
'''
    # Truncate so long summaries don't exceed GPT-2's context window
    return tokenizer(text, truncation=True)


# Apply preprocessing (not batched: preprocess_function handles one example at a time)
tokenized_dataset_books = dataset_books.map(
    preprocess_function,
    num_proc=4,
    remove_columns=dataset_books.column_names,
)

# Data collator, required for causal LM (mlm=False gives next-token-prediction labels)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Load causal LM
model = AutoModelForCausalLM.from_pretrained(BASE_CAUSAL_MODEL)

training_args = TrainingArguments(
    output_dir=TRAINED_MODEL_OUTPUT_DIR,
    eval_strategy="no",
    learning_rate=LR,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=EPOCHS,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_books,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Commit model files to the HF Hub
trainer.push_to_hub()
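
# A minimal inference sketch, assuming training above succeeded: sanity-check the
# fine-tuned model by generating from the same "Genre / Book Title / Description"
# template used in preprocessing. The genre and title below are hypothetical
# examples, not entries from the dataset.
from transformers import pipeline

generator = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer)
prompt = "Genre: Fiction\nBook Title: The Silent Sea\nDescription:"
# Sample a continuation; the model should produce a summary-like description
print(generator(prompt, max_new_tokens=100, do_sample=True)[0]["generated_text"])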