# -------------------------------------------------------------------
# Pimcore
#
# This source file is available under two different licenses:
# - GNU General Public License version 3 (GPLv3)
# - Pimcore Commercial License (PCL)
# Full copyright and license information is available in
# LICENSE.md which is distributed with this source code.
#
# @copyright Copyright (c) Pimcore GmbH (http://www.pimcore.org)
# @license http://www.pimcore.org/license GPLv3 and PCL
# -------------------------------------------------------------------
import logging
from ..progress_callback import ProgressCallback
from ..abstract_trainer import AbstractTrainer
from ..environment_variable_checker import EnvironmentVariableChecker
from .text_classification_parameters import TextClassificationParameters
import shutil
import os
from datasets import load_dataset
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from huggingface_hub import HfFolder
import evaluate
import numpy as np
from typing import Tuple
# Module-level logger. NOTE(review): forcing DEBUG here overrides the host
# application's logging configuration for this module — confirm this is intended
# for production, not just local development.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
class TextClassificationTrainer(AbstractTrainer):
    """Fine-tunes a Hugging Face sequence-classification model.

    Reads a CSV of (value, target) rows, tokenizes it, trains a model based on
    the configured source model, and pushes the result to the Hugging Face Hub.
    Progress and abort handling are reported through the shared status object
    provided by AbstractTrainer.
    """

    def start_training(self, parameters: TextClassificationParameters) -> None:
        """Run the full training pipeline for the given parameters.

        Updates the status object as it progresses (0 → 10 → 100), honors
        user-requested aborts between phases, and always removes the
        temporary training-CSV directory when done.

        Raises:
            RuntimeError: wrapping any error raised during preparation or
                training (the original exception is chained as the cause).
        """
        logger.info('Start Training...')
        try:
            task = 'Load and prepare training data'
            self.get_status().update_status(0, task, parameters.get_project_name())
            logger.info(task)
            tokenized_dataset, labels, label2id, id2label = self.__prepare_training_data(parameters)
            if self.get_status().is_training_aborted():
                return

            task = 'Start training model'
            self.get_status().update_status(10, task)
            logger.info(task)
            self.__train_model(tokenized_dataset, labels, label2id, id2label, parameters)
            if self.get_status().is_training_aborted():
                return

            self.get_status().update_status(100, "Training completed")
        except Exception as e:
            # logger.exception keeps the full traceback, unlike logger.error(e).
            logger.exception('Training failed')
            self.get_status().finalize_abort_training(str(e))
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"An error occurred: {str(e)}") from e
        finally:
            # Cleanup after processing. ignore_errors so a failed/partial run
            # (e.g. directory already gone) does not mask the real exception.
            logger.info('Cleaning up training files after training')
            shutil.rmtree(os.path.dirname(parameters.get_training_csv_file_path()), ignore_errors=True)
            if self.get_status().is_training_aborted():
                self.get_status().finalize_abort_training("Training aborted")

    def __prepare_training_data(self, parameters: TextClassificationParameters) -> Tuple[dict, list, dict, dict]:
        """Load the training CSV and turn it into a tokenized train/test split.

        The CSV must provide a 'value' column (the text) and a 'target' column
        (the class label). Labels are mapped to numeric ids, the 'target'
        column is renamed to 'labels', and all other columns are dropped.

        Returns:
            Tuple of (tokenized DatasetDict with 'train'/'test' splits,
            list of unique label values, label->id dict, id->label dict).
        """
        dataset = load_dataset(
            'csv',
            data_files=parameters.get_training_csv_file_path(),
            delimiter=parameters.get_training_csv_limiter(),
        )
        dataset = dataset["train"]

        # Build the label <-> id mappings from the unique target values.
        labels = dataset.unique('target')
        label2id = {label: i for i, label in enumerate(labels)}
        id2label = {i: label for i, label in enumerate(labels)}
        logger.info(id2label)

        # 80/20 train/test split.
        dataset = dataset.train_test_split(test_size=0.2)
        logger.info(dataset)
        logger.info(dataset["train"][10])

        # Tokenize the 'value' column with the source model's tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(parameters.get_source_model_name())

        def preprocess_function(examples):
            return tokenizer(examples["value"], truncation=True, padding='max_length')

        tokenized_dataset = dataset.map(preprocess_function, batched=True)

        # Rename 'target' to 'labels' (expected by Trainer) and drop everything
        # that is not a model input.
        tokenized_dataset = tokenized_dataset.rename_column('target', 'labels')
        columns_to_keep = ['input_ids', 'labels', 'attention_mask']
        all_columns = tokenized_dataset["train"].column_names
        columns_to_remove = [col for col in all_columns if col not in columns_to_keep]
        tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)

        # Replace the string labels with their numeric ids.
        def map_labels(example):
            example['labels'] = label2id[example['labels']]
            return example

        tokenized_dataset = tokenized_dataset.map(map_labels)
        logger.info(tokenized_dataset)
        logger.info(tokenized_dataset["train"][10])

        return tokenized_dataset, labels, label2id, id2label

    def __train_model(self, tokenized_dataset: dict, labels: list, label2id: dict, id2label: dict, parameters: TextClassificationParameters) -> None:
        """Train the classification model and upload it to the Hugging Face Hub.

        Progress between 11% and 89% is reported via ProgressCallback; the
        upload phase is reported at 90%. Aborts requested before or right
        after training short-circuit the upload.
        """
        tokenizer = AutoTokenizer.from_pretrained(parameters.get_source_model_name())
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        environment_variable_checker = EnvironmentVariableChecker()
        HfFolder.save_token(environment_variable_checker.get_huggingface_token())

        progress_callback = ProgressCallback(self.get_status(), 11, 89)

        # Accuracy is both the eval metric and the best-model selector below.
        accuracy = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            # 'references' (not 'labels') to avoid shadowing the outer parameter.
            predictions, references = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return accuracy.compute(predictions=predictions, references=references)

        model = AutoModelForSequenceClassification.from_pretrained(
            parameters.get_source_model_name(),
            num_labels=len(labels),
            id2label=id2label,
            label2id=label2id
        )

        target_model_id = environment_variable_checker.get_huggingface_organization() + '/' + parameters.get_result_model_name()
        training_args = TrainingArguments(
            output_dir=parameters.get_result_model_name(),
            hub_model_id=target_model_id,
            learning_rate=parameters.get_training_parameters().learning_rate,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=parameters.get_training_parameters().epochs,
            weight_decay=0.01,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            # Upload is done explicitly below so an abort can skip it.
            push_to_hub=False,
            remove_unused_columns=False,
            hub_private_repo=True,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[progress_callback]
        )

        if self.get_status().is_training_aborted():
            return
        trainer.train()
        if self.get_status().is_training_aborted():
            return

        logger.info("Model trained, start uploading")
        self.get_status().update_status(90, "Uploading model to Hugging Face")
        trainer.push_to_hub()