Spaces:

pimcore
/

fine-tuning-service

Sleeping

App Files Files Community

fashxp commited on Jul 8, 2024

Commit

264e02e

1 Parent(s): 7c4332a

cleanup and text classification

Browse files

Files changed (10) hide show

README.md +34 -2
src/image_classification/image_classification_parameters.py +12 -6
src/image_classification/image_classification_trainer.py +4 -4
src/main.py +124 -53
src/progress_callback.py +7 -5
src/task_manager.py +0 -72
src/text_classification/text_classification_parameters.py +63 -0
src/text_classification/text_classification_trainer.py +177 -0
src/training_manager.py +12 -11
src/training_status.py +8 -1

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Fine Tuning Service
 emoji: 🦀
 colorFrom: green
 colorTo: yellow
@@ -8,5 +8,37 @@ pinned: false
 license: other
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Fine-Tuning Service
 emoji: 🦀
 colorFrom: green
 colorTo: yellow
 license: other
 ---
+# Pimcore Fine-Tuning Service
+This app provides endpoints to showcase fine-tuning of models for image and text classification tasks.
+It is possible to execute one training at a time and to get status information via the `/get_training_status` endpoint.
+The currently running training can be stopped via the `/stop_training` endpoint. After the training, the fine-tuned model is uploaded to the Hugging Face Hub.
+## Necessary Environment Variables
+- `AUTHENTICATION_TOKEN`: Secret that is necessary to authorize calling the app's endpoints.
+- `HUGGINGFACE_TOKEN`: Huggingface token to be used for accessing the huggingface hub and uploading the models.
+- `HUGGINGFACE_ORGANIZATION`: Organization to be used for uploading the fine-tuned models.
+For further details on the parameters of the endpoints, see: https://your-domain/docs
+## Image Classification
+Use `/training/image_classification` for fine tuning a model for image classification tasks.
+#### Important Parameters
+- `training_data_zip`: The ZIP file containing the training data, with a folder per class which contains images belonging to that class.
+- `project_name`: The name of the project. Will also be used as name of resulting model that will be created after fine tuning and as the name of the repository at huggingface.
+- `source_model_name`: The source model to be used as basis for fine tuning.
+## Text Classification
+Use `/training/text_classification` for fine tuning a model for text classification tasks.
+#### Important Parameters
+- `training_data_csv`: The CSV file containing the training data, with the required columns `value` (text data) and `target` (classification).
+- `project_name`: The name of the project. Will also be used as name of resulting model that will be created after fine tuning and as the name of the repository at huggingface.
+- `source_model_name`: The source model to be used as basis for fine tuning.

src/image_classification/image_classification_parameters.py CHANGED Viewed

@@ -4,14 +4,16 @@ from fastapi import Form
 class ImageClassificationTrainingParameters(BaseModel):
     epochs: int
     learning_rate: float
 def map_image_classification_training_parameters(
-    epocs: Annotated[int, Form(...)] = 3,
-    learning_rate: Annotated[float, Form(...)] = 5e-5
 ) -> ImageClassificationTrainingParameters:
     return ImageClassificationTrainingParameters(
         epochs=epocs,
         learning_rate=learning_rate
@@ -19,23 +21,24 @@ def map_image_classification_training_parameters(
 class ImageClassificationParameters:
     __training_files_path: str
     __training_zip_file_path: str
-    __result_model_name: str
     __source_model_name: str
     __training_parameters: ImageClassificationTrainingParameters
     def __init__(self,
                  training_files_path: str,
                  training_zip_file_path: str,
-                 result_model_name: str,
                  source_model_name: str,
                  training_parameters: ImageClassificationTrainingParameters
                  ):
         self.__training_files_path = training_files_path
         self.__training_zip_file_path = training_zip_file_path
-        self.__result_model_name = result_model_name
         self.__source_model_name = source_model_name
         self.__training_parameters = training_parameters
@@ -46,7 +49,10 @@ class ImageClassificationParameters:
         return self.__training_zip_file_path
     def get_result_model_name(self) -> str:
-        return self.__result_model_name
     def get_source_model_name(self) -> str:
         return self.__source_model_name

 class ImageClassificationTrainingParameters(BaseModel):
+    """ Provides specific training parameters for the image classification fine tuning."""
     epochs: int
     learning_rate: float
 def map_image_classification_training_parameters(
+    epocs: Annotated[int, Form(description="Epochs executed during training.")] = 3,
+    learning_rate: Annotated[float, Form(description="Learning rate for training.")] = 5e-5
 ) -> ImageClassificationTrainingParameters:
+    """ Maps the parameters to the ImageClassificationTrainingParameters class. """
     return ImageClassificationTrainingParameters(
         epochs=epocs,
         learning_rate=learning_rate
 class ImageClassificationParameters:
+    """ Provides all parameters for the image classification fine tuning. """
     __training_files_path: str
     __training_zip_file_path: str
+    __project_name: str
     __source_model_name: str
     __training_parameters: ImageClassificationTrainingParameters
     def __init__(self,
                  training_files_path: str,
                  training_zip_file_path: str,
+                 project_name: str,
                  source_model_name: str,
                  training_parameters: ImageClassificationTrainingParameters
                  ):
         self.__training_files_path = training_files_path
         self.__training_zip_file_path = training_zip_file_path
+        self.__project_name = project_name
         self.__source_model_name = source_model_name
         self.__training_parameters = training_parameters
         return self.__training_zip_file_path
     def get_result_model_name(self) -> str:
+        return self.__project_name
+    def get_project_name(self) -> str:
+        return self.__project_name
     def get_source_model_name(self) -> str:
         return self.__source_model_name

src/image_classification/image_classification_trainer.py CHANGED Viewed

@@ -29,7 +29,7 @@ class ImageClassificationTrainer(AbstractTrainer):
         try:
             task = 'Extract training data'
-            self.get_status().update_status(0, task)
             logger.info(task)
             self.__extract_training_data(parameters)
@@ -63,7 +63,7 @@ class ImageClassificationTrainer(AbstractTrainer):
         finally:
             # Cleanup after processing
             logger.info('Cleaning up training files after training')
-            shutil.rmtree(parameters.get_training_files_path())
             if(self.get_status().is_training_aborted()):
                 self.get_status().finalize_abort_training("Training aborted")
@@ -94,7 +94,7 @@ class ImageClassificationTrainer(AbstractTrainer):
         images = images.train_test_split(test_size=0.2)
         logger.info(images)
-        logger.info(images["train"][100])
         # Preprocess the images
@@ -126,7 +126,7 @@ class ImageClassificationTrainer(AbstractTrainer):
         image_processor = AutoImageProcessor.from_pretrained(parameters.get_source_model_name())
         data_collator = DefaultDataCollator()
-        progressCallback = ProgressCallback(self.get_status())
         # Evaluate and metrics
         accuracy = evaluate.load("accuracy")

         try:
             task = 'Extract training data'
+            self.get_status().update_status(0, task, parameters.get_project_name())
             logger.info(task)
             self.__extract_training_data(parameters)
         finally:
             # Cleanup after processing
             logger.info('Cleaning up training files after training')
+            shutil.rmtree(parameters.get_training_files_path())
             if(self.get_status().is_training_aborted()):
                 self.get_status().finalize_abort_training("Training aborted")
         images = images.train_test_split(test_size=0.2)
         logger.info(images)
+        logger.info(images["train"][10])
         # Preprocess the images
         image_processor = AutoImageProcessor.from_pretrained(parameters.get_source_model_name())
         data_collator = DefaultDataCollator()
+        progressCallback = ProgressCallback(self.get_status(), 21, 89)
         # Evaluate and metrics
         accuracy = evaluate.load("accuracy")

src/main.py CHANGED Viewed

@@ -1,30 +1,26 @@
 import os
-import requests
 import torch
 from .training_status import Status
 from .environment_variable_checker import EnvironmentVariableChecker
-from .task_manager import TaskManager
 from .training_manager import TrainingManager
 from .image_classification.image_classification_trainer import ImageClassificationTrainer
 from .image_classification.image_classification_parameters import ImageClassificationParameters, map_image_classification_training_parameters, ImageClassificationTrainingParameters
-from fastapi import FastAPI, Header, Depends, HTTPException, BackgroundTasks, UploadFile, Form, File, status
-from fastapi.responses import FileResponse
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel
-from typing import Optional, Annotated
 import logging
-import sys
-import zipfile
 import os
 from pathlib import Path
 import tempfile
-import shutil
 app = FastAPI()
@@ -36,10 +32,22 @@ logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s')
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
-classification_trainer: TrainingManager = TrainingManager(ImageClassificationTrainer())
 security = HTTPBearer()
 def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     token = environmentVariableChecker.get_authentication_token()
@@ -52,32 +60,73 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     return {"token": credentials.credentials}
-class ResponseModel(BaseModel):
-    message: str
-    success: bool = True
 @app.post(
-    "/upload",
-    summary="Upload a zip file containing training data",
     response_model=ResponseModel
 )
-async def upload_file(
     training_params: Annotated[ImageClassificationTrainingParameters, Depends(map_image_classification_training_parameters)],
-    data_files_training: Annotated[UploadFile, File(...)],
     token_data: dict = Depends(verify_token),
-    result_model_name: str = Form(...),
-    source_model_name: str = Form('google/vit-base-patch16-224-in21k'),
 ):
     # check if training is running, if so then exit
     status = classification_trainer.get_task_status()
     if status.get_status() == Status.IN_PROGRESS or status.get_status() == Status.CANCELLING:
-        raise HTTPException(status_code=405, detail="Training is already in progress")
     # Ensure the uploaded file is a ZIP file
-    if not data_files_training.filename.endswith(".zip"):
-        raise HTTPException(status_code=422, detail="Uploaded file is not a zip file")
     try:
         # Create a temporary directory to extract the contents
@@ -85,7 +134,7 @@ async def upload_file(
         path = Path(tmp_path)
         path.mkdir(parents=True, exist_ok=True)
-        contents = await data_files_training.read()
         zip_path = os.path.join(tmp_path, 'image_classification_data.zip')
         with open(zip_path, 'wb') as temp_file:
             temp_file.write(contents)
@@ -94,52 +143,74 @@ async def upload_file(
         parameters = ImageClassificationParameters(
             training_files_path=tmp_path,
             training_zip_file_path=zip_path,
-            result_model_name=result_model_name,
             source_model_name=source_model_name,
             training_parameters=training_params
         )
         # start training
-        await classification_trainer.start_training(parameters)
-        # TODO add more return parameters and information
-        return ResponseModel(message="training started")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
-@app.get("/get_task_status")
-async def get_task_status(token_data: dict = Depends(verify_token)):
-    status = classification_trainer.get_task_status()
-    return {
-        "progress": status.get_progress(),
-        "task": status.get_task(),
-        "status": status.get_status().value
-    }
-@app.get("/stop_task")
-async def stop_task(token_data: dict = Depends(verify_token)):
-    try:
-        classification_trainer.stop_task()
-        return {
-            "success": True
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
-@app.get("/gpu_check")
-async def gpu_check():
-    gpu = 'GPU not available'
-    if torch.cuda.is_available():
-        gpu = 'GPU is available'
-        print("GPU is available")
-    else:
-        print("GPU is not available")
-    return {'success': True, 'response': 'hello world 3', 'gpu': gpu}

 import os
 import torch
 from .training_status import Status
 from .environment_variable_checker import EnvironmentVariableChecker
 from .training_manager import TrainingManager
 from .image_classification.image_classification_trainer import ImageClassificationTrainer
 from .image_classification.image_classification_parameters import ImageClassificationParameters, map_image_classification_training_parameters, ImageClassificationTrainingParameters
+from .text_classification.text_classification_trainer import TextClassificationTrainer
+from .text_classification.text_classification_parameters import TextClassificationParameters, map_text_classification_training_parameters, TextClassificationTrainingParameters
+from fastapi import FastAPI, Depends, HTTPException, UploadFile, Form, File, status
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel
+from typing import Annotated
 import logging
 import os
 from pathlib import Path
 import tempfile
 app = FastAPI()
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
+classification_trainer: TrainingManager = TrainingManager()
+class ResponseModel(BaseModel):
+    """ Default pesponse model for endpoints. """
+    message: str
+    success: bool = True
+# ===========================================
+# Security Check
+# ===========================================
 security = HTTPBearer()
 def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """Verify the token provided by the user."""
     token = environmentVariableChecker.get_authentication_token()
     return {"token": credentials.credentials}
+# ===========================================
+# Training Status Endpoints
+# ===========================================
+@app.get("/get_training_status")
+async def get_task_status(token_data: dict = Depends(verify_token)):
+    """ Get the status of the currently running training (if any). """
+    status = classification_trainer.get_task_status()
+    return {
+        "project": status.get_project_name(),
+        "progress": status.get_progress(),
+        "task": status.get_task(),
+        "status": status.get_status().value
+    }
+@app.get("/stop_training")
+async def stop_task(token_data: dict = Depends(verify_token)):
+    """ Stop the currently running training (if any). """
+    try:
+        status = classification_trainer.get_task_status()
+        classification_trainer.stop_task()
+        return ResponseModel(message=f"Training stopped for `{ status.get_project_name() }`")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+@app.get("/gpu_check")
+async def gpu_check():
+    """ Check if a GPU is available """
+    gpu = 'GPU not available'
+    if torch.cuda.is_available():
+        gpu = 'GPU is available'
+        print("GPU is available")
+    else:
+        print("GPU is not available")
+    return {'success': True, 'gpu': gpu}
+# ===========================================
+# Fine-Tuning Image Classification
+# ===========================================
 @app.post(
+    "/training/image_classification",
     response_model=ResponseModel
 )
+async def image_classification(
     training_params: Annotated[ImageClassificationTrainingParameters, Depends(map_image_classification_training_parameters)],
+    training_data_zip: Annotated[UploadFile, File(description="The ZIP file containing the training data, with a folder per class which contains images belonging to that class.")],
     token_data: dict = Depends(verify_token),
+    project_name: str = Form(description="The name of the project. Will also be used as name of resulting model that will be created after fine tuning and as the name of the repository at huggingface."),
+    source_model_name: str = Form('google/vit-base-patch16-224-in21k', description="The source model to be used as basis for fine tuning."),
 ):
+    """
+    Start fine tuning an image classification model with the provided data.
+    """
     # check if training is running, if so then exit
     status = classification_trainer.get_task_status()
     if status.get_status() == Status.IN_PROGRESS or status.get_status() == Status.CANCELLING:
+        raise HTTPException(status_code=405, detail="Training is already in progress.")
     # Ensure the uploaded file is a ZIP file
+    if not training_data_zip.filename.endswith(".zip"):
+        raise HTTPException(status_code=422, detail="Uploaded file is not a zip file.")
     try:
         # Create a temporary directory to extract the contents
         path = Path(tmp_path)
         path.mkdir(parents=True, exist_ok=True)
+        contents = await training_data_zip.read()
         zip_path = os.path.join(tmp_path, 'image_classification_data.zip')
         with open(zip_path, 'wb') as temp_file:
             temp_file.write(contents)
         parameters = ImageClassificationParameters(
             training_files_path=tmp_path,
             training_zip_file_path=zip_path,
+            project_name=project_name,
             source_model_name=source_model_name,
             training_parameters=training_params
         )
         # start training
+        await classification_trainer.start_training(ImageClassificationTrainer(), parameters)
+        return ResponseModel(message="Training started.")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+# ===========================================
+# Fine-Tuning Text Classification
+# ===========================================
+@app.post(
+    "/training/text_classification",
+    response_model=ResponseModel
+)
+async def text_classificaiton(
+    training_params: Annotated[TextClassificationTrainingParameters, Depends(map_text_classification_training_parameters)],
+    training_data_csv: Annotated[UploadFile, File(description="The CSV file containing the training data, necessary columns `value` (text data) and `target` (classification).")],
+    token_data: dict = Depends(verify_token),
+    project_name: str = Form(description="The name of the project. Will also be used as name of resulting model that will be created after fine tuning and as the name of the repository at huggingface."),
+    training_csv_limiter: str = Form(';', description="The delimiter used in the CSV file."),
+    source_model_name: str = Form('distilbert/distilbert-base-uncased'),
+):
+    """Start fine tuning an text classification model with the provided data."""
+    # check if training is running, if so then exit
+    status = classification_trainer.get_task_status()
+    if status.get_status() == Status.IN_PROGRESS or status.get_status() == Status.CANCELLING:
+        raise HTTPException(status_code=405, detail="Training is already in progress")
+    # Ensure the uploaded file is a CSV file
+    if not training_data_csv.filename.endswith(".csv"):
+        raise HTTPException(status_code=422, detail="Uploaded file is not a csv file.")
+    try:
+        # Create a temporary directory to extract the contents
+        tmp_path = os.path.join(tempfile.gettempdir(), 'training_data')
+        path = Path(tmp_path)
+        path.mkdir(parents=True, exist_ok=True)
+        contents = await training_data_csv.read()
+        csv_path = os.path.join(tmp_path, 'data.csv')
+        with open(csv_path, 'wb') as temp_file:
+            temp_file.write(contents)
+        # prepare parameters
+        parameters = TextClassificationParameters(
+            training_csv_file_path=csv_path,
+            training_csv_limiter=training_csv_limiter,
+            project_name=project_name,
+            source_model_name=source_model_name,
+            training_parameters=training_params
+        )
+        # start training
+        await classification_trainer.start_training(TextClassificationTrainer(), parameters)
+        return ResponseModel(message="Training started.")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")

src/progress_callback.py CHANGED Viewed

@@ -10,9 +10,13 @@ logger.setLevel(logging.DEBUG)
 class ProgressCallback(TrainerCallback):
     __trainingStatus: TrainingStatus = None
-    def __init__(self, trainingStatus: TrainingStatus):
         self.__trainingStatus = trainingStatus
     def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
         logger.info(f"Completed step {state.global_step} of {state.max_steps}")
@@ -22,10 +26,8 @@ class ProgressCallback(TrainerCallback):
             logger.info("Training aborted")
             return
-        startPercentage = 21
-        endPercentage = 89
-        scope = endPercentage - startPercentage
-        progress = startPercentage + (state.global_step / state.max_steps) * scope
         self.__trainingStatus.update_status(progress, f"Training model, completed step {state.global_step} of {state.max_steps}")

 class ProgressCallback(TrainerCallback):
     __trainingStatus: TrainingStatus = None
+    __startPercentage: int = None
+    __endPercentage: int = None
+    def __init__(self, trainingStatus: TrainingStatus, startPercentage: int, endPercentage: int):
         self.__trainingStatus = trainingStatus
+        self.__startPercentage = startPercentage
+        self.__endPercentage = endPercentage
     def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
         logger.info(f"Completed step {state.global_step} of {state.max_steps}")
             logger.info("Training aborted")
             return
+        scope = self.__endPercentage - self.__startPercentage
+        progress = round(self.__startPercentage + (state.global_step / state.max_steps) * scope, 2)
         self.__trainingStatus.update_status(progress, f"Training model, completed step {state.global_step} of {state.max_steps}")

src/task_manager.py DELETED Viewed

@@ -1,72 +0,0 @@
-import asyncio
-import logging
-from fastapi import BackgroundTasks, HTTPException
-from concurrent.futures import ThreadPoolExecutor
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-class Worker:
-    def doing_work(self, task_manager):
-        task_manager.task_status["status"] = "Running"
-        for i in range(1, 101):
-            if task_manager.task_status["status"] == "Stopped":
-                break
-            asyncio.sleep(1)  # Simulate a time-consuming task
-            task_manager.task_status["progress"] = i
-            logger.info('process ' + str(i) + '%' + ' done')
-        if task_manager.task_status["status"] != "Stopped":
-            task_manager.task_status["status"] = "Completed"
-class TaskManager:
-    task_status = {"progress": 0, "status": "Not started"}
-    task = None
-    #def __init__(self):
-    worker = Worker()
-    async def doing_work(self):
-        loop = asyncio.get_running_loop()
-        with ThreadPoolExecutor() as pool:
-            await loop.run_in_executor(pool, self.worker.doing_work, self)
-            #self.worker.doing_work(self)
-        # self.task_status["status"] = "Running"
-        # for i in range(1, 101):
-        #     if self.task_status["status"] == "Stopped":
-        #         break
-        #     await asyncio.sleep(1)  # Simulate a time-consuming task
-        #     self.task_status["progress"] = i
-        #     logger.info('process ' + str(i) + '%' + ' done')
-        # if self.task_status["status"] != "Stopped":
-        #     self.task_status["status"] = "Completed"
-    async def start_task(self):
-        if self.task is None or self.task.done():
-            self.task_status["progress"] = 0
-            self.task_status["status"] = "Not started"
-            self.task = asyncio.create_task(self.doing_work())
-            return {"message": "Task started"}
-        else:
-            raise HTTPException(status_code=409, detail="Task already running")
-    async def get_task_status(self):
-        return self.task_status
-    async def stop_task(self):
-        if self.task is not None and not self.task.done():
-            self.task_status["status"] = "Stopped"
-            self.task.cancel()
-            return {"message": "Task stopped"}
-        else:
-            raise HTTPException(status_code=409, detail="No task running")

src/text_classification/text_classification_parameters.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from pydantic import BaseModel
+from typing import Annotated
+from fastapi import Form
+class TextClassificationTrainingParameters(BaseModel):
+    """ Provides specific training parameters for the text classification fine tuning."""
+    epochs: int
+    learning_rate: float
+def map_text_classification_training_parameters(
+    epocs: Annotated[int, Form(description="Epochs executed during training.")] = 3,
+    learning_rate: Annotated[float, Form(description="Learning rate for training.")] = 5e-5
+) -> TextClassificationTrainingParameters:
+    """ Maps the parameters to the TextClassificationTrainingParameters class. """
+    return TextClassificationTrainingParameters(
+        epochs=epocs,
+        learning_rate=learning_rate
+    )
+class TextClassificationParameters:
+    """ Provides all parameters for the text classification fine tuning. """
+    __training_csv_file_path: str
+    __training_csv_limiter: str
+    __project_name: str
+    __source_model_name: str
+    __training_parameters: TextClassificationTrainingParameters
+    def __init__(self,
+                 training_csv_file_path: str,
+                 project_name: str,
+                 source_model_name: str,
+                 training_parameters: TextClassificationTrainingParameters,
+                 training_csv_limiter: str = ';'
+                 ):
+        self.__training_csv_file_path = training_csv_file_path
+        self.__project_name = project_name
+        self.__source_model_name = source_model_name
+        self.__training_parameters = training_parameters
+        self.__training_csv_limiter = training_csv_limiter
+    def get_training_csv_file_path(self) -> str:
+        return self.__training_csv_file_path
+    def get_training_csv_limiter(self) -> str:
+        return self.__training_csv_limiter
+    def get_project_name(self) -> str:
+        return self.__project_name
+    def get_result_model_name(self) -> str:
+        return self.__project_name
+    def get_source_model_name(self) -> str:
+        return self.__source_model_name
+    def get_training_parameters(self) -> TextClassificationTrainingParameters:
+        return self.__training_parameters

src/text_classification/text_classification_trainer.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import logging
+from ..progress_callback import ProgressCallback
+from ..abstract_trainer import AbstractTrainer
+from ..environment_variable_checker import EnvironmentVariableChecker
+from .text_classification_parameters import TextClassificationParameters
+import shutil
+import os
+from datasets import load_dataset
+from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+from huggingface_hub import HfFolder
+import evaluate
+import numpy as np
+from typing import Tuple
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+class TextClassificationTrainer(AbstractTrainer):
+    def start_training(self, parameters: TextClassificationParameters):
+        logger.info('Start Training...')
+        try:
+            task = 'Load and prepare training data'
+            self.get_status().update_status(0, task, parameters.get_project_name())
+            logger.info(task)
+            tokenized_dataset, labels, label2id, id2label = self.__prepare_training_data(parameters)
+            if(self.get_status().is_training_aborted()):
+                return
+            task = 'Start training model'
+            self.get_status().update_status(10, task)
+            logger.info(task)
+            self.__train_model(tokenized_dataset, labels, label2id, id2label, parameters)
+            self.get_status().update_status(100, "Training completed")
+        except Exception as e:
+            logger.error(e)
+            self.get_status().finalize_abort_training(str(e))
+            raise RuntimeError(f"An error occurred: {str(e)}")
+        finally:
+            # Cleanup after processing
+            logger.info('Cleaning up training files after training')
+            shutil.rmtree(os.path.dirname(parameters.get_training_csv_file_path()))
+            if(self.get_status().is_training_aborted()):
+                self.get_status().finalize_abort_training("Training aborted")
+    def __prepare_training_data(self, parameters: TextClassificationParameters) -> Tuple[dict, dict, dict, dict]:
+        dataset = load_dataset('csv', data_files=parameters.get_training_csv_file_path(), delimiter=parameters.get_training_csv_limiter())
+        dataset = dataset["train"]
+        dataset = dataset.train_test_split(test_size=0.2)
+        logger.info(dataset)
+        logger.info(dataset["train"][10])
+        # Tokenize the value column
+        tokenizer = AutoTokenizer.from_pretrained(parameters.get_source_model_name())
+        def preprocess_function(examples):
+            return tokenizer(examples["value"], truncation=True,     padding='max_length')
+        tokenized_dataset = dataset.map(preprocess_function, batched=True)
+        # Extract the labels
+        labels = tokenized_dataset['train'].unique('target')
+        label2id, id2label = dict(), dict()
+        for i, label in enumerate(labels):
+            label2id[label] = i
+            id2label[i] = label
+        logger.info(id2label)
+        # Rename the Target column to labels and remove unnecessary columns
+        tokenized_dataset = tokenized_dataset.rename_column('target', 'labels')
+        # Columns to keep
+        columns_to_keep = ['input_ids', 'labels', 'attention_mask']
+        all_columns = tokenized_dataset["train"].column_names
+        columns_to_remove = [col for col in all_columns if col not in columns_to_keep]
+        tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)
+        # Map labels to numeric ids
+        def map_labels(example):
+            example['labels'] = label2id[example['labels']]
+            return example
+        tokenized_dataset = tokenized_dataset.map(map_labels)
+        logger.info(tokenized_dataset)
+        logger.info(tokenized_dataset["train"][10])
+        return tokenized_dataset, labels, label2id, id2label
+    def __train_model(self, tokenized_dataset: dict, labels: dict, label2id: dict, id2label: dict, parameters: TextClassificationParameters):
+        tokenizer = AutoTokenizer.from_pretrained(parameters.get_source_model_name())
+        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+        environment_variable_checker = EnvironmentVariableChecker()
+        HfFolder.save_token(environment_variable_checker.get_huggingface_token())
+        progressCallback = ProgressCallback(self.get_status(), 11, 89)
+        # Evaluate and metrics
+        accuracy = evaluate.load("accuracy")
+        def compute_metrics(eval_pred):
+            predictions, labels = eval_pred
+            predictions = np.argmax(predictions, axis=1)
+            return accuracy.compute(predictions=predictions, references=labels)
+        # train the model
+        model = AutoModelForSequenceClassification.from_pretrained(
+            parameters.get_source_model_name(),
+            num_labels=len(labels),
+            id2label=id2label,
+            label2id=label2id
+        )
+        target_model_id = environment_variable_checker.get_huggingface_organization() + '/' + parameters.get_result_model_name()
+        training_args = TrainingArguments(
+            output_dir=parameters.get_result_model_name(),
+            hub_model_id=target_model_id,
+            learning_rate=parameters.get_training_parameters().learning_rate,
+            per_device_train_batch_size=16,
+            per_device_eval_batch_size=16,
+            num_train_epochs=parameters.get_training_parameters().epochs,
+            weight_decay=0.01,
+            eval_strategy="epoch",
+            save_strategy="epoch",
+            load_best_model_at_end=True,
+            metric_for_best_model="accuracy",
+            push_to_hub=False,
+            remove_unused_columns=False,
+            hub_private_repo=True,
+        )
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_dataset["train"],
+            eval_dataset=tokenized_dataset["test"],
+            tokenizer=tokenizer,
+            data_collator=data_collator,
+            compute_metrics=compute_metrics,
+            callbacks=[progressCallback]
+        )
+        if(self.get_status().is_training_aborted()):
+            return
+        trainer.train()
+        if(self.get_status().is_training_aborted()):
+            return
+        logger.info(f"Model trained, start uploading")
+        self.get_status().update_status(90, f"Uploading model to Hugging Face")
+        trainer.push_to_hub()

src/training_manager.py CHANGED Viewed

@@ -15,11 +15,6 @@ class TrainingManager:
     __training_task = None
     __trainer: AbstractTrainer = None
-    task_status = {"progress": 0, "status": "Not started"}
-    def __init__(self, trainer: AbstractTrainer):
-        self.__trainer = trainer
     async def __do_start_training(self, parameters):
         logger.info('do start training')
@@ -29,22 +24,28 @@ class TrainingManager:
         logger.info('done')
-    async def start_training(self, parameters):
         logger.info('start training')
         if self.__training_task is None or self.__training_task.done():
             self.__training_task = asyncio.create_task(self.__do_start_training(parameters))
         else:
-            raise RuntimeError("Training already running")
     def get_task_status(self) -> TrainingStatus:
         return self.__trainer.get_status()
     def stop_task(self):
-        if self.__training_task is not None and not self.__training_task.done():
             self.__trainer.get_status().abort_training("Stopping training")
             #self.__training_task.cancel()
         else:
-            raise RuntimeError("No task running")

     __training_task = None
     __trainer: AbstractTrainer = None
     async def __do_start_training(self, parameters):
         logger.info('do start training')
         logger.info('done')
+    async def start_training(self, trainer: AbstractTrainer, parameters):
         # Start a training run in the background using the given trainer.
         #
         # The trainer is stored on the manager and the run is scheduled via
         # asyncio.create_task; this method does not await the created task.
         # Only one run may be active: a new one is accepted when no task has
         # been created yet or the previous task is done.
         #
         # Raises:
         #     RuntimeError: if a previously started task is not done yet.
         logger.info('start training')
         if self.__training_task is None or self.__training_task.done():
+            self.__trainer = trainer
             self.__training_task = asyncio.create_task(self.__do_start_training(parameters))
         else:
+            raise RuntimeError("Training already running.")
     def get_task_status(self) -> TrainingStatus:
         # Return the status object of the currently stored trainer.
         # When no trainer has been stored yet, a newly constructed
         # TrainingStatus is returned instead of failing on None.
+        if self.__trainer is None:
+            return TrainingStatus()
         return self.__trainer.get_status()
     def stop_task(self):
         # Request that the running training stops.
         #
         # The request is made by calling abort_training() on the trainer's
         # status object; the asyncio task itself is not cancelled (the cancel
         # call below is commented out).
         # NOTE(review): presumably the trainer polls the status for the abort
         # flag and returns on its own - confirm against the trainer code.
         #
         # Raises:
         #     RuntimeError: if there is no task, the task is already done,
         #         or no trainer is stored.
+        if self.__training_task is not None and not self.__training_task.done() and self.__trainer is not None:
             self.__trainer.get_status().abort_training("Stopping training")
             #self.__training_task.cancel()
         else:
+            raise RuntimeError("No task running.")

src/training_status.py CHANGED Viewed

@@ -15,10 +15,12 @@ class Status(Enum):
 class TrainingStatus:
     __status: Status = Status.NOT_STARTED
     __task: str = None
     __progress: int = 0
-    def update_status(self, progress: int, task: str):
         if progress < 0 or progress > 100:
             raise ValueError("Progress must be between 0 and 100")
@@ -33,6 +35,9 @@ class TrainingStatus:
         if task is not None:
             self.__task = task
     def abort_training(self, task: str):
         self.__task = task
@@ -55,4 +60,6 @@ class TrainingStatus:
     def get_task(self) -> str:
         return self.__task

 class TrainingStatus:
     __status: Status = Status.NOT_STARTED
+    __project_name: str = None
     __task: str = None
     __progress: int = 0
+    def update_status(self, progress: int, task: str, project_name: str = None):
         if progress < 0 or progress > 100:
             raise ValueError("Progress must be between 0 and 100")
         if task is not None:
             self.__task = task
+        if project_name is not None:
+            self.__project_name = project_name
     def abort_training(self, task: str):
         self.__task = task
     def get_task(self) -> str:
         # Return the stored task text, as assigned by update_status() or
         # abort_training(); the class-level default is None.
         return self.__task
+    def get_project_name(self) -> str:
         # Return the stored project name, as assigned by update_status()
         # when a non-None project_name is passed; the class-level default
         # is None.
+        return self.__project_name