import argparse
import os

import boto3
import datasets
import sagemaker
import sagemaker.s3 as S3Downloader  # noqa: F401 — kept for compatibility; unused here
import transformers
from sagemaker.huggingface import HuggingFace


def train(trainDataPath: str, testDataPath: str, hyperparameters: dict | None = None):
    """Tokenize local datasets, upload them to S3, and run a SageMaker
    HuggingFace training job, then deploy the trained model to an endpoint.

    Args:
        trainDataPath: Local path of a HuggingFace dataset saved with
            ``save_to_disk``; must contain ``sentence`` and ``disease`` columns.
        testDataPath: Local path of the evaluation dataset, same schema.
        hyperparameters: Optional dict passed through to the training script
            (``./tasks/training/train.py``). Defaults to 50 epochs, batch
            size 32, and the ``HaiderSultanArc/UnaniBERT`` base model.

    Returns:
        The deployed SageMaker predictor for the trained model.

    Raises:
        botocore/sagemaker client errors on AWS failures; ``iam.get_role``
        errors if neither an execution role nor ``sagemaker_execution_role``
        is available.
    """
    sess = sagemaker.Session()
    # SageMaker creates this bucket automatically if it does not yet exist;
    # it holds the uploaded data, model artifacts and logs.
    sagemaker_session_bucket = sess.default_bucket()

    # Resolve the execution role: inside SageMaker this succeeds directly;
    # outside (e.g. local dev), fall back to looking the role up via IAM.
    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    tokenizer_name = 'HaiderSultanArc/UnaniBERT'
    tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_name)

    def tokenize(batch):
        # Pad/truncate to max length so batches have a uniform shape.
        return tokenizer(batch['sentence'], padding='max_length', truncation=True)

    train_dataset = datasets.load_from_disk(trainDataPath)
    test_dataset = datasets.load_from_disk(testDataPath)

    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # The HuggingFace Trainer expects the label column to be named "labels".
    train_dataset = train_dataset.rename_column("disease", "labels")
    test_dataset = test_dataset.rename_column("disease", "labels")

    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Upload the tokenized train dataset to S3 so the training job can read it.
    training_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/train'
    train_dataset.save_to_disk(training_input_path)

    # Upload the tokenized test dataset to S3.
    test_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/test'
    test_dataset.save_to_disk(test_input_path)

    # Hyperparameters forwarded verbatim to the training script.
    if hyperparameters is None:
        hyperparameters = {
            'epochs': 50,
            'train_batch_size': 32,
            'model_name': 'HaiderSultanArc/UnaniBERT',
        }

    huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir='./tasks/training',
        instance_type='ml.p3.2xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.26',
        pytorch_version='1.13',
        py_version='py39',
        hyperparameters=hyperparameters,
    )

    # BUG FIX: train on the tokenized datasets just uploaded to S3, not the
    # raw local paths — the remote training instance cannot read local paths,
    # and the upload above would otherwise be wasted work.
    huggingface_estimator.fit(
        {
            'train': training_input_path,
            'test': test_input_path,
        }
    )

    # BUG FIX: return the deployed predictor so callers can invoke the
    # endpoint (and later delete it); previously it was silently discarded.
    predictor = huggingface_estimator.deploy(1, "ml.g4dn.xlarge")
    return predictor