# AI Engine API — SageMaker training launcher (HaiderSultanArc, revision ba600a6)
import argparse
import os
import boto3
import datasets
import sagemaker
import sagemaker.s3 as S3Downloader
import transformers
from sagemaker.huggingface import HuggingFace
def train(trainDataPath: str, testDataPath: str, hyperparameters: dict | None = None):
    """Tokenize the datasets, upload them to S3, run a SageMaker Hugging Face
    training job, and deploy the resulting model.

    Args:
        trainDataPath: path of the train split in ``datasets.save_to_disk`` format.
        testDataPath: path of the test split in the same format.
        hyperparameters: hyperparameters forwarded to the training job; when
            None, defaults to ``epochs=50, train_batch_size=32,
            model_name='HaiderSultanArc/UnaniBERT'``.

    Returns:
        The deployed SageMaker predictor for the trained model.
    """
    sess = sagemaker.Session()

    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it does not exist
    sagemaker_session_bucket = None
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()

    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        # get_execution_role() raises outside a SageMaker environment
        # (e.g. local run); fall back to a pre-provisioned IAM role.
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    # Rebuild the session bound to the resolved default bucket.
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    tokenizer_name = 'HaiderSultanArc/UnaniBERT'
    tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_name)

    def tokenize(batch):
        # Pad/truncate each example's 'sentence' field to the model max length.
        return tokenizer(batch['sentence'], padding='max_length', truncation=True)

    train_dataset = datasets.load_from_disk(trainDataPath)
    test_dataset = datasets.load_from_disk(testDataPath)

    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # The Trainer script expects the target column to be called 'labels'.
    train_dataset = train_dataset.rename_column("disease", "labels")
    test_dataset = test_dataset.rename_column("disease", "labels")

    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    # save tokenized train_dataset to s3
    training_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/train'
    train_dataset.save_to_disk(training_input_path)

    # save tokenized test_dataset to s3
    test_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/test'
    test_dataset.save_to_disk(test_input_path)

    if hyperparameters is None:
        # default hyperparameters which are passed to the training job
        hyperparameters = {
            'epochs': 50,
            'train_batch_size': 32,
            'model_name': 'HaiderSultanArc/UnaniBERT'
        }

    # create the Estimator
    huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir='./tasks/training',
        instance_type='ml.p3.2xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.26',
        pytorch_version='1.13',
        py_version='py39',
        hyperparameters=hyperparameters
    )

    # BUG FIX: train on the tokenized datasets that were just uploaded to S3,
    # not the raw input paths — otherwise all the preprocessing above
    # (tokenization, column rename, torch formatting) is silently discarded.
    huggingface_estimator.fit(
        {
            'train': training_input_path,
            'test': test_input_path
        }
    )

    # BUG FIX: return the deployed predictor so callers can actually use it;
    # previously it was assigned and dropped.
    predictor = huggingface_estimator.deploy(1, "ml.g4dn.xlarge")
    return predictor