# Fine-tune and deploy the UnaniBERT disease-classification model on Amazon SageMaker.
import argparse
import os

import boto3
import datasets
import sagemaker
import sagemaker.s3 as S3Downloader
import transformers
from sagemaker.huggingface import HuggingFace
def train(trainDataPath: str, testDataPath: str, hyperparameters: dict | None = None):
    """Tokenize the Unani datasets, launch a SageMaker training job, and deploy the model.

    Args:
        trainDataPath: Path of the training split saved with ``datasets.save_to_disk``.
        testDataPath: Path of the test split saved with ``datasets.save_to_disk``.
        hyperparameters: Optional hyperparameter overrides for the training job.
            Defaults to 50 epochs, train batch size 32, model
            ``'HaiderSultanArc/UnaniBERT'``.

    Returns:
        The deployed SageMaker predictor for the trained model.
    """
    sess = sagemaker.Session()
    # The session's default bucket is used for uploading data, models and logs;
    # SageMaker creates it automatically if it does not exist.
    sagemaker_session_bucket = sess.default_bucket()

    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        # Running outside a SageMaker notebook/job: resolve the role ARN via IAM.
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    tokenizer = transformers.BertTokenizer.from_pretrained('HaiderSultanArc/UnaniBERT')

    def tokenize(batch):
        # Pad to the model's max length so all examples batch uniformly.
        return tokenizer(batch['sentence'], padding='max_length', truncation=True)

    train_dataset = datasets.load_from_disk(trainDataPath)
    test_dataset = datasets.load_from_disk(testDataPath)

    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # The HF Trainer in the entry-point script expects the label column to be 'labels'.
    train_dataset = train_dataset.rename_column("disease", "labels")
    test_dataset = test_dataset.rename_column("disease", "labels")
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Upload the tokenized datasets to S3 so the remote training job can read them.
    training_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/train'
    train_dataset.save_to_disk(training_input_path)
    test_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/test'
    test_dataset.save_to_disk(test_input_path)

    # Hyperparameters passed through to the training job's entry point.
    if hyperparameters is None:
        hyperparameters = {
            'epochs': 50,
            'train_batch_size': 32,
            'model_name': 'HaiderSultanArc/UnaniBERT',
        }

    huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir='./tasks/training',
        instance_type='ml.p3.2xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.26',
        pytorch_version='1.13',
        py_version='py39',
        hyperparameters=hyperparameters,
    )

    # BUG FIX: train on the S3 copies that were just uploaded, not the original
    # local paths — the remote training instance cannot read the caller's disk.
    huggingface_estimator.fit(
        {
            'train': training_input_path,
            'test': test_input_path,
        }
    )

    predictor = huggingface_estimator.deploy(1, "ml.g4dn.xlarge")
    return predictor