File size: 2,879 Bytes
ba600a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import os

import boto3
import datasets
import sagemaker
import sagemaker.s3 as S3Downloader
import transformers
from sagemaker.huggingface import HuggingFace


def train(trainDataPath: str, testDataPath: str, hyperparameters: dict | None = None):
    """Tokenize local datasets, upload them to S3, run a SageMaker HuggingFace
    training job, and deploy the resulting model.

    Args:
        trainDataPath: Local path of the training dataset
            (``datasets.save_to_disk`` format, with ``sentence`` and
            ``disease`` columns — TODO confirm schema against the caller).
        testDataPath: Local path of the test dataset, same format.
        hyperparameters: Optional hyperparameters forwarded to the training
            job; defaults to 50 epochs, batch size 32, the UnaniBERT model.

    Returns:
        The deployed ``sagemaker`` Predictor for the trained model, so the
        caller can invoke the endpoint and delete it when done.
    """
    # One session is enough; its default bucket is created automatically by
    # SageMaker if it does not already exist. (The original built two
    # sessions via a sagemaker_session_bucket=None dance that always
    # resolved to the default bucket anyway.)
    sess = sagemaker.Session()
    bucket = sess.default_bucket()

    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        # Not running inside a SageMaker notebook/job: resolve the role ARN
        # from a conventionally named IAM role instead.
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    tokenizer = transformers.BertTokenizer.from_pretrained('HaiderSultanArc/UnaniBERT')

    def tokenize(batch):
        # Batched map callback: tokenize the 'sentence' column to fixed-length
        # input_ids / attention_mask.
        return tokenizer(batch['sentence'], padding='max_length', truncation=True)

    train_dataset = datasets.load_from_disk(trainDataPath)
    test_dataset = datasets.load_from_disk(testDataPath)

    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # The HF Trainer convention expects the label column to be named 'labels'.
    train_dataset = train_dataset.rename_column("disease", "labels")
    test_dataset = test_dataset.rename_column("disease", "labels")

    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Upload the tokenized datasets to S3 so the remote training job can
    # read them as channels.
    training_input_path = f's3://{bucket}/UnaniBERT_dataset/train'
    test_input_path = f's3://{bucket}/UnaniBERT_dataset/test'
    train_dataset.save_to_disk(training_input_path)
    test_dataset.save_to_disk(test_input_path)

    if hyperparameters is None:
        hyperparameters = {
            'epochs': 50,
            'train_batch_size': 32,
            'model_name': 'HaiderSultanArc/UnaniBERT',
        }

    huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir='./tasks/training',
        instance_type='ml.p3.2xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.26',
        pytorch_version='1.13',
        py_version='py39',
        hyperparameters=hyperparameters,
    )

    # BUG FIX: train on the tokenized S3 copies just uploaded, not the raw
    # local paths — the original passed trainDataPath/testDataPath here,
    # discarding all of the preprocessing above.
    huggingface_estimator.fit(
        {
            'train': training_input_path,
            'test': test_input_path,
        }
    )

    # Return the predictor instead of dropping it: the deploy call stands up
    # a live (billed) endpoint the caller must be able to use and tear down.
    return huggingface_estimator.deploy(1, "ml.g4dn.xlarge")