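"""Run an AutoTrain hyperparameter sweep for binary text classification on IMDB.

Expects AUTOTRAIN_USERNAME and HF_TOKEN to be set in the environment.
"""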
import os
from uuid import uuid4

from datasets import load_dataset

from autotrain.dataset import AutoTrainDataset
from autotrain.project import Project

# random suffix keeps project names unique across runs
RANDOM_ID = str(uuid4())
DATASET = "imdb"
PROJECT_NAME = f"imdb_{RANDOM_ID}"
TASK = "text_binary_classification"
MODEL = "bert-base-uncased"
USERNAME = os.environ["AUTOTRAIN_USERNAME"]
TOKEN = os.environ["HF_TOKEN"]

if __name__ == "__main__":
    dataset = load_dataset(DATASET)
    train = dataset["train"]
    # use the IMDB test split as the validation set
    validation = dataset["test"]

    # convert both splits to pandas dataframes
    train_df = train.to_pandas()
    validation_df = validation.to_pandas()

    # prepare the dataset for AutoTrain
    dset = AutoTrainDataset(
        train_data=[train_df],
        valid_data=[validation_df],
        task=TASK,
        token=TOKEN,
        project_name=PROJECT_NAME,
        username=USERNAME,
        column_mapping={"text": "text", "label": "label"},
        percent_valid=None,  # a validation set is supplied, so no split is needed
    )
    dset.prepare()

    # How to get the available params for a task:
    #
    #   from autotrain.params import Params
    #   params = Params(task=TASK, training_type="hub_model").get()
    #   print(params)  # prints the full list of params for the task

    # define the params for each training job in the proper format
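    # the three jobs below form a small grid over learning rate,
    # optimizer, and scheduler; each dict is one training run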
    job1 = {
        "task": TASK,
        "learning_rate": 1e-5,
        "optimizer": "adamw_torch",
        "scheduler": "linear",
        "epochs": 5,
    }
    job2 = {
        "task": TASK,
        "learning_rate": 3e-5,
        "optimizer": "adamw_torch",
        "scheduler": "cosine",
        "epochs": 5,
    }
    job3 = {
        "task": TASK,
        "learning_rate": 5e-5,
        "optimizer": "sgd",
        "scheduler": "cosine",
        "epochs": 5,
    }
    jobs = [job1, job2, job3]
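
    # create the project with all three jobs attached, then approve it
    # to start training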
    project = Project(dataset=dset, hub_model=MODEL, job_params=jobs)
    project_id = project.create()
    project.approve(project_id)