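"""Factories for the Gradio input components of the LLM fine-tuning notebook generator.

Each helper below builds a small group of related inputs (quantization, dataset,
tokenizer padding, LoRA, TrainingArguments, optimizer and SFTTrainer options) whose
values are written into the generated notebook.
"""
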
from typing import Set, Tuple
import gradio as gr
from gradio.components import Component
from utils import *  # element-ID constants (e.g. LOAD_IN_4_BIT_ID) and ft_datasets used below


def add_quantization_components() -> Set[Component]:
q_components: Set[Component] = set()
load_in_4bit = gr.Radio(["load_in_4bit", "load_in_8bit"], value="load_in_4bit",
label="Quantization",
info="This flag is used to enable 4/8-bit "
"quantization.",
interactive=True,
elem_id=LOAD_IN_4_BIT_ID)
bnb_4bit_quant_type = gr.Radio(["fp4", "nf4"], label="bnb_4bit_quant_type",
value="nf4",
elem_id=BNB_4BIT_QUANT_TYPE,
interactive=True,
info="This sets the quantization data type in "
"the bnb.nn.Linear4Bit "
"layers.")
q_components.add(load_in_4bit)
q_components.add(bnb_4bit_quant_type)
return q_components


def add_quantization_components1() -> Set[Component]:
q_components: Set[Component] = set()
bnb_4bit_compute_dtype = gr.Radio(
["torch.float32", "torch.bfloat16", "torch.float16"],
label="bnb_4bit_compute_dtype",
info="This sets the computational type which might be different "
"than the input type.",
elem_id=BNB_4BIT_COMPUTE_DTYPE,
interactive=True, value="torch.bfloat16")
bnb_4bit_use_double_quant = gr.Checkbox(label="bnb_4bit_use_double_quant",
value=True,
interactive=True,
elem_id=BNB_4BIT_USE_DOUBLE_QUANT,
info="This flag is used for nested "
"quantization where the "
"quantization constants from "
"the first "
"quantization are quantized "
"again.")
q_components.add(bnb_4bit_compute_dtype)
q_components.add(bnb_4bit_use_double_quant)
return q_components
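

# The four quantization inputs above mirror transformers.BitsAndBytesConfig. A minimal
# sketch of how the generated notebook is expected to consume their default values
# (variable names here are illustrative, not the app's actual ones):
#
#     import torch
#     from transformers import BitsAndBytesConfig
#
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,                      # "Quantization" radio
#         bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type
#         bnb_4bit_compute_dtype=torch.bfloat16,  # bnb_4bit_compute_dtype
#         bnb_4bit_use_double_quant=True,         # bnb_4bit_use_double_quant
#     )

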
def add_dataset_components() -> Set[Component]:
dataset_selection = gr.Dropdown(
[dt.path for dt in ft_datasets],
elem_id=DATASET_SELECTION_ID,
label="Select a Dataset",
info="Select a dataset for finetuning the model."
)
seed = gr.Slider(0, 256, step=1, value=42, elem_id=DATASET_SHUFFLING_SEED, label="Random Seed",
info="Set a random seed for shuffling the dataset.", interactive=True)
d_components: Set[Component] = set()
d_components.add(dataset_selection)
d_components.add(seed)
return d_components
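

# The dropdown holds a Hugging Face Hub dataset path and the slider a shuffling seed.
# A hedged sketch of the expected downstream use ("some/dataset" is a placeholder,
# not an entry of ft_datasets):
#
#     from datasets import load_dataset
#
#     dataset = load_dataset("some/dataset", split="train").shuffle(seed=42)

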
def add_pad_tokens() -> Set[Component]:
pad_token_side = gr.Radio(["right", "left"], label="Tokenizer: padding_side",
info="The side on which the model should have padding applied.",
interactive=True, value="right", elem_id=PAD_SIDE_ID)
    pad_token_value = gr.Radio([None, "eos_token"], label="Tokenizer: pad_token",
                               info="A special token used to make arrays of tokens the same size for batching "
                                    "purposes. It will then be ignored by attention mechanisms or loss computation.",
                               interactive=True, value=None, elem_id=PAD_VALUE_ID)
pad_components: Set[Component] = set()
pad_components.add(pad_token_side)
pad_components.add(pad_token_value)
return pad_components
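

# These two radios map to plain tokenizer attributes. A sketch, assuming the base model's
# tokenizer is loaded with AutoTokenizer (the model id is a placeholder):
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("some/base-model")
#     tokenizer.padding_side = "right"            # "Tokenizer: padding_side"
#     tokenizer.pad_token = tokenizer.eos_token   # when pad_token == "eos_token"

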
def add_lora_components() -> Set[Component]:
r = gr.Slider(1, 2048, step=1, value=6, label="r", info="Lora attention dimension (the 'rank').",
interactive=True, elem_id=LORA_R_ID)
alpha = gr.Slider(1, 512, step=1, value=8, label="lora_alpha", info="The alpha parameter for Lora scaling.",
interactive=True, elem_id=LORA_ALPHA_ID)
out_components: Set[Component] = set()
out_components.add(r)
out_components.add(alpha)
return out_components


def add_lora_components1() -> Set[Component]:
dropout = gr.Slider(0, 1, step=0.01, value=0.05, label="lora_dropout",
info="The dropout probability for Lora layers.",
interactive=True, elem_id=LORA_DROPOUT_ID)
bias = gr.Radio(['none', 'all', 'lora_only'], label="bias",
info="Bias type for LoRA. If 'all' or 'lora_only', the corresponding biases will be updated during "
"training.",
interactive=True, value="none", elem_id=LORA_BIAS_ID)
out_components: Set[Component] = set()
out_components.add(dropout)
out_components.add(bias)
return out_components
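

# The LoRA inputs from add_lora_components and add_lora_components1 feed peft.LoraConfig.
# An illustrative sketch with the defaults above (task_type is an assumption, not a value
# collected by this UI):
#
#     from peft import LoraConfig
#
#     peft_config = LoraConfig(
#         r=6,
#         lora_alpha=8,
#         lora_dropout=0.05,
#         bias="none",
#         task_type="CAUSAL_LM",
#     )

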
def add_training_args_1() -> Set[Component]:
epochs = gr.Slider(1, 100, step=1, value=3, label="num_train_epochs",
info="Total number of training epochs to perform.",
interactive=True, elem_id=NUM_TRAIN_EPOCHS_ID)
    max_steps = gr.Slider(-1, 100, step=1, value=-1, label="max_steps",
                          info="If set to a positive number, the total number of training steps to perform; "
                               "it overrides num_train_epochs. Keep -1 to train for num_train_epochs.",
                          interactive=True, elem_id=MAX_STEPS_ID)
out_components: Set[Component] = set()
out_components.add(epochs)
out_components.add(max_steps)
return out_components


def add_training_args_1_bis() -> Set[Component]:
    logging_steps = gr.Slider(1, 100, step=1, value=10, label="logging_steps",
                              info="Number of update steps between two logs if logging_strategy='steps'.",
                              interactive=True, elem_id=LOGGING_STEPS_ID)
per_device_train_batch_size = gr.Slider(1, 64, step=1, value=4, label="per_device_train_batch_size",
info="Batch size per device during training.",
interactive=True, elem_id=PER_DEVICE_TRAIN_BATCH_SIZE)
save_strategy = gr.Radio(['no', 'epoch', 'steps'], label="save_strategy",
info="The checkpoint save strategy to adopt during training.",
interactive=True, value="epoch", elem_id=SAVE_STRATEGY_ID)
out_components: Set[Component] = set()
out_components.add(save_strategy)
out_components.add(logging_steps)
out_components.add(per_device_train_batch_size)
return out_components


def add_training_args_3() -> Set[Component]:
max_grad_norm = gr.Slider(0.01, 1, value=0.3, label="max_grad_norm",
info="Maximum gradient norm (for gradient clipping).",
interactive=True, elem_id=MAX_GRAD_NORM_ID)
warmup_ratio = gr.Slider(0, 1, value=0.1, label="warmup_ratio",
info="Ratio of total training steps used for a linear warmup from 0 to learning_rate.",
interactive=True, elem_id=WARMUP_RATIO_ID)
    gradient_accumulation_steps = gr.Slider(1, 64, step=1, value=2, label="gradient_accumulation_steps",
                                            info="Number of update steps to accumulate the gradients for before "
                                                 "performing a backward/update pass.",
                                            interactive=True, elem_id=GRADIENT_ACCUMULATION_STEPS_ID)
gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True, interactive=True,
info="Use gradient checkpointing to save memory at the expense of slower "
"backward pass.", elem_id=GRADIENT_CHECKPOINTING_ID)
lr_scheduler_type = gr.Radio(['linear', 'constant', 'cosine'], label="lr_scheduler_type",
info="The learning rate scheduler type to use.",
interactive=True, value="cosine", elem_id=LR_SCHEDULER_TYPE_ID)
out_components: Set[Component] = set()
out_components.add(max_grad_norm)
out_components.add(warmup_ratio)
out_components.add(gradient_accumulation_steps)
out_components.add(gradient_checkpointing)
out_components.add(lr_scheduler_type)
return out_components
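

# add_training_args_1, add_training_args_1_bis and add_training_args_3 all collect
# transformers.TrainingArguments fields. A sketch with the defaults chosen above
# (output_dir is a placeholder; it is collected separately in add_outputs):
#
#     from transformers import TrainingArguments
#
#     training_args = TrainingArguments(
#         output_dir="output",
#         num_train_epochs=3,
#         max_steps=-1,
#         logging_steps=10,
#         per_device_train_batch_size=4,
#         save_strategy="epoch",
#         max_grad_norm=0.3,
#         warmup_ratio=0.1,
#         gradient_accumulation_steps=2,
#         gradient_checkpointing=True,
#         lr_scheduler_type="cosine",
#     )

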
def add_outputs() -> Tuple[Component, Component]:
output_dir = gr.Textbox(interactive=True,
label="output_dir",
info='The output directory where the model predictions and checkpoints will be written.',
elem_id=OUTPUT_DIR_ID)
push_to_hub = gr.Checkbox(label="push_to_hub", value=False, interactive=True,
info="Whether or not to upload the trained model to the hub after training. If this is "
"True, you must specify 'HF_TOKEN'.",
elem_id=PUSH_TO_HUB_ID)
return output_dir, push_to_hub


def add_hf_repo_cmp() -> Component:
repo_name = gr.Textbox(label="HF Repo name",
placeholder="username/your_repository",
info="Hugging Face repository to be created.",
interactive=True,
visible=False,
elem_id=REPOSITORY_NAME_ID)
return repo_name


def add_outputs1() -> Set[Component]:
report_to = gr.Dropdown(
["azure_ml", "comet_ml", "mlflow", "tensorboard", "wandb", "all", 'none'],
value="tensorboard",
elem_id=REPORT_TO_ID,
label="report_to",
info="The list of integrations to report the results and logs to. Supported platforms are 'azure_ml', "
"'comet_ml', 'mlflow', 'tensorboard' and 'wandb'. Use 'all' to report to all integrations installed, "
"'none' for no integrations."
)
out_components: Set[Component] = set()
out_components.add(report_to)
return out_components
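

# output_dir, push_to_hub, the HF repo name and report_to also end up in TrainingArguments.
# A hedged sketch of how pushing to the Hub is typically wired (using hub_model_id for the
# repo name is an assumption about the generated notebook, not something enforced here):
#
#     training_args = TrainingArguments(
#         output_dir="output",
#         push_to_hub=True,
#         hub_model_id="username/your_repository",  # from "HF Repo name"
#         report_to="tensorboard",
#     )

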
def add_optimizer() -> Set[Component]:
    adam_beta1 = gr.Slider(0.00001, 1, value=0.9, label="adam_beta1",
                           info="The beta1 hyperparameter for the `AdamW` optimizer.",
                           interactive=True, elem_id=BETA1_ID)
    adam_beta2 = gr.Slider(0.00001, 1, value=0.999, label="adam_beta2",
                           info="The beta2 hyperparameter for the `AdamW` optimizer.",
                           interactive=True, elem_id=BETA2_ID)
    adam_epsilon = gr.Slider(1e-9, 1, value=1e-8, label="adam_epsilon",
                             info="The epsilon hyperparameter for the `AdamW` optimizer.",
                             interactive=True, elem_id=EPSILON_ID)
out_components: Set[Component] = set()
out_components.add(adam_beta1)
out_components.add(adam_beta2)
out_components.add(adam_epsilon)
return out_components


def add_optimizer1() -> Set[Component]:
optimizer = gr.Dropdown(
["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision", "adafactor"],
value="adamw_torch_fused",
elem_id=OPTIMIZER_ID,
label="optimizer",
info="The optimizer to use: 'adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_apex_fused', "
"'adamw_anyprecision' or "
"'adafactor'. "
)
learning_rate = gr.Slider(1e-6, 1, step=0.001, value=2.0e-05, label="learning_rate",
info="The initial learning rate for AdamW.",
interactive=True, elem_id=LEARNING_RATE_ID)
    weight_decay = gr.Slider(0, 1, value=0, label="weight_decay",
                             info="The weight decay to apply (if not zero) to all layers except all bias and "
                                  "LayerNorm weights in the `AdamW` optimizer.",
                             interactive=True, elem_id=WEIGHT_DECAY_ID)
out_components: Set[Component] = set()
out_components.add(optimizer)
out_components.add(learning_rate)
out_components.add(weight_decay)
return out_components
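

# The optimizer inputs map onto the corresponding TrainingArguments fields; note that in
# TrainingArguments the dropdown value is passed as `optim`, not `optimizer`. A sketch with
# the defaults above:
#
#     training_args = TrainingArguments(
#         output_dir="output",
#         optim="adamw_torch_fused",
#         learning_rate=2e-5,
#         weight_decay=0.0,
#         adam_beta1=0.9,
#         adam_beta2=0.999,
#         adam_epsilon=1e-8,
#     )

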
def add_sft_trainer_args() -> Set[Component]:
max_seq_length = gr.Slider(512, 3072, value=2048, label="max_seq_length",
info="The maximum sequence length to use for the `ConstantLengthDataset` and for "
"automatically "
"creating the Dataset.",
interactive=True, elem_id=MAX_SEQ_LENGTH_ID)
packing = gr.Checkbox(label="packing", value=True, interactive=True, elem_id=PACKING_ID,
info="This argument is used by the `ConstantLengthDataset` to pack the sequences of the "
"dataset.")
out_components: Set[Component] = set()
out_components.add(max_seq_length)
out_components.add(packing)
return out_components
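

# max_seq_length and packing are trl SFTTrainer options. A sketch of the expected use
# (model, tokenizer, dataset, peft_config and training_args are placeholders from the
# earlier sketches; in recent trl releases these two options live on SFTConfig rather
# than being passed to SFTTrainer directly):
#
#     from trl import SFTTrainer
#
#     trainer = SFTTrainer(
#         model=model,
#         train_dataset=dataset,
#         tokenizer=tokenizer,
#         peft_config=peft_config,
#         args=training_args,
#         max_seq_length=2048,
#         packing=True,
#     )
#     trainer.train()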