Add finetuning process configuration to model
- app.py +12 -33
- spanish_medica_llm.py +157 -3
app.py
CHANGED
```diff
@@ -10,7 +10,7 @@ import sys
 import torch
 
 
-from spanish_medica_llm import run_training, run_training_process
+from spanish_medica_llm import run_training, run_training_process, run_finnetuning_process
 
 import gradio as gr
 
@@ -45,41 +45,18 @@ def train_model(*inputs):
     if "IS_SHARED_UI" in os.environ:
         raise gr.Error("This Space only works in duplicated instances")
 
-
-    # image_captions_filename = True,
-    # train_text_encoder = True,
-    # #stop_text_encoder_training = stptxt,
-    # save_n_steps = 0,
-    # #pretrained_model_name_or_path = model_to_load,
-    # instance_data_dir="instance_images",
-    # #class_data_dir=class_data_dir,
-    # output_dir="output_model",
-    # instance_prompt="",
-    # seed=42,
-    # resolution=512,
-    # mixed_precision="fp16",
-    # train_batch_size=1,
-    # gradient_accumulation_steps=1,
-    # use_8bit_adam=True,
-    # learning_rate=2e-6,
-    # lr_scheduler="polynomial",
-    # lr_warmup_steps = 0,
-    # #max_train_steps=Training_Steps,
-    # )
-    # run_training(args_general)
-    # torch.cuda.empty_cache()
-    # #convert("output_model", "model.ckpt")
-    # #shutil.rmtree('instance_images')
-    # #shutil.make_archive("diffusers_model", 'zip', "output_model")
-    # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-    # #    zipdir('output_model/', zipf)
-    # torch.cuda.empty_cache()
-    # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-    run_training_process()
-
+    run_training_process()
 
     return f"Train Model Sucessful!!!"
 
+def finnetuning_model(*inputs):
+    if "IS_SHARED_UI" in os.environ:
+        raise gr.Error("This Space only works in duplicated instances")
+
+    run_finnetuning_process()
+
+    return f"Finnetuning Model Sucessful!!!"
+
 def stop_model(*input):
     return f"Model with Gradio!"
 
@@ -93,6 +70,8 @@ with gr.Blocks() as demo:
     btn_response.click(fn=generate_model, inputs=inp, outputs=out)
     btn_train = gr.Button("Train Model")
     btn_train.click(fn=train_model, inputs=[], outputs=out)
+    btn_finnetuning = gr.Button("Finnetuning Model")
+    btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)
     btn_evaluate = gr.Button("Evaluate Model")
     btn_evaluate.click(fn=evaluate_model, inputs=[], outputs=out)
     btn_stop = gr.Button("Stop Model")
```
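Stripped of the diff noise, the app.py change mirrors the existing training path: a third button whose handler refuses to run on the shared demo Space and otherwise blocks on the new fine-tuning entry point. The sketch below only restates the added lines in one place; `out`, `gr`, `os` and `run_finnetuning_process` all exist in app.py once the import change above is applied.

```python
# Fine-tuning path added by this commit, consolidated for readability.
def finnetuning_model(*inputs):
    if "IS_SHARED_UI" in os.environ:
        # Same guard as train_model: refuse to run on the shared demo Space.
        raise gr.Error("This Space only works in duplicated instances")

    run_finnetuning_process()   # blocks until fine-tuning finishes

    return f"Finnetuning Model Sucessful!!!"   # written into `out` by the click handler

btn_finnetuning = gr.Button("Finnetuning Model")
btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)
```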
spanish_medica_llm.py
CHANGED
```diff
@@ -331,6 +331,45 @@ MAX_TRAINING_STEPS = 2
 
 TOKEN_NAME = TOKEN_MISTRAL_NAME
 
+def get_chat_format(element):
+    """
+    Processes a single sample from the alpaca dataset to structure it for chatbot training.
+
+    This function transforms the dataset sample into a format suitable for training,
+    where each message is categorized by its role in the conversation (system, input, user, assistant).
+    It initializes the conversation with a system message, then conditionally adds an input message,
+    follows with the user's instruction, and finally, the assistant's output based on the provided inputs.
+
+    Parameters
+    ----------
+    sample : dict
+        A dictionary representing a single sample from the dataset. It must contain
+        keys corresponding to input and output components of the conversation.
+
+    Returns
+    -------
+    dict
+        A modified dictionary with a 'messages' key that contains a list of ordered messages,
+        each annotated with its role in the conversation.
+    """
+
+    prompt_template="""A partir del caso clínico que se expone a continuación, tu tarea es la siguiente.
+Como médico experto, tu tarea es la de diagnosticar al paciente en base al caso clínico. Responde únicamente con el diagnóstico para el paciente de forma concisa.
+Caso clínico: {caso_clinico}
+"""
+    # how to use it with an LLM:
+
+    system_prompt = "Eres un experto en medicina que realiza diagnósticos en base a casos clínicos."
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt_template.format(caso_clinico=element["raw_text"])},
+        {"role": "assistant", "content": element["topic"]},
+    ]
+
+    element["raw_text"] = messages
+    return element
+
 def loadSpanishTokenizer():
     """
 
```
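To make the transformation concrete, here is a minimal sketch of what `get_chat_format` does to one row. The `raw_text` and `topic` field names come from the diff; the clinical text and label below are invented placeholders.

```python
element = {
    "raw_text": "Paciente de 54 años con dolor torácico opresivo de 2 horas de evolución...",  # invented
    "topic": "Síndrome coronario agudo",                                                        # invented
}

element = get_chat_format(element)

# element["raw_text"] is no longer a plain string; it is now a chat transcript:
# [
#   {"role": "system",    "content": "Eres un experto en medicina que realiza diagnósticos..."},
#   {"role": "user",      "content": "A partir del caso clínico ... Caso clínico: Paciente de 54 años..."},
#   {"role": "assistant", "content": "Síndrome coronario agudo"},
# ]
```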
```diff
@@ -379,12 +418,32 @@ def splitDatasetInTestValid(dataset):
     return (dataset['train'], eval_dataset, test_dataset)
 
 def loadSpanishDataset():
+
     spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
     spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
     spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
     return spanishMedicaLllmDataset
 
+def loadSpanishDatasetFinnetuning():
+
+    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] in FILTER_CRITERIA)
+    return spanishMedicaLllmDataset
+
 ##See Jupyter Notebook for change CONTEXT_LENGTH size
+def applyChatInstructFormat(dataset, filterColumns = ['raw_text', 'topic']):
+    """
+    Apply instruccion chat_template
+    """
+    if dataset == None:
+        return dataset
+    else:
+        dataset = dataset.remove_columns([col for col in dataset.features if col not in filterColumns])
+        return dataset.map(
+            get_chat_format,
+            batched=False,
+            num_proc=4
+        )
 
 def accelerateConfigModel():
     """
```
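A small way to exercise `applyChatInstructFormat` without downloading the Hub dataset is to build a toy `datasets.Dataset` with the same columns. The rows below are invented; only the column names (`raw_text`, `topic`, `topic_type`) mirror the real dataset.

```python
from datasets import Dataset
from spanish_medica_llm import applyChatInstructFormat

toy = Dataset.from_dict({
    "raw_text":   [f"Caso clínico {i}: ..." for i in range(4)],   # invented clinical notes
    "topic":      [f"Diagnóstico {i}" for i in range(4)],         # invented labels
    "topic_type": [1, 1, 1, 1],                                   # dropped by the column filter
})

formatted = applyChatInstructFormat(toy)
print(formatted[0]["raw_text"])   # -> list of system / user / assistant messages
```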
```diff
@@ -483,6 +542,26 @@ def modelLoraConfigBioMistral(model):
     model = accelerator.prepare_model(model)
     return (model)
 
+def getLoraConfiguration():
+    """
+    """
+    return LoraConfig(
+        r=8,
+        lora_alpha=16,
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+            "lm_head",
+        ],
+        bias="none",
+        lora_dropout=0.05,  # Conventional
+        task_type="CAUSAL_LM",
+    )
 
 # A note on training. You can set the max_steps to be high initially, and examine at what step your
 # model's performance starts to degrade. There is where you'll find a sweet spot for how many steps
```
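In `configAndRunFineTuning` below, this configuration is handed to `SFTTrainer` as `peft_config`, so the trainer applies the adapters itself. A quick way to sanity-check what these settings actually train is to wrap the base model by hand and print the parameter counts; a minimal sketch, assuming `peft` and `transformers` are installed and that the module's own `MISTRAL_BASE_MODEL_ID` constant is importable (loading it downloads the full base model):

```python
from transformers import AutoModelForCausalLM
from peft import get_peft_model
from spanish_medica_llm import getLoraConfiguration, MISTRAL_BASE_MODEL_ID

base = AutoModelForCausalLM.from_pretrained(MISTRAL_BASE_MODEL_ID)
peft_model = get_peft_model(base, getLoraConfiguration())
peft_model.print_trainable_parameters()   # only the r=8 adapters on the listed projections (and lm_head) are trainable
```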
```diff
@@ -541,10 +620,85 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
 
     trainer.push_to_hub()
 
+def configAndRunFineTuning(basemodel, dataset, eval_dataset, tokenizer):
+    if basemodel is None or dataset is None or tokenizer is None:
+        return None
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+
+
+        training_args = TrainingArguments(
+            output_dir=output_dir,
+            push_to_hub = True,
+            hub_private_repo = False,
+            hub_model_id = HUB_MODEL_ID,
+            warmup_steps = 5,
+            per_device_train_batch_size = MICRO_BATCH_SIZE,
+            per_device_eval_batch_size=1,
+            #gradient_checkpointing=True,
+            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
+            num_train_epochs = 1,
+            learning_rate = 2.5e-5,          # Want about 10x smaller than the Mistral learning rate
+            logging_steps = 5,
+            optim="paged_adamw_8bit",
+            logging_dir="./logs",            # Directory for storing logs
+            save_strategy = "steps",         # Save the model checkpoint every logging step
+            save_steps = 50,                 # Save checkpoints every 50 steps
+            evaluation_strategy = "steps",   # Evaluate the model every logging step
+            eval_steps = 50,                 # Evaluate and save checkpoints every 50 steps
+            do_eval = True,                  # Perform evaluation at the end of training
+            save_total_limit=2,
+            remove_unused_columns = True,
+            report_to = None,                # Comment this out if you don't want to use weights & biases
+            run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
+            fp16=True,   # Set for GPU T4; for a more powerful GPU such as G-100 or another, change to False and use the bf16 parameter
+            bf16=False
+        )
+
+        trainer = SFTTrainer(
+            model=basemodel,
+            train_dataset = dataset,
+            eval_dataset = eval_dataset,
+            peft_config = getLoraConfiguration(),
+            dataset_text_field = "raw_text",
+            max_seq_length = 1024,  #512
+            tokenizer = tokenizer,
+            args = training_args,
+            dataset_kwargs={
+                "add_special_tokens": False,    # We template with special tokens
+                "append_concat_token": False,   # No need to add additional separator token
+            },
+            packing=True
+        )
+        basemodel.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+        trainer.train()
 
 
+        trainer.push_to_hub()
+
 
 def run_training_process():
+    #Loggin to Huggin Face
+    login(token = os.environ.get('HG_FACE_TOKEN'))
+    os.environ['WANDB_DISABLED'] = 'true'
+    tokenizer = loadSpanishTokenizer()
+    medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
+    medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
+
+    # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+    #     getTokenizedDataset( medicalSpanishDataset, tokenizer)
+    # )
+
+
+    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
+
+    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+    base_model = modelLoraConfigBioMistral(base_model)
+
+    configAndRunTraining(base_model,train_dataset, eval_dataset, tokenizer)
+
+def run_finnetuning_process():
     #Loggin to Huggin Face
     login(token = os.environ.get('HG_FACE_TOKEN'))
     os.environ['WANDB_DISABLED'] = 'true'
```
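Because `configAndRunFineTuning` trains with `push_to_hub=True` and `hub_model_id=HUB_MODEL_ID`, a successful run leaves a LoRA adapter (and the tokenizer that was passed to the trainer) in that Hub repo. A minimal sketch of pulling it back for generation, assuming the repo contains the adapter files `SFTTrainer` normally saves; `AutoPeftModelForCausalLM` resolves and downloads the base model recorded in the adapter config:

```python
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
from spanish_medica_llm import HUB_MODEL_ID

model = AutoPeftModelForCausalLM.from_pretrained(HUB_MODEL_ID)
model.config.use_cache = True   # training disabled the KV cache above; re-enable it for generation
tokenizer = AutoTokenizer.from_pretrained(HUB_MODEL_ID)

prompt = "..."   # build this the same way get_chat_format builds its user message
inputs = tokenizer(prompt, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```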
```diff
@@ -554,7 +708,7 @@ def run_training_process():
         getTokenizedDataset( medicalSpanishDataset, tokenizer)
     )
 
-    base_model = loadBaseModel(
-    base_model = modelLoraConfigBioMistral(base_model)
+    base_model = loadBaseModel(HUB_MODEL_ID)
 
-
+    configAndRunFineTuning(base_model,train_dataset, eval_dataset, tokenizer)
+
```
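Both `run_training_process` and `run_finnetuning_process` log in with whatever is in the `HG_FACE_TOKEN` environment variable before touching the Hub, so the new path can also be exercised outside the Gradio UI. A hypothetical driver (not part of this commit; the token value is a placeholder):

```python
# run_finetune_local.py — hypothetical helper, not included in this commit
import os

os.environ["HG_FACE_TOKEN"] = "hf_..."   # placeholder: a write token with access to HUB_MODEL_ID

from spanish_medica_llm import run_finnetuning_process

# Loads HUB_MODEL_ID, fine-tunes it via configAndRunFineTuning and pushes the adapter back.
run_finnetuning_process()
```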