inoid committed
Commit a668eef
Parent(s): c704363

Add all references for training the model with the BioMistral process

Files changed (3):
  1. app.py +33 -32
  2. requirements.txt +8 -2
  3. spanish_medica_llm.py +303 -1
app.py CHANGED
@@ -9,7 +9,7 @@ import sys
 
 import torch
 
-from spanish_medica_llm import run_training
+from spanish_medica_llm import run_training, run_training_process
 
 import gradio as gr
 
@@ -42,37 +42,38 @@ def train_model(*inputs):
     if "IS_SHARED_UI" in os.environ:
         raise gr.Error("This Space only works in duplicated instances")
 
-    args_general = argparse.Namespace(
-        image_captions_filename = True,
-        train_text_encoder = True,
-        #stop_text_encoder_training = stptxt,
-        save_n_steps = 0,
-        #pretrained_model_name_or_path = model_to_load,
-        instance_data_dir="instance_images",
-        #class_data_dir=class_data_dir,
-        output_dir="output_model",
-        instance_prompt="",
-        seed=42,
-        resolution=512,
-        mixed_precision="fp16",
-        train_batch_size=1,
-        gradient_accumulation_steps=1,
-        use_8bit_adam=True,
-        learning_rate=2e-6,
-        lr_scheduler="polynomial",
-        lr_warmup_steps = 0,
-        #max_train_steps=Training_Steps,
-    )
-    run_training(args_general)
-    torch.cuda.empty_cache()
-    #convert("output_model", "model.ckpt")
-    #shutil.rmtree('instance_images')
-    #shutil.make_archive("diffusers_model", 'zip', "output_model")
-    #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-    #    zipdir('output_model/', zipf)
-    torch.cuda.empty_cache()
-    return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-
+    # args_general = argparse.Namespace(
+    #     image_captions_filename = True,
+    #     train_text_encoder = True,
+    #     #stop_text_encoder_training = stptxt,
+    #     save_n_steps = 0,
+    #     #pretrained_model_name_or_path = model_to_load,
+    #     instance_data_dir="instance_images",
+    #     #class_data_dir=class_data_dir,
+    #     output_dir="output_model",
+    #     instance_prompt="",
+    #     seed=42,
+    #     resolution=512,
+    #     mixed_precision="fp16",
+    #     train_batch_size=1,
+    #     gradient_accumulation_steps=1,
+    #     use_8bit_adam=True,
+    #     learning_rate=2e-6,
+    #     lr_scheduler="polynomial",
+    #     lr_warmup_steps = 0,
+    #     #max_train_steps=Training_Steps,
+    # )
+    # run_training(args_general)
+    # torch.cuda.empty_cache()
+    # #convert("output_model", "model.ckpt")
+    # #shutil.rmtree('instance_images')
+    # #shutil.make_archive("diffusers_model", 'zip', "output_model")
+    # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
+    # #    zipdir('output_model/', zipf)
+    # torch.cuda.empty_cache()
+    # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
+    run_training_process()
+    return f"Train Model Successful!!!"
 def stop_model(*input):
     return f"Model with Gradio!"
 
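With this change, train_model no longer builds an argparse.Namespace; it simply calls run_training_process() and returns a status string. Since the Gradio UI portion of app.py is not shown in this diff, the following is only a minimal sketch of how such a handler could be wired to a button; the component names (train_btn, status_box) and the Blocks layout are assumptions, not part of the commit.

import os
import gradio as gr
from spanish_medica_llm import run_training_process

def train_model(*inputs):
    # Same guard as in the diff above: refuse to train in the shared Space.
    if "IS_SHARED_UI" in os.environ:
        raise gr.Error("This Space only works in duplicated instances")
    run_training_process()          # runs the full BioMistral fine-tuning pipeline
    return "Train Model Successful!!!"

# Hypothetical wiring; the real app.py defines its own layout.
with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Status")
    train_btn = gr.Button("Train")
    train_btn.click(train_model, outputs=status_box)

if __name__ == "__main__":
    demo.launch()
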
requirements.txt CHANGED
@@ -1,2 +1,8 @@
-transformers
-torch
+transformers==4.38.0
+torch>=2.1.1+cu113
+trl @ git+https://github.com/huggingface/trl
+peft
+wandb
+accelerate
+datasets
+bitsandbytes
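The pinned stack above (transformers, trl, peft, accelerate, bitsandbytes, datasets, wandb) matches what spanish_medica_llm.py imports. As a quick, optional sanity check, not part of the commit, the resolved versions in the Space's environment can be printed like this:

import importlib.metadata as md

# Report the installed version of each dependency listed in requirements.txt
for pkg in ["transformers", "torch", "trl", "peft",
            "wandb", "accelerate", "datasets", "bitsandbytes"]:
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg} is NOT installed")
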
spanish_medica_llm.py CHANGED
@@ -6,8 +6,60 @@ from pathlib import Path
 from typing import Optional
 import subprocess
 import sys
+from datetime import datetime
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
 import torch
-import transformers
+from datasets import load_dataset, concatenate_datasets
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+
+from accelerate import FullyShardedDataParallelPlugin, Accelerator
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+import wandb
+from trl import SFTTrainer
+
+
+CHAT_ML_TEMPLATE_Mistral_7B_Instruct = """
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = messages[0]['content'].strip() + '\n\n' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{{ bos_token }}
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {% set content = system_message + message['content'] %}
+    {% else %}
+        {% set content = message['content'] %}
+    {% endif %}
+
+    {% if message['role'] == 'user' %}
+        {{ '[INST] ' + content.strip() + ' [/INST]' }}
+    {% elif message['role'] == 'assistant' %}
+        {{ ' ' + content.strip() + ' ' + eos_token }}
+    {% endif %}
+{% endfor %}
+"""
+
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
@@ -248,3 +300,253 @@ def run_training(args_imported):
     args_default = parse_args()
     #args = merge_args(args_default, args_imported)
     return(args)
+
+
+
+TOKEN_NAME = "DeepESP/gpt2-spanish-medium"
+TOKEN_MISTRAL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
+SPANISH_MEDICA_LLM_DATASET = "somosnlp/spanish_medica_llm"
+
+TOPIC_TYPE_DIAGNOSTIC = 'medical_diagnostic'
+TOPIC_TYPE_TRATAMIENT = 'medical_topic'
+FILTER_CRITERIA = [TOPIC_TYPE_DIAGNOSTIC, TOPIC_TYPE_TRATAMIENT]
+CONTEXT_LENGTH = 256  # Maximum number of tokens per training sample
+
+MISTRAL_BASE_MODEL_ID = "BioMistral/BioMistral-7B"
+
+MICRO_BATCH_SIZE = 16  # 32 for GPUs bigger than a T4
+BATCH_SIZE = 64        # 128 for GPUs bigger than a T4
+GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
+
+PROJECT_NAME = "spanish-medica-llm"
+BASE_MODEL_NAME = "biomistral"
+run_name = BASE_MODEL_NAME + "-" + PROJECT_NAME
+output_dir = "./" + run_name
+
+HUB_MODEL_ID = 'somosnlp/spanish_medica_llm'
+MAX_TRAINING_STEPS = int(1500/2)
+
+def loadSpanishTokenizer():
+    """
+    Load a Spanish tokenizer whose special tokens match the Mistral tokenizer,
+    and attach the Mistral-7B-Instruct chat template.
+    """
+    # First, load the tokenizer used by Mistral
+    tokenizerMistrall = AutoTokenizer.from_pretrained(TOKEN_MISTRAL_NAME)
+
+    # Then load a Spanish-specialized tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        TOKEN_NAME,
+        eos_token = tokenizerMistrall.special_tokens_map['eos_token'],
+        bos_token = tokenizerMistrall.special_tokens_map['bos_token'],
+        unk_token = tokenizerMistrall.special_tokens_map['unk_token']
+    )
+    tokenizer.chat_template = CHAT_ML_TEMPLATE_Mistral_7B_Instruct
+
+    return tokenizer
+
+def tokenize(element, tokenizer):
+    outputs = tokenizer(
+        element["raw_text"],
+        truncation = True,
+        max_length = CONTEXT_LENGTH,
+        return_overflowing_tokens = True,
+        return_length = True,
+    )
+    input_batch = []
+    # Keep only the chunks that fill the full context window
+    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
+        if length == CONTEXT_LENGTH:
+            input_batch.append(input_ids)
+    return {"input_ids": input_batch}
+
+def splitDatasetInTestValid(dataset):
+    """
+    Split the 'test' part of the dataset into evaluation and test sets and
+    return (train, eval, test).
+    """
+    if dataset is None or dataset['train'] is None:
+        return dataset
+    elif dataset['test'] is None:
+        return None
+    else:
+        test_eval = dataset['test'].train_test_split(test_size=0.001)
+        eval_dataset = test_eval['train']
+        test_dataset = test_eval['test']
+
+        return (dataset['train'], eval_dataset, test_dataset)
+
+def loadSpanishDataset():
+    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
+    return spanishMedicaLllmDataset
+
+## See the Jupyter notebook for changing the CONTEXT_LENGTH size
+
+def accelerateConfigModel():
+    """
+    Only with GPU support; otherwise raises
+    RuntimeError: There are currently no available devices found, must be one of 'XPU', 'CUDA', or 'NPU'.
+    """
+    fsdp_plugin = FullyShardedDataParallelPlugin(
+        state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
+        optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
+    )
+
+    return Accelerator(fsdp_plugin=fsdp_plugin)
+
+def getTokenizedDataset(dataset, tokenizer):
+    if dataset is None or tokenizer is None:
+        return dataset
+
+    return dataset.map(
+        lambda element: tokenize(element, tokenizer),
+        batched = True,
+        remove_columns = dataset["train"].column_names
+    )
+
+def loadBaseModel(base_model_id):
+
+    if base_model_id in ["", None]:
+        return None
+    else:
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit = True,
+            bnb_4bit_quant_type = "nf4",
+            bnb_4bit_use_double_quant = True,
+            bnb_4bit_compute_dtype = torch.bfloat16
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            base_model_id,
+            quantization_config = bnb_config
+        )
+
+        model.gradient_checkpointing_enable()
+        model = prepare_model_for_kbit_training(model)
+
+        return model
+
+def print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+
+def modelLoraConfigBioMistral(model):
+    """
+    r is the rank of the low-rank matrices used in the adapters, which controls
+    the number of parameters trained. A higher rank allows more expressivity, but there is
+    a compute tradeoff.
+    alpha is the scaling factor for the learned weights. The weight matrix is scaled by
+    alpha/r, so a higher value for alpha assigns more weight to the LoRA activations.
+    The values used in the QLoRA paper were r=64 and lora_alpha=16,
+    and these are said to generalize well, but we use r=8 and lora_alpha=16 to put more
+    emphasis on the new fine-tuned data while also reducing computational complexity.
+    """
+    if model is None:
+        return model
+    else:
+        config = LoraConfig(
+            r=8,
+            lora_alpha=16,
+            target_modules=[
+                "q_proj",
+                "k_proj",
+                "v_proj",
+                "o_proj",
+                "gate_proj",
+                "up_proj",
+                "down_proj",
+                "lm_head",
+            ],
+            bias="none",
+            lora_dropout=0.05,  # Conventional
+            task_type="CAUSAL_LM",
+        )
+
+        model = get_peft_model(model, config)
+        print_trainable_parameters(model)
+
+        accelerator = accelerateConfigModel()
+        # Apply the accelerator. You can comment this out to remove the accelerator.
+        model = accelerator.prepare_model(model)
+        return (model)
+
+
+# A note on training. You can set max_steps high initially and examine at what step your
+# model's performance starts to degrade. That is where you'll find the sweet spot for how many
+# steps to perform. For example, say you start with 1000 steps and find that at around 500 steps
+# the model starts overfitting: the validation loss goes up (bad) while the training
+# loss goes down significantly, meaning the model is learning the training set really well
+# but is unable to generalize to new datapoints. Therefore, 500 steps would be your sweet spot,
+# so you would use the checkpoint-500 model repo in your output dir (biomistral-medqa-finetune)
+# as your final model in step 6 below.
+
+
+def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
+    if basemodel is None or dataset is None or tokenizer is None:
+        return None
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+        data_collator_pretrain = DataCollatorForLanguageModeling(tokenizer, mlm = False)
+
+        training_args = TrainingArguments(
+            output_dir = output_dir,
+            push_to_hub = True,
+            hub_private_repo = False,
+            hub_model_id = HUB_MODEL_ID,
+            warmup_steps = 5,
+            per_device_train_batch_size = MICRO_BATCH_SIZE,
+            per_device_eval_batch_size = 1,
+            #gradient_checkpointing=True,
+            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
+            max_steps = MAX_TRAINING_STEPS,
+            learning_rate = 2.5e-5,          # About 10x smaller than the Mistral learning rate
+            logging_steps = 50,
+            optim = "paged_adamw_8bit",
+            logging_dir = "./logs",          # Directory for storing logs
+            save_strategy = "steps",         # Save a model checkpoint every save_steps
+            save_steps = 50,                 # Save checkpoints every 50 steps
+            evaluation_strategy = "steps",   # Evaluate the model every eval_steps
+            eval_steps = 50,                 # Evaluate every 50 steps
+            do_eval = True,                  # Perform evaluation at the end of training
+            #report_to="wandb",              # Comment this out if you don't want to use Weights & Biases
+            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
+            fp16 = True,   # Set for a T4 GPU; for a more powerful GPU set this to False and use bf16 instead
+            bf16 = False
+        )
+
+        trainer = Trainer(
+            model = basemodel,
+            train_dataset = dataset,
+            eval_dataset = eval_dataset,
+            args = training_args,
+            data_collator = data_collator_pretrain
+        )
+
+        basemodel.config.use_cache = False  # Silence the warnings. Please re-enable for inference!
+        trainer.train()
+
+        trainer.push_to_hub()
+
+
+def run_training_process():
+
+    tokenizer = loadSpanishTokenizer()
+    medicalSpanishDataset = loadSpanishDataset()
+    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+        getTokenizedDataset(medicalSpanishDataset, tokenizer)
+    )
+
+    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+    base_model = modelLoraConfigBioMistral(base_model)
+
+    configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
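
For readers checking the hyperparameters above: with MICRO_BATCH_SIZE = 16 and BATCH_SIZE = 64, GRADIENT_ACCUMULATION_STEPS comes out to 4, so on a single GPU each optimizer step still sees 64 samples, and the LoRA update in modelLoraConfigBioMistral is scaled by lora_alpha / r = 16 / 8 = 2. The short sketch below just reproduces this arithmetic under a single-GPU assumption; it is an illustration, not part of the commit.

# Effective batch size per optimizer step (single GPU assumed)
MICRO_BATCH_SIZE = 16
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE    # -> 4
effective_batch = MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
assert effective_batch == BATCH_SIZE                            # 16 * 4 == 64

# LoRA scaling applied to the adapter weights (see modelLoraConfigBioMistral)
r, lora_alpha = 8, 16
lora_scaling = lora_alpha / r                                   # -> 2.0
print(effective_batch, lora_scaling)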