inoid committed
Commit a668eef
Parent(s): c704363

Add all references for training the model with the BioMistral process

Files changed (3):
  1. app.py +33 -32
  2. requirements.txt +8 -2
  3. spanish_medica_llm.py +303 -1
app.py CHANGED
@@ -9,7 +9,7 @@ import sys
 
 import torch
 
-from spanish_medica_llm import run_training
+from spanish_medica_llm import run_training, run_training_process
 
 import gradio as gr
 
@@ -42,37 +42,38 @@ def train_model(*inputs):
     if "IS_SHARED_UI" in os.environ:
         raise gr.Error("This Space only works in duplicated instances")
 
-    args_general = argparse.Namespace(
-        image_captions_filename = True,
-        train_text_encoder = True,
-        #stop_text_encoder_training = stptxt,
-        save_n_steps = 0,
-        #pretrained_model_name_or_path = model_to_load,
-        instance_data_dir="instance_images",
-        #class_data_dir=class_data_dir,
-        output_dir="output_model",
-        instance_prompt="",
-        seed=42,
-        resolution=512,
-        mixed_precision="fp16",
-        train_batch_size=1,
-        gradient_accumulation_steps=1,
-        use_8bit_adam=True,
-        learning_rate=2e-6,
-        lr_scheduler="polynomial",
-        lr_warmup_steps = 0,
-        #max_train_steps=Training_Steps,
-    )
-    run_training(args_general)
-    torch.cuda.empty_cache()
-    #convert("output_model", "model.ckpt")
-    #shutil.rmtree('instance_images')
-    #shutil.make_archive("diffusers_model", 'zip', "output_model")
-    #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-    #    zipdir('output_model/', zipf)
-    torch.cuda.empty_cache()
-    return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-
+    # args_general = argparse.Namespace(
+    #     image_captions_filename = True,
+    #     train_text_encoder = True,
+    #     #stop_text_encoder_training = stptxt,
+    #     save_n_steps = 0,
+    #     #pretrained_model_name_or_path = model_to_load,
+    #     instance_data_dir="instance_images",
+    #     #class_data_dir=class_data_dir,
+    #     output_dir="output_model",
+    #     instance_prompt="",
+    #     seed=42,
+    #     resolution=512,
+    #     mixed_precision="fp16",
+    #     train_batch_size=1,
+    #     gradient_accumulation_steps=1,
+    #     use_8bit_adam=True,
+    #     learning_rate=2e-6,
+    #     lr_scheduler="polynomial",
+    #     lr_warmup_steps = 0,
+    #     #max_train_steps=Training_Steps,
+    # )
+    # run_training(args_general)
+    # torch.cuda.empty_cache()
+    # #convert("output_model", "model.ckpt")
+    # #shutil.rmtree('instance_images')
+    # #shutil.make_archive("diffusers_model", 'zip', "output_model")
+    # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
+    # #    zipdir('output_model/', zipf)
+    # torch.cuda.empty_cache()
+    # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
+    run_training_process()
+    return f"Train Model Successful!!!"
 def stop_model(*input):
     return f"Model with Gradio!"
 
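With this change, train_model no longer builds an argparse.Namespace; it simply calls run_training_process() and returns a status string. Since the Gradio UI portion of app.py is not shown in this diff, the following is only a minimal sketch of how such a handler could be wired to a button; the component names (train_btn, status_box) and the Blocks layout are assumptions, not part of the commit.

import os
import gradio as gr
from spanish_medica_llm import run_training_process

def train_model(*inputs):
    # Same guard as in the diff above: refuse to train in the shared Space.
    if "IS_SHARED_UI" in os.environ:
        raise gr.Error("This Space only works in duplicated instances")
    run_training_process()          # runs the full BioMistral fine-tuning pipeline
    return "Train Model Successful!!!"

# Hypothetical wiring; the real app.py defines its own layout.
with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Status")
    train_btn = gr.Button("Train")
    train_btn.click(train_model, outputs=status_box)

if __name__ == "__main__":
    demo.launch()
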
requirements.txt CHANGED
@@ -1,2 +1,8 @@
-transformers
-torch
+transformers==4.38.0
+torch>=2.1.1+cu113
+trl @ git+https://github.com/huggingface/trl
+peft
+wandb
+accelerate
+datasets
+bitsandbytes
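The pinned stack above (transformers, trl, peft, accelerate, bitsandbytes, datasets, wandb) matches what spanish_medica_llm.py imports. As a quick, optional sanity check, not part of the commit, the resolved versions in the Space's environment can be printed like this:

import importlib.metadata as md

# Report the installed version of each dependency listed in requirements.txt
for pkg in ["transformers", "torch", "trl", "peft",
            "wandb", "accelerate", "datasets", "bitsandbytes"]:
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg} is NOT installed")
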
spanish_medica_llm.py CHANGED
@@ -6,8 +6,60 @@ from pathlib import Path
 from typing import Optional
 import subprocess
 import sys
+from datetime import datetime
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
 import torch
-import transformers
+from datasets import load_dataset, concatenate_datasets
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+
+from accelerate import FullyShardedDataParallelPlugin, Accelerator
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+import wandb
+from trl import SFTTrainer
+
+
+CHAT_ML_TEMPLATE_Mistral_7B_Instruct = """
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = messages[0]['content'].strip() + '\n\n' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{{ bos_token }}
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {% set content = system_message + message['content'] %}
+    {% else %}
+        {% set content = message['content'] %}
+    {% endif %}
+
+    {% if message['role'] == 'user' %}
+        {{ '[INST] ' + content.strip() + ' [/INST]' }}
+    {% elif message['role'] == 'assistant' %}
+        {{ ' ' + content.strip() + ' ' + eos_token }}
+    {% endif %}
+{% endfor %}
+"""
+
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
@@ -248,3 +300,253 @@ def run_training(args_imported):
     args_default = parse_args()
     #args = merge_args(args_default, args_imported)
     return(args)
+
+
+
+TOKEN_NAME = "DeepESP/gpt2-spanish-medium"
+TOKEN_MISTRAL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
+SPANISH_MEDICA_LLM_DATASET = "somosnlp/spanish_medica_llm"
+
+TOPIC_TYPE_DIAGNOSTIC = 'medical_diagnostic'
+TOPIC_TYPE_TRATAMIENT = 'medical_topic'
+FILTER_CRITERIA = [TOPIC_TYPE_DIAGNOSTIC, TOPIC_TYPE_TRATAMIENT]
+CONTEXT_LENGTH = 256  # Maximum number of tokens per training sample
+
+MISTRAL_BASE_MODEL_ID = "BioMistral/BioMistral-7B"
+
+MICRO_BATCH_SIZE = 16  # 32 for GPUs bigger than a T4
+BATCH_SIZE = 64        # 128 for GPUs bigger than a T4
+GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
+
+PROJECT_NAME = "spanish-medica-llm"
+BASE_MODEL_NAME = "biomistral"
+run_name = BASE_MODEL_NAME + "-" + PROJECT_NAME
+output_dir = "./" + run_name
+
+HUB_MODEL_ID = 'somosnlp/spanish_medica_llm'
+MAX_TRAINING_STEPS = int(1500/2)
+
+def loadSpanishTokenizer():
+    """
+    Load a Spanish tokenizer whose special tokens match the Mistral tokenizer,
+    and attach the Mistral-7B-Instruct chat template.
+    """
+    # First, load the tokenizer used by Mistral
+    tokenizerMistrall = AutoTokenizer.from_pretrained(TOKEN_MISTRAL_NAME)
+
+    # Then load a Spanish-specialized tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        TOKEN_NAME,
+        eos_token = tokenizerMistrall.special_tokens_map['eos_token'],
+        bos_token = tokenizerMistrall.special_tokens_map['bos_token'],
+        unk_token = tokenizerMistrall.special_tokens_map['unk_token']
+    )
+    tokenizer.chat_template = CHAT_ML_TEMPLATE_Mistral_7B_Instruct
+
+    return tokenizer
+
+def tokenize(element, tokenizer):
+    outputs = tokenizer(
+        element["raw_text"],
+        truncation = True,
+        max_length = CONTEXT_LENGTH,
+        return_overflowing_tokens = True,
+        return_length = True,
+    )
+    input_batch = []
+    # Keep only the chunks that fill the full context window
+    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
+        if length == CONTEXT_LENGTH:
+            input_batch.append(input_ids)
+    return {"input_ids": input_batch}
+
+def splitDatasetInTestValid(dataset):
+    """
+    Split the 'test' part of the dataset into evaluation and test sets and
+    return (train, eval, test).
+    """
+    if dataset is None or dataset['train'] is None:
+        return dataset
+    elif dataset['test'] is None:
+        return None
+    else:
+        test_eval = dataset['test'].train_test_split(test_size=0.001)
+        eval_dataset = test_eval['train']
+        test_dataset = test_eval['test']
+
+        return (dataset['train'], eval_dataset, test_dataset)
+
+def loadSpanishDataset():
+    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
+    return spanishMedicaLllmDataset
+
+## See the Jupyter notebook for changing the CONTEXT_LENGTH size
+
+def accelerateConfigModel():
+    """
+    Only with GPU support; otherwise raises
+    RuntimeError: There are currently no available devices found, must be one of 'XPU', 'CUDA', or 'NPU'.
+    """
+    fsdp_plugin = FullyShardedDataParallelPlugin(
+        state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
+        optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
+    )
+
+    return Accelerator(fsdp_plugin=fsdp_plugin)
+
+def getTokenizedDataset(dataset, tokenizer):
+    if dataset is None or tokenizer is None:
+        return dataset
+
+    return dataset.map(
+        lambda element: tokenize(element, tokenizer),
+        batched = True,
+        remove_columns = dataset["train"].column_names
+    )
+
+def loadBaseModel(base_model_id):
+
+    if base_model_id in ["", None]:
+        return None
+    else:
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit = True,
+            bnb_4bit_quant_type = "nf4",
+            bnb_4bit_use_double_quant = True,
+            bnb_4bit_compute_dtype = torch.bfloat16
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            base_model_id,
+            quantization_config = bnb_config
+        )
+
+        model.gradient_checkpointing_enable()
+        model = prepare_model_for_kbit_training(model)
+
+        return model
+
+def print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+
+def modelLoraConfigBioMistral(model):
+    """
+    r is the rank of the low-rank matrices used in the adapters, which controls
+    the number of parameters trained. A higher rank allows more expressivity, but there is
+    a compute tradeoff.
+    alpha is the scaling factor for the learned weights. The weight matrix is scaled by
+    alpha/r, so a higher value for alpha assigns more weight to the LoRA activations.
+    The values used in the QLoRA paper were r=64 and lora_alpha=16,
+    and these are said to generalize well, but we use r=8 and lora_alpha=16 to put more
+    emphasis on the new fine-tuned data while also reducing computational complexity.
+    """
+    if model is None:
+        return model
+    else:
+        config = LoraConfig(
+            r=8,
+            lora_alpha=16,
+            target_modules=[
+                "q_proj",
+                "k_proj",
+                "v_proj",
+                "o_proj",
+                "gate_proj",
+                "up_proj",
+                "down_proj",
+                "lm_head",
+            ],
+            bias="none",
+            lora_dropout=0.05,  # Conventional
+            task_type="CAUSAL_LM",
+        )
+
+        model = get_peft_model(model, config)
+        print_trainable_parameters(model)
+
+        accelerator = accelerateConfigModel()
+        # Apply the accelerator. You can comment this out to remove the accelerator.
+        model = accelerator.prepare_model(model)
+        return (model)
+
+
+# A note on training. You can set max_steps high initially and examine at what step your
+# model's performance starts to degrade. That is where you'll find the sweet spot for how many
+# steps to perform. For example, say you start with 1000 steps and find that at around 500 steps
+# the model starts overfitting: the validation loss goes up (bad) while the training
+# loss goes down significantly, meaning the model is learning the training set really well
+# but is unable to generalize to new datapoints. Therefore, 500 steps would be your sweet spot,
+# so you would use the checkpoint-500 model repo in your output dir (biomistral-medqa-finetune)
+# as your final model in step 6 below.
+
+
+def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
+    if basemodel is None or dataset is None or tokenizer is None:
+        return None
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+        data_collator_pretrain = DataCollatorForLanguageModeling(tokenizer, mlm = False)
+
+        training_args = TrainingArguments(
+            output_dir = output_dir,
+            push_to_hub = True,
+            hub_private_repo = False,
+            hub_model_id = HUB_MODEL_ID,
+            warmup_steps = 5,
+            per_device_train_batch_size = MICRO_BATCH_SIZE,
+            per_device_eval_batch_size = 1,
+            #gradient_checkpointing=True,
+            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
+            max_steps = MAX_TRAINING_STEPS,
+            learning_rate = 2.5e-5,          # About 10x smaller than the Mistral learning rate
+            logging_steps = 50,
+            optim = "paged_adamw_8bit",
+            logging_dir = "./logs",          # Directory for storing logs
+            save_strategy = "steps",         # Save a model checkpoint every save_steps
+            save_steps = 50,                 # Save checkpoints every 50 steps
+            evaluation_strategy = "steps",   # Evaluate the model every eval_steps
+            eval_steps = 50,                 # Evaluate every 50 steps
+            do_eval = True,                  # Perform evaluation at the end of training
+            #report_to="wandb",              # Comment this out if you don't want to use Weights & Biases
+            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
+            fp16 = True,   # Set for a T4 GPU; for a more powerful GPU set this to False and use bf16 instead
+            bf16 = False
+        )
+
+        trainer = Trainer(
+            model = basemodel,
+            train_dataset = dataset,
+            eval_dataset = eval_dataset,
+            args = training_args,
+            data_collator = data_collator_pretrain
+        )
+
+        basemodel.config.use_cache = False  # Silence the warnings. Please re-enable for inference!
+        trainer.train()
+
+        trainer.push_to_hub()
+
+
+def run_training_process():
+
+    tokenizer = loadSpanishTokenizer()
+    medicalSpanishDataset = loadSpanishDataset()
+    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+        getTokenizedDataset(medicalSpanishDataset, tokenizer)
+    )
+
+    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+    base_model = modelLoraConfigBioMistral(base_model)
+
+    configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
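
For readers checking the hyperparameters above: with MICRO_BATCH_SIZE = 16 and BATCH_SIZE = 64, GRADIENT_ACCUMULATION_STEPS comes out to 4, so on a single GPU each optimizer step still sees 64 samples, and the LoRA update in modelLoraConfigBioMistral is scaled by lora_alpha / r = 16 / 8 = 2. The short sketch below just reproduces this arithmetic under a single-GPU assumption; it is an illustration, not part of the commit.

# Effective batch size per optimizer step (single GPU assumed)
MICRO_BATCH_SIZE = 16
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE    # -> 4
effective_batch = MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
assert effective_batch == BATCH_SIZE                            # 16 * 4 == 64

# LoRA scaling applied to the adapter weights (see modelLoraConfigBioMistral)
r, lora_alpha = 8, 16
lora_scaling = lora_alpha / r                                   # -> 2.0
print(effective_batch, lora_scaling)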