# File size: 9,168 Bytes
#
# 5e845b4

# -*- coding: utf-8 -*-
"""Training Llama 3.1.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/19LthnXISqvXgzE-1S2crf-PtTv3OaRmo

# **TRAINING DEL MODELO**

**Instalación de dependencias**
"""

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# !pip install datasets  # Hugging Face datasets library, used below to load the corpus

from unsloth import FastLanguageModel # Normalmente se utiliza transformers, pero esta es una librería que permite finetunear rápidamente modelos de lenguaje
import torch
# Load the weights with 4-bit quantisation to reduce memory usage.
load_in_4bit = True
# No explicit dtype is requested; None is forwarded unchanged to from_pretrained.
dtype = None
# Sequence length requested for the model; the same value is passed to the
# trainer further down. Original author's note: any length may be chosen, the
# library auto-scales when the dataset contains longer sequences.
max_seq_length = 2048

# Fetch the instruction-tuned Llama 3.1 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
)

"""**Definición de los Lora Adapters**"""

# Wrap the base model with LoRA adapters. The notes below are carried over
# (translated) from the original notebook.
model = FastLanguageModel.get_peft_model(
    model,
    # Projection layers that receive an adapter.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # LoRA rank; commonly suggested values are 8, 16, 32, 64 and 128.
    r=16,
    # LoRA alpha: hyperparameter controlling the strength of the adaptation.
    lora_alpha=16,
    # Adapter dropout; any value is accepted, 0 is the optimised setting.
    lora_dropout=0,
    # Bias handling; "none" is the optimised setting.
    bias="none",
    # Rank-stabilised LoRA is left disabled.
    use_rslora=False,
    # No LoftQ configuration is supplied.
    loftq_config=None,
    # True or "unsloth" enables gradient checkpointing; the original note says
    # "unsloth" lowers VRAM usage by about 30% and helps with long contexts.
    use_gradient_checkpointing="unsloth",
    # Fixed seed so adapter initialisation is reproducible.
    random_state=3407,
)

"""**Preparación del dataset**"""

# Prompt template shared by training and evaluation. It has exactly two
# positional placeholders: the sentence (already wrapped in double quotes by
# the template) and the response (left empty when generating).
alpaca_prompt = (
    "Below is an instruction that describes a task, with an input that gives more context. Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "Below you have a sentence in quotation marks. Provide the syntactic category of each word in the context of the sentence.\n"
    "\n"
    "### Sentence:\n"
    '"{}"\n'
    "\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence marker appended to every training example; without it the
# fine-tuned model never learns where a response stops.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset rows.

    Args:
        examples: Batch mapping holding the parallel columns "sentence" and
            "sentence_tagged" (one entry per row).

    Returns:
        A dict with a single "text" key: one string per row, made of the
        prompt template filled with the sentence and its tagged form,
        followed by EOS_TOKEN.

    Raises:
        KeyError: If either expected column is missing from the batch.
    """
    sentences = examples["sentence"]
    tags = examples["sentence_tagged"]
    return {
        "text": [
            alpaca_prompt.format(sentence, tag) + EOS_TOKEN
            for sentence, tag in zip(sentences, tags)
        ],
    }

from datasets import load_dataset
# Download the corpus; it provides a "train" and a "test" split.
dataset = load_dataset("manupinasco/syntax_analysis")

# The test split is kept untouched: the evaluation loops build their own prompts.
dataset_test = dataset["test"]

# The train split gets the extra "text" column produced by formatting_prompts_func.
dataset_train = dataset["train"].map(
    formatting_prompts_func,
    batched=True,
)

"""**Prueba pre-entrenamiento**"""

# Qualitative check before fine-tuning: generate tags for the first five test
# sentences and print them.
FastLanguageModel.for_inference(model)  # switch Unsloth to its inference mode
for sentence in dataset_test["sentence"][:5]:
    # The template has exactly two slots (sentence, response) and already adds
    # the quotation marks, so the raw sentence is passed and the response slot
    # is left empty for generation. This matches the training format.
    prompt = alpaca_prompt.format(sentence, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode only the newly generated tokens. Parsing str(list_of_strings)
    # breaks on quotes and escaped characters in the generated text.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    response = response.replace("\n", "").strip()
    print("INPUT: " + sentence + "/// RESPONSE: " + response)

"""**Testeo pre-entrenamiento**"""

# Exact-match accuracy on the test split before fine-tuning.
# Both columns are read once up front; indexing dataset_test[...] inside the
# loop would rebuild the whole column on every iteration.
test_sentences = dataset_test["sentence"]
expected_tags = dataset_test["sentence_tagged"]

# The denominator is the number of evaluated (test) sentences, not the size of
# the training split.
total = len(test_sentences)
correct = 0
for i, (sentence, expected) in enumerate(zip(test_sentences, expected_tags)):
    # Two-slot template: raw sentence in, empty response for generation.
    prompt = alpaca_prompt.format(sentence, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Keep only the generated continuation, without special tokens.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    response = response.replace("\n", "").strip()

    # Case-insensitive exact match, ignoring surrounding whitespace.
    if response.lower() == expected.strip().lower():
        correct += 1
    print("RESPONSE: " + response)
    print("CORRECT_RESPONSE: " + expected)
    # f-strings: concatenating str with int raises TypeError.
    print(f"CORRECT RESPONSES SO FAR: {correct}")
    print(f"NUMBER OF SENTENCE: {i}")

# Guard against an empty test split.
percentage = (correct / total) * 100 if total else 0.0
print(f"CORRECT {correct} OUT OF {total}. PERCENTAGE {percentage}")

"""**Entrenamiento del modelo**



*   *Epoch*: one complete pass over the training dataset.
*   *Batch*: a subset of the dataset whose examples are processed together.
*   Standard training: the weights are updated after every batch.
*   Training with **gradient accumulation**: meant for situations with little memory. The gradients of several small batches are accumulated and the weights are updated only once afterwards, which emulates a larger batch without having to hold it in memory all at once.

*   With gradient accumulation, the effective batch size = batch size per device x gradient accumulation steps.

*   *Batch size* (effective): number of examples that contribute to each weight update.

*   *Batch size per device*: number of examples a device processes in a single forward/backward pass.

*   *Gradient accumulation steps*: number of consecutive per-device batches whose gradients are accumulated before each weight update.










"""

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# The baseline evaluation above put the model into Unsloth's inference mode via
# FastLanguageModel.for_inference(); switch it back to training mode before
# handing it to the trainer.
FastLanguageModel.for_training(model)

# Supervised fine-tuning on the "text" column built by formatting_prompts_func.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    # Original note: packing can make training up to 5x faster for short
    # sentences; it is left disabled here.
    packing=False,
    args=TrainingArguments(
        # Effective batch size = 2 (per device) x 4 (accumulation steps) = 8.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1,  # would run one full pass over the dataset instead
        max_steps=60,
        learning_rate=2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Run the fine-tuning and keep the value returned by train().
trainer_stats = trainer.train()

"""# **TESTEO DEL MODELO**

**Prueba post-entrenamiento**
"""

# Qualitative check after fine-tuning: generate tags for the first five test
# sentences and print them.
FastLanguageModel.for_inference(model)  # switch Unsloth to its inference mode
for sentence in dataset_test["sentence"][:5]:
    # The template already wraps the sentence in double quotes. Passing the raw
    # sentence reproduces the training format; the earlier ' "..." ' wrapper
    # produced doubled quotes and stray spaces.
    prompt = alpaca_prompt.format(sentence, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode only the newly generated tokens. Parsing str(list_of_strings)
    # breaks on quotes and escaped characters in the generated text.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    response = response.replace("\n", "").strip()
    print("INPUT: " + sentence + "/// RESPONSE: " + response)

"""**Testeo post-entrenamiento**"""

# Exact-match accuracy on the test split after fine-tuning.
# Both columns are read once up front; indexing dataset_test[...] inside the
# loop would rebuild the whole column on every iteration.
test_sentences = dataset_test["sentence"]
expected_tags = dataset_test["sentence_tagged"]

# The denominator is the number of evaluated (test) sentences, not the size of
# the training split.
total = len(test_sentences)
correct = 0
for i, (sentence, expected) in enumerate(zip(test_sentences, expected_tags)):
    # Raw sentence in, empty response for generation: same format as training.
    prompt = alpaca_prompt.format(sentence, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Keep only the generated continuation, without special tokens.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    response = response.replace("\n", "").strip()

    # Case-insensitive exact match, ignoring surrounding whitespace.
    if response.lower() == expected.strip().lower():
        correct += 1
    print("RESPONSE: " + response)
    print("CORRECT_RESPONSE: " + expected)
    # f-strings: concatenating str with int raises TypeError.
    print(f"CORRECT RESPONSES SO FAR: {correct}")
    print(f"NUMBER OF SENTENCE: {i}")

# Guard against an empty test split.
percentage = (correct / total) * 100 if total else 0.0
print(f"CORRECT {correct} OUT OF {total}. PERCENTAGE {percentage}")