|
|
|
"""Training Llama 3.1.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/19LthnXISqvXgzE-1S2crf-PtTv3OaRmo |
|
|
|
# **MODEL TRAINING**
|
|
|
**Installing dependencies**
|
""" |
|
from unsloth import FastLanguageModel |
|
import torch |
|
max_seq_length = 2048  # maximum context length for training and inference

dtype = None  # None lets Unsloth auto-detect (float16 on T4/V100, bfloat16 on Ampere+)

load_in_4bit = True  # load the base model in 4-bit to reduce memory use
|
|
|
model, tokenizer = FastLanguageModel.from_pretrained( |
|
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct", |
|
max_seq_length = max_seq_length, |
|
dtype = dtype, |
|
load_in_4bit = load_in_4bit, |
|
) |
|
|
|
"""**Definición de los Lora Adapters**""" |
|
|
|
model = FastLanguageModel.get_peft_model( |
|
model, |
|
r = 16, |
|
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", |
|
"gate_proj", "up_proj", "down_proj",], |
|
lora_alpha = 16, |
|
lora_dropout = 0, |
|
bias = "none", |
|
|
|
use_gradient_checkpointing = "unsloth", |
|
random_state = 3407, |
|
use_rslora = False, |
|
loftq_config = None, |
|
) |
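# Optional sanity check (not in the original notebook): PEFT models report how
# many parameters the LoRA adapters actually train, typically well under 1%
# of the base model.
model.print_trainable_parameters()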
|
|
|
"""**Preparación del dataset**""" |
|
|
|
alpaca_prompt = """Below is an instruction that describes a task, with an input that gives more context. Write a response that appropriately completes the request. |
|
|
|
### Instruction: |
|
Below you have a sentence in quotation marks. Provide the syntactic category of each word in the context of the sentence. |
|
|
|
### Sentence: |
|
"{}" |
|
|
|
### Response: |
|
{}""" |
|
|
|
EOS_TOKEN = tokenizer.eos_token  # appended so the model learns when to stop generating

def formatting_prompts_func(examples):
    sentences = examples["sentence"]
    tags = examples["sentence_tagged"]
    texts = []
    for sentence, tag in zip(sentences, tags):
        # Fill the template with the sentence and its gold tags, then append EOS.
        text = alpaca_prompt.format(sentence, tag) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
|
|
|
from datasets import load_dataset |
|
dataset = load_dataset("manupinasco/syntax_analysis") |
|
dataset_train = dataset["train"].map(formatting_prompts_func, batched = True,) |
|
dataset_test = dataset["test"] |
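# Quick check (not in the original notebook): confirm the template rendered
# as expected on the first training example.
print(dataset_train["text"][0])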
|
|
|
"""**Prueba pre-entrenamiento**""" |
|
|
|
FastLanguageModel.for_inference(model)
for sentence in dataset_test["sentence"][:5]:
    # The template already contains the instruction and the quotation marks,
    # so it only takes two values: the sentence and an empty response slot.
    inputs = tokenizer(
        [alpaca_prompt.format(sentence, "")],
        return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    decoded = tokenizer.batch_decode(outputs)[0]
    response = decoded.split("### Response:")[1].replace("<|eot_id|>", "").strip()
    print("INPUT: " + sentence + " /// RESPONSE: " + response)
|
|
|
"""**Testeo pre-entrenamiento**""" |
|
|
|
total = len(dataset_test["sentence"])  # the loop runs over the test split, not the training split
correct = 0
for i, sentence in enumerate(dataset_test["sentence"]):
    inputs = tokenizer(
        [alpaca_prompt.format(sentence, "")],
        return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    decoded = tokenizer.batch_decode(outputs)[0]
    response = decoded.split("### Response:")[1].replace("<|eot_id|>", "").strip()
    if response.lower() == dataset_test["sentence_tagged"][i].lower():
        correct += 1
    print("RESPONSE: " + response)
    print("CORRECT_RESPONSE: " + dataset_test["sentence_tagged"][i])
    print(f"CORRECT RESPONSES SO FAR: {correct}")
    print(f"SENTENCE NUMBER: {i}")

print(f"CORRECT {correct} OUT OF {total}. PERCENTAGE {correct / total * 100:.1f}%")
|
|
|
"""**Entrenamiento del modelo** |
|
|
|
|
|
|
|
* *Epoch*: cantidad de veces que recorre el dataset completo |
|
* *Batch*: cantidad de subgrupos en los que divide al dataset. |
|
* Entrenamiento común: cada vez que se recorre un batch, se updatean los weights. |
|
* Entrenamiento por **gradient accumulation**: para casos donde se cuente con poca memoria. Sirve para ir acumulando el gradiente de las distintas partes del batch de forma tal de no computar el gradiente recién al finalizarlo. |
|
|
|
* En el caso de gradient accumulation, el batch size es = batch size per device x gradient accumulation steps. |
|
|
|
* *Batch size*: partes en las que realmente dividí al conjunto de datos. |
|
|
|
* *Batch size per device*: partes en las que dividí al conjunto de datos para, al terminar de recorrer cada una de estas partes, hacer un update de los weights. |
|
|
|
* *Gradient accumulation steps*: veces que, por cada batch size per device, acumulé las gradientes previo al update de los weights. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
|
from trl import SFTTrainer |
|
from transformers import TrainingArguments |
|
from unsloth import is_bfloat16_supported |
|
|
|
trainer = SFTTrainer( |
|
model = model, |
|
tokenizer = tokenizer, |
|
train_dataset = dataset_train, |
|
dataset_text_field = "text", |
|
max_seq_length = max_seq_length, |
|
dataset_num_proc = 2, |
|
packing = False, |
|
args = TrainingArguments( |
|
        per_device_train_batch_size = 2,  # examples per GPU per forward/backward pass
        gradient_accumulation_steps = 4,  # effective batch size = 2 x 4 = 8
        warmup_steps = 5,
        max_steps = 60,  # short demo run; use num_train_epochs instead for full passes
|
learning_rate = 2e-4, |
|
fp16 = not is_bfloat16_supported(), |
|
bf16 = is_bfloat16_supported(), |
|
logging_steps = 1, |
|
optim = "adamw_8bit", |
|
weight_decay = 0.01, |
|
lr_scheduler_type = "linear", |
|
seed = 3407, |
|
output_dir = "outputs", |
|
), |
|
) |
|
|
|
trainer_stats = trainer.train() |
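# Not in the original notebook: persist the LoRA adapters so the fine-tune is
# not lost when the Colab VM shuts down (the "lora_model" path is an assumption).
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")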
|
|
|
"""# **TESTEO DEL MODELO** |
|
|
|
**Prueba post-entrenamiento** |
|
""" |
|
|
|
FastLanguageModel.for_inference(model)
for sentence in dataset_test["sentence"][:5]:
    # Pass the bare sentence: the template already wraps it in quotation marks.
    inputs = tokenizer(
        [alpaca_prompt.format(sentence, "")],
        return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    decoded = tokenizer.batch_decode(outputs)[0]
    response = decoded.split("### Response:")[1].replace("<|eot_id|>", "").strip()
    print("INPUT: " + sentence + " /// RESPONSE: " + response)
|
|
|
"""**Testeo post-entrenamiento**""" |
|
|
|
total = len(dataset_test["sentence"])  # evaluate over the test split
correct = 0
for i, sentence in enumerate(dataset_test["sentence"]):
    inputs = tokenizer(
        [alpaca_prompt.format(sentence, "")],
        return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    decoded = tokenizer.batch_decode(outputs)[0]
    response = decoded.split("### Response:")[1].replace("<|eot_id|>", "").strip()
    if response.lower() == dataset_test["sentence_tagged"][i].lower():
        correct += 1
    print("RESPONSE: " + response)
    print("CORRECT_RESPONSE: " + dataset_test["sentence_tagged"][i])
    print(f"CORRECT RESPONSES SO FAR: {correct}")
    print(f"SENTENCE NUMBER: {i}")

print(f"CORRECT {correct} OUT OF {total}. PERCENTAGE {correct / total * 100:.1f}%")