import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    StoppingCriteria,
    StoppingCriteriaList,
)
from peft import PeftModel
# !python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
#sft_model = "somosnlp/gemma-FULL-RAC-Colombia_v2"
#sft_model = "somosnlp/RecetasDeLaAbuela_mistral-7b-instruct-v0.2-bnb-4bit"
#base_model_name = "unsloth/Mistral-7B-Instruct-v0.2"
sft_model1 = "somosnlp/RecetasDeLaAbuela_gemma-2b-it-bnb-4bit"
sft_model2 = "somosnlp/RecetasDeLaAbuela_mistral-7b-instruct-v0.2-bnb-4bit"
base_model_name = "unsloth/gemma-2b-it-bnb-4bit"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
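# Note: bnb_config is defined but never passed to from_pretrained() below, so the base
# model is actually loaded in float16 rather than 4-bit NF4. A possible way to apply it
# (untested sketch, kept commented out to preserve the current behaviour):
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )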
max_seq_length=400
# if torch.cuda.get_device_capability()[0] >= 8:
# # print("Flash Attention")
# attn_implementation="flash_attention_2"
# else:
# attn_implementation=None
attn_implementation=None
#base_model = AutoModelForCausalLM.from_pretrained(model_name,return_dict=True,torch_dtype=torch.float16,)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, return_dict=True, device_map="auto", torch_dtype=torch.float16)
#base_model = AutoModelForCausalLM.from_pretrained(base_model_name, return_dict=True, device_map = {"":0}, attn_implementation = attn_implementation,).eval()
tokenizer = AutoTokenizer.from_pretrained(base_model_name, model_max_length=max_seq_length)
# Load the default adapter, merge it into the base model, and cache the result locally
sft_model = sft_model1
ft_model = PeftModel.from_pretrained(base_model, sft_model)
model = ft_model.merge_and_unload()
model.save_pretrained(".")
#model.to('cuda')
tokenizer.save_pretrained(".")
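# Caveat: merge_and_unload() folds the adapter weights into the base model it wraps, so
# reusing the same base_model object for a second adapter later (as generate_text does)
# likely stacks adapters rather than swapping them. A more defensive approach would
# reload the base model before applying a different adapter (hypothetical helper,
# commented out, not part of the original app):
# def load_adapter(adapter_name):
#     fresh_base = AutoModelForCausalLM.from_pretrained(base_model_name, return_dict=True, device_map="auto", torch_dtype=torch.float16)
#     return PeftModel.from_pretrained(fresh_base, adapter_name).merge_and_unload()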
class ListOfTokensStoppingCriteria(StoppingCriteria):
    """
    Stopping criterion that halts generation when any of a list of specific token sequences is produced.
    """
    def __init__(self, tokenizer, stop_tokens):
        self.tokenizer = tokenizer
        # Encode each stop token and keep its ID sequence
        self.stop_token_ids_list = [tokenizer.encode(stop_token, add_special_tokens=False) for stop_token in stop_tokens]

    def __call__(self, input_ids, scores, **kwargs):
        # Check whether the most recently generated tokens match any of the stop sequences
        for stop_token_ids in self.stop_token_ids_list:
            len_stop_tokens = len(stop_token_ids)
            if len(input_ids[0]) >= len_stop_tokens:
                if input_ids[0, -len_stop_tokens:].tolist() == stop_token_ids:
                    return True
        return False
# Configure the custom stopping criterion
stop_tokens = ["<end_of_turn>"]  # List of stop tokens
# Initialize the stopping criterion with the tokenizer and the stop-token list
stopping_criteria = ListOfTokensStoppingCriteria(tokenizer, stop_tokens)
# Wrap it in a StoppingCriteriaList for model.generate()
stopping_criteria_list = StoppingCriteriaList([stopping_criteria])
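# Quick sanity check of the stopping criterion (illustrative sketch, commented out so it
# does not run when the Space starts; assumes the Gemma tokenizer encodes <end_of_turn>
# as a single special token):
# _probe = tokenizer.encode("hola<end_of_turn>", add_special_tokens=False, return_tensors="pt")
# assert stopping_criteria(_probe, None) is True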
def generate_text(modelin, prompt, context, max_length=2100):
    global sft_model, ft_model, model
    print('Model is: ' + modelin)
    # If a different adapter was selected in the UI, merge it into the base model
    if modelin != sft_model:
        sft_model = modelin
        ft_model = PeftModel.from_pretrained(base_model, sft_model)
        model = ft_model.merge_and_unload()
    prompt = prompt.replace("\n", "").replace("¿", "").replace("?", "")
    input_text = f'''<bos><start_of_turn>system ¿{context}?<end_of_turn><start_of_turn>user ¿{prompt}?<end_of_turn><start_of_turn>model'''
    inputs = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens=False).to(model.device)
    max_new_tokens = max_length
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        temperature=0.32,
        #top_p=0.9,
        top_k=50,  # 45
        repetition_penalty=1.04,  # 1.1
        do_sample=True,
    )
    outputs = model.generate(generation_config=generation_config, input_ids=inputs, stopping_criteria=stopping_criteria_list)
    return tokenizer.decode(outputs[0], skip_special_tokens=False)  # True
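# Example of calling generate_text() directly (illustrative only, commented out so it
# does not run at startup; the question below is a hypothetical test input):
# print(generate_text(sft_model1,
#                     "Dime la receta del gazpacho",
#                     "You are a helpful AI assistant. Eres un experto cocinero hispanoamericano.",
#                     max_length=200))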
def mostrar_respuesta(modelo, pregunta, contexto):
    try:
        print('Model: ' + str(modelo))
        print('Question: ' + str(pregunta))
        print('Context: ' + str(contexto))
        res = generate_text(modelo, pregunta, contexto, max_length=500)
        print('Answer: ' + str(res))
        return str(res)
    except Exception as e:
        return str(e)
# Example questions
mis_ejemplos = [
    ["¿Dime la receta de la tortilla de patatas?"],
    ["¿Dime la receta del ceviche?"],
    ["¿Como se cocinan unos autenticos frijoles?"],
]
lista_modelos = [sft_model1, sft_model2]
iface = gr.Interface(
    fn=mostrar_respuesta,
    inputs=[
        gr.Dropdown(choices=lista_modelos, value=sft_model1, label="Modelo", type="value"),
        gr.Textbox(label="Pregunta"),
        gr.Textbox(label="Contexto", value="You are a helpful AI assistant. Eres un experto cocinero hispanoamericano."),
    ],
    outputs=[gr.Textbox(label="Respuesta", lines=2)],
    title="Recetas de la Abuel@",
    description="Introduce tu pregunta sobre recetas de cocina.",
    examples=mis_ejemplos,
)
iface.queue(max_size=14).launch()  # share=True, debug=True