# demos/app.py
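"""Gradio demo for GGmorello/FLAMES: a LoRA (PEFT) adapter on a LLaMA base
model, loaded in 4-bit NF4 via bitsandbytes and served on a ZeroGPU Space."""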
import gradio as gr
import spaces
import torch
from peft import PeftConfig, PeftModel
from transformers import LlamaForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the adapter config to find the base model it was trained on.
config = PeftConfig.from_pretrained("GGmorello/FLAMES")

# Load the base model in 4-bit (NF4, double-quantized) to fit on a single GPU.
model = LlamaForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)
# Attach the FLAMES LoRA adapter on top of the quantized base model.
model = PeftModel.from_pretrained(model, "GGmorello/FLAMES")

MAX_SEQ_LEN = 4096
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# LLaMA has no pad token, so reuse the <unk> token for padding.
tokenizer.pad_token = tokenizer.unk_token
model.config.pad_token_id = tokenizer.unk_token_id

# ZeroGPU quirk: no GPU is attached outside a @spaces.GPU function,
# so this tensor still reports 'cpu' even after .cuda().
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' πŸ€”

@spaces.GPU
def predict(text):
    # Truncate the prompt to the model's context window.
    input_ids = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LEN
    )["input_ids"].to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=256)
    # Decode only the newly generated tokens, not the echoed prompt.
    filling = tokenizer.batch_decode(
        generated_ids[:, input_ids.shape[1]:], skip_special_tokens=True
    )[0]
    return filling

demo = gr.Interface(fn=predict, inputs=gr.Text(), outputs=gr.Text())
demo.launch()
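
# A minimal sketch of calling this demo programmatically with gradio_client,
# assuming the Space id matches the model repo (a guess, not confirmed here)
# and using a hypothetical prompt; gr.Interface exposes its endpoint as
# "/predict" by default:
#
#   from gradio_client import Client
#
#   client = Client("GGmorello/FLAMES")
#   print(client.predict("def fibonacci(n):", api_name="/predict"))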