# alpaca-lora / app.py
import torch
from peft import PeftModel
import transformers
import gradio as gr
from fastapi import FastAPI
import random
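# FastAPI service that wraps Alpaca-LoRA (the LLaMA 7B base model with the
# tloen/alpaca-lora-7b adapter) behind a single /generate_bio endpoint.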
app = FastAPI()
assert (
"LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
BASE_MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
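# Pick the best available device: prefer CUDA, then Apple MPS, otherwise CPU.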
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    pass
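# Load the base LLaMA weights and apply the Alpaca LoRA adapter on top; the
# loading options differ per device (fp16 on CUDA/MPS, low_cpu_mem_usage on CPU).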
if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
    )
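# Build the Alpaca-style instruction prompt around the user-supplied keywords.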
def generate_prompt(input=None):
    instruction = '''You are a dating bio writer for a single boy, using the keywords provided. The dating bio should be within 30 words and should be catchy. The dating bio should be different in every run.'''
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
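# Post-load tweaks: half precision off the CPU, eval mode, and torch.compile on PyTorch 2.x.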
if device != "cpu":
    model.half()
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)
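# POST endpoint: takes the user's keywords plus optional sampling parameters and
# returns the generated dating bio text.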
@app.post("/generate_bio")
async def evaluate(
    input: str,
    temperature=[0.2, 0.5, 0.7, 0.9, 1.0],
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    seed=None,
    do_sample=True,
    **kwargs,
):
    prompt = generate_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
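    # Pick a temperature at random from the provided list so repeated calls return
    # varied bios; if a seed is given, use it to make a run reproducible.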
    if seed is not None:
        torch.manual_seed(seed)
    generation_config = GenerationConfig(
        temperature=random.choice(temperature),
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        do_sample=do_sample,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
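# Example request (illustrative; assumes the server is running locally on port 7860):
#   curl -X POST "http://localhost:7860/generate_bio?input=hiking,coffee,dogs"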