Spaces: Runtime error

import random

import torch
import transformers
from fastapi import FastAPI
from peft import PeftModel

app = FastAPI()

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"

from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

BASE_MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
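
# Prefer CUDA, fall back to Apple MPS, and finally CPU.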
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    pass
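
# Load the base LLaMA weights and attach the Alpaca-LoRA adapter for the chosen device.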
if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
    )
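

# Build an Alpaca-style prompt around the fixed dating-bio instruction.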
def generate_prompt(input=None):
    instruction = "You are a dating bio writer for a single guy, using the keywords provided. The dating bio should be within 30 words, catchy, and different on every run."
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
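
# Run in half precision off the CPU, and compile the model on PyTorch 2.x.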
if device != "cpu":
    model.half()
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)


async def evaluate(
    input: str,
    temperature=[0.2, 0.5, 0.7, 0.9, 1.0],
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    seed=None,
    do_sample=True,
    **kwargs,
):
    # `seed` is not a valid `generate()` argument; seed torch's RNG instead if given.
    if seed is not None:
        torch.manual_seed(seed)
    prompt = generate_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        # Pick a temperature at random so repeated calls produce different bios.
        temperature=random.choice(temperature),
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()
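

# The original app never registers `evaluate` (or any other route) with FastAPI, so the
# server below would expose nothing. A minimal sketch of one way to wire it up; the
# route path "/generate" and the `keywords` parameter are assumptions, not part of the
# original app.
@app.post("/generate")
async def generate_bio(keywords: str):
    # Delegates to `evaluate` with its default sampling settings, e.g.:
    #   curl -X POST "http://localhost:7860/generate?keywords=hiking,coffee,dogs"
    return {"bio": await evaluate(keywords)}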


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)