Spaces: Runtime error

import random

import torch
import transformers
from fastapi import FastAPI
from peft import PeftModel

app = FastAPI()

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"

from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

BASE_MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
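
# Prefer CUDA, fall back to Apple MPS, and finally CPU.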
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    pass
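
# Load the base LLaMA weights and attach the Alpaca-LoRA adapter for the chosen device.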
if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
    )
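

# Build an Alpaca-style prompt around the fixed dating-bio instruction.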
def generate_prompt(input=None):
    instruction = "You are a dating bio writer for a single guy, using the keywords provided. The dating bio should be within 30 words, catchy, and different on every run."
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
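
# Run in half precision off the CPU, and compile the model on PyTorch 2.x.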
if device != "cpu":
    model.half()
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)


async def evaluate(
    input: str,
    temperature=[0.2, 0.5, 0.7, 0.9, 1.0],
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    seed=None,
    do_sample=True,
    **kwargs,
):
    # `seed` is not a valid `generate()` argument; seed torch's RNG instead if given.
    if seed is not None:
        torch.manual_seed(seed)
    prompt = generate_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        # Pick a temperature at random so repeated calls produce different bios.
        temperature=random.choice(temperature),
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()
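

# The original app never registers `evaluate` (or any other route) with FastAPI, so the
# server below would expose nothing. A minimal sketch of one way to wire it up; the
# route path "/generate" and the `keywords` parameter are assumptions, not part of the
# original app.
@app.post("/generate")
async def generate_bio(keywords: str):
    # Delegates to `evaluate` with its default sampling settings, e.g.:
    #   curl -X POST "http://localhost:7860/generate?keywords=hiking,coffee,dogs"
    return {"bio": await evaluate(keywords)}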


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)