sagu7 committed
Commit e6ae614 · 1 Parent(s): edd70bb

Update app.py

Files changed (1):
  1. app.py +44 -100
app.py CHANGED
@@ -1,69 +1,51 @@
- import torch
- from peft import PeftModel
- import transformers
- import gradio as gr
- from fastapi import FastAPI
  import random
+ from typing import Optional
+ from fastapi import FastAPI
+ from pydantic import BaseModel

+ from peft import PeftModel
+ from transformers import LLaMATokenizer, LLaMAForCausalLM, GenerationConfig

- app= FastAPI()
-
- assert (
-     "LlamaTokenizer" in transformers._import_structure["models.llama"]
- ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
- from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
-
- tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
+ app = FastAPI()

- BASE_MODEL = "decapoda-research/llama-7b-hf"
- LORA_WEIGHTS = "tloen/alpaca-lora-7b"
+ tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
+ model = LLaMAForCausalLM.from_pretrained(
+     "decapoda-research/llama-7b-hf",
+     load_in_8bit=True,
+     device_map="auto",
+ )
+ model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b")

- if torch.cuda.is_available():
-     device = "cuda"
- else:
-     device = "cpu"
+ class InputPrompt(BaseModel):
+     instruction: str
+     input: Optional[str] = None

- try:
-     if torch.backends.mps.is_available():
-         device = "mps"
- except:
-     pass
+ class OutputResponse(BaseModel):
+     response: str

- if device == "cuda":
-     model = LlamaForCausalLM.from_pretrained(
-         BASE_MODEL,
-         load_in_8bit=False,
-         torch_dtype=torch.float16,
-         device_map="auto",
-     )
-     model = PeftModel.from_pretrained(
-         model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
-     )
- elif device == "mps":
-     model = LlamaForCausalLM.from_pretrained(
-         BASE_MODEL,
-         device_map={"": device},
-         torch_dtype=torch.float16,
-     )
-     model = PeftModel.from_pretrained(
-         model,
-         LORA_WEIGHTS,
-         device_map={"": device},
-         torch_dtype=torch.float16,
-     )
- else:
-     model = LlamaForCausalLM.from_pretrained(
-         BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
+ @app.post("/evaluate")
+ def evaluate(input_prompt: InputPrompt):
+     temperature = random.uniform(0.1, 1.0)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=0.75,
+         num_beams=4,
      )
-     model = PeftModel.from_pretrained(
-         model,
-         LORA_WEIGHTS,
-         device_map={"": device},
+     prompt = generate_prompt(input_prompt.instruction, input_prompt.input)
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].cuda()
+     generation_output = model.generate(
+         input_ids=input_ids,
+         generation_config=generation_config,
+         return_dict_in_generate=True,
+         output_scores=True,
+         max_new_tokens=256
      )
+     for s in generation_output.sequences:
+         output = tokenizer.decode(s)
+     return OutputResponse(response=output.split("### Response:")[1].strip())

-
- def generate_prompt(input:str):
-     instruction= '''You are a dating bio writer for single boy with the keywords provided. the dating bio should be within 30 words and should be catchy. the dating bio should be different in every run.'''
+ def generate_prompt(instruction, input=None):
      if input:
          return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

@@ -73,50 +55,12 @@ def generate_prompt(input:str):
  ### Input:
  {input}

- # ### Response:"""
+ ### Response:"""
+     else:
+         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

- if device != "cpu":
-     model.half()
- model.eval()
- if torch.__version__ >= "2":
-     model = torch.compile(model)
+ ### Instruction:
+ {instruction}

- @app.post("/generate_bio")
- async def evaluate(
-     input:str,
-     top_p=0.75,
-     top_k=40,
-     num_beams=4,
-     max_new_tokens=128,
-     seed=None,
-     do_sample=True,
-     # **kwargs,
- ):
-     prompt = generate_prompt(input)
-     inputs = tokenizer(prompt, return_tensors="pt")
-     input_ids = inputs["input_ids"].to(device)
-     temperature= [0.2, 0.5, 0.7, 0.9, 1.0]
-     generation_config = GenerationConfig(
-         temperature=random.choice(temperature),
-         top_p=top_p,
-         top_k=top_k,
-         num_beams=num_beams,
-         **kwargs,
-     )
-     with torch.no_grad():
-         generation_output = model.generate(
-             input_ids=input_ids,
-             generation_config=generation_config,
-             return_dict_in_generate=True,
-             output_scores=True,
-             max_new_tokens=max_new_tokens,
-             seed=None,
-             do_sample= do_sample
-         )
-     s = generation_output.sequences[0]
-     output = tokenizer.decode(s)
-     return output.split("### Response:")[1].strip()
+ ### Response:"""

- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
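Note that the commit drops the old `__main__` block, so the updated app would be started externally, for example with `uvicorn app:app --host 0.0.0.0 --port 7860` (the host and port come from the removed startup code and are an assumption for the new setup). Below is a minimal client sketch for the new /evaluate endpoint, for illustration only: it assumes a local server on that port and the `requests` package, and the instruction/input values are made up. The field names mirror the InputPrompt and OutputResponse models added in this commit.

# Hypothetical client for the /evaluate endpoint introduced in this commit.
# Assumes the FastAPI app is already running at http://localhost:7860.
import requests

payload = {
    "instruction": "Write a catchy dating bio in under 30 words.",  # example value
    "input": "hiking, coffee, golden retrievers",  # optional InputPrompt field
}

resp = requests.post("http://localhost:7860/evaluate", json=payload)
resp.raise_for_status()

# The endpoint returns an OutputResponse body, e.g. {"response": "..."}
print(resp.json()["response"])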