File size: 1,984 Bytes
849a8db
e6ae614
 
 
849a8db
e6ae614
 
849a8db
e6ae614
d625244
e6ae614
 
 
 
 
 
 
d625244
e6ae614
 
 
d625244
e6ae614
 
d625244
e6ae614
 
faa48c9
e6ae614
 
 
c456ddf
d625244
e6ae614
 
 
 
 
 
 
 
b282f54
d625244
e6ae614
 
 
d625244
e6ae614
d625244
 
 
 
 
 
 
 
 
e6ae614
 
 
d625244
e6ae614
 
d625244
e6ae614
d625244
40e75ca
 
 
1f2856e
40e75ca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import random
from typing import Optional
from fastapi import FastAPI
from pydantic import BaseModel

from peft import PeftModel
from transformers import LLaMATokenizer, LLaMAForCausalLM, GenerationConfig

# FastAPI application exposing the /evaluate text-generation endpoint.
app = FastAPI()

# Load the base LLaMA-7B tokenizer and model in 8-bit precision, letting
# accelerate place layers automatically (device_map="auto"), then apply the
# Alpaca LoRA adapter on top of the base weights.
# NOTE(review): the LLaMATokenizer/LLaMAForCausalLM spellings exist only in
# early transformers releases (later renamed LlamaTokenizer /
# LlamaForCausalLM) — confirm the pinned transformers version supports them.
tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b")

class InputPrompt(BaseModel):
    """Request body for POST /evaluate: an instruction plus optional context."""

    instruction: str  # the task the model is asked to perform
    input: Optional[str] = None  # extra context; the "### Input:" section is omitted when falsy

class OutputResponse(BaseModel):
    """Response body for POST /evaluate: the generated text."""

    response: str  # decoded model output taken after the "### Response:" marker

@app.post("/evaluate")
def evaluate(input_prompt: InputPrompt):
    """Generate a model completion for the given instruction/input pair.

    Builds an Alpaca-style prompt, samples up to 256 new tokens, and returns
    an OutputResponse with the text following the "### Response:" marker.
    """
    generation_config = GenerationConfig(
        temperature=0.9,
        top_p=0.75,
        num_beams=1,
        do_sample=True,
    )
    prompt = generate_prompt(input_prompt.instruction, input_prompt.input)
    inputs = tokenizer(prompt, return_tensors="pt")
    # Move inputs to wherever accelerate placed the model (device_map="auto")
    # instead of hard-coding .cuda(), so the endpoint also works off-GPU.
    input_ids = inputs["input_ids"].to(model.device)
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    # Exactly one sequence is produced (num_beams=1, default
    # num_return_sequences); decode it directly rather than looping over
    # `sequences` and returning on the first iteration.
    output = tokenizer.decode(generation_output.sequences[0])
    # str.partition never raises: if the marker is somehow absent from the
    # decoded text, fall back to the full output instead of the IndexError
    # (HTTP 500) that split("### Response:")[1] produced.
    _, marker, tail = output.partition("### Response:")
    return OutputResponse(response=tail.strip() if marker else output.strip())

def generate_prompt(instruction, input=None):
    """Build the Alpaca-style prompt for *instruction*, optionally with context.

    When *input* is truthy the prompt carries an "### Input:" section and the
    longer preamble; otherwise the shorter no-context template is used. The
    returned string always ends with "### Response:" so that the model's
    continuation is the answer itself.
    """
    if input:
        preamble = (
            "Below is an instruction that describes a task, paired with an "
            "input that provides further context. Write a response that "
            "appropriately completes the request."
        )
        sections = [
            preamble,
            f"### Instruction:\n{instruction}",
            f"### Input:\n{input}",
            "### Response:",
        ]
    else:
        preamble = (
            "Below is an instruction that describes a task. Write a response "
            "that appropriately completes the request."
        )
        sections = [
            preamble,
            f"### Instruction:\n{instruction}",
            "### Response:",
        ]
    # Sections are separated by a blank line, exactly as in the template.
    return "\n\n".join(sections)


if __name__ == "__main__":
    # Run a development server directly, listening on all interfaces, port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)