zhaozitian committed on
Commit
1178b35
•
1 Parent(s): 7f70f4d

Create app.py

Files changed (1)
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
import torch
from peft import PeftModel
import transformers
import gradio as gr

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

BASE_MODEL = "daryl149/llama-2-13b-chat-hf"
LORA_WEIGHTS = "Sparticle/llama-2-13b-chat-japanese-lora"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# Prefer CUDA, fall back to CPU; upgrade to MPS on Apple Silicon if available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except AttributeError:  # older torch builds have no mps backend
    pass

if device == "cuda":
    # fp16 weights on GPU; device_map="auto" shards across available devices.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    # CPU fallback: default fp32 weights, loaded with reduced peak memory.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
    )


def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""
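
# Illustrative only (not in the original commit): with no auxiliary input,
# generate_prompt("Tell me about alpacas.") renders to:
#
#   Below is an instruction that describes a task. Write a response that appropriately completes the request.
#   ### Instruction:
#   Tell me about alpacas.
#   ### Response: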


if device != "cpu":
    model.half()  # fp16 halves memory on GPU/MPS; skipped on CPU
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)  # PyTorch 2.x: compile for faster inference

def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    # Keep only the text after the response marker.
    return output.split("### Response:")[1].strip()
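
# Hypothetical smoke test, bypassing the UI (assumes the weights above
# loaded successfully; not part of the original commit):
#   print(evaluate("Tell me about alpacas.", num_beams=1, max_new_tokens=64))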


g = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction", placeholder="Tell me about alpacas."
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=512, step=1, value=128, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(  # was gr.inputs.Textbox: a deprecated input class misused as an output
            lines=5,
            label="Output",
        )
    ],
    title="🦙🌲 Llama-2-13b-chat Japanese LoRA",
    description="This demo serves [daryl149/llama-2-13b-chat-hf](https://huggingface.co/daryl149/llama-2-13b-chat-hf) with the [Sparticle/llama-2-13b-chat-japanese-lora](https://huggingface.co/Sparticle/llama-2-13b-chat-japanese-lora) adapter, a Japanese instruction-following LoRA for Llama 2. The interface is adapted from [Alpaca-LoRA](https://github.com/tloen/alpaca-lora).",
)
g.queue(concurrency_count=1)
g.launch()
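
A minimal environment sketch for this Space, inferred from the imports above; the pins are assumptions, not part of the commit. `queue(concurrency_count=1)` requires Gradio 3.x (the argument was removed in 4.x), `device_map` loading needs accelerate, and LlamaTokenizer needs sentencepiece:

    torch
    transformers @ git+https://github.com/huggingface/transformers.git
    peft
    gradio<4
    accelerate
    sentencepiece
    bitsandbytes  # only needed if load_in_8bit is flipped to True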