mamkkl committed
Commit eda9136 · verified · 1 Parent(s): 3548597

Update app.py

Files changed (1):
  1. app.py +100 -101
app.py CHANGED
@@ -30,24 +30,7 @@ lora_weights = "./"
 #)

 cache_dir = "/data"
-from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
-replace_llama_rope_with_scaled_rope()
-model = transformers.AutoModelForCausalLM.from_pretrained(
-    base_model,
-    torch_dtype=torch.float16,
-    cache_dir=cache_dir,
-    device_map="auto",
-)
-
-model = PeftModel.from_pretrained(
-    model,
-    lora_weights,
-    device_map="auto",
-    cache_dir=cache_dir,
-    torch_dtype=torch.float16,
-)
-tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
-tokenizer.pad_token = tokenizer.unk_token
+
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with further context. "
@@ -61,91 +44,107 @@ PROMPT_DICT = {
     ),
 }

-def generate_prompt(instruction, input=None):
-    if input:
-        return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
-    else:
-        return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    ins_f = generate_prompt(instruction,input)
-    inputs = tokenizer(ins_f, return_tensors="pt")
-    input_ids = inputs["input_ids"].cuda()
-    generation_config = GenerationConfig(
-        temperature=0.1,
-        top_p=0.75,
-        top_k=40,
-        do_sample=True,
-        num_beams=1,
-        max_new_tokens = 512
-    )
-
-    # Without streaming
-    with torch.no_grad():
-        generation_output = model.generate(
-            input_ids=input_ids,
-            generation_config=generation_config,
-            return_dict_in_generate=True,
-            output_scores=False,
-            max_new_tokens=max_new_tokens,
-        )
-    s = generation_output.sequences[0]
-    output = tokenizer.decode(s)
-    response = output.split("Response:")[1].strip()
-    yield response
-
-    #messages = [{"role": "system", "content": system_message}]
-
-    #for val in history:
-    #    if val[0]:
-    #        messages.append({"role": "user", "content": val[0]})
-    #    if val[1]:
-    #        messages.append({"role": "assistant", "content": val[1]})
-
-    #    messages.append({"role": "user", "content": message})
-
-    #response = ""
-
-    #for message in client.chat_completion(
-    #    messages,
-    #    max_tokens=max_tokens,
-    #    stream=True,
-    #    temperature=temperature,
-    #    top_p=top_p,
-    #):
-    #    token = message.choices[0].delta.content
-
-    #    response += token
-    #    yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
+if __name__ == "__main__":
+    from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
+    replace_llama_rope_with_scaled_rope()
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype=torch.float16,
+        cache_dir=cache_dir,
+        device_map="auto",
+    )
+
+    model = PeftModel.from_pretrained(
+        model,
+        lora_weights,
+        device_map="auto",
+        cache_dir=cache_dir,
+        torch_dtype=torch.float16,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
+    tokenizer.pad_token = tokenizer.unk_token
+    def generate_prompt(instruction, input=None):
+        if input:
+            return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
+        else:
+            return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
+
+    def respond(
+        message,
+        history: list[tuple[str, str]],
+        system_message,
+        max_tokens,
+        temperature,
+        top_p,
+    ):
+        ins_f = generate_prompt(instruction,input)
+        inputs = tokenizer(ins_f, return_tensors="pt")
+        input_ids = inputs["input_ids"].cuda()
+        generation_config = GenerationConfig(
+            temperature=0.1,
+            top_p=0.75,
+            top_k=40,
+            do_sample=True,
+            num_beams=1,
+            max_new_tokens = 512
+        )
+
+        # Without streaming
+        with torch.no_grad():
+            generation_output = model.generate(
+                input_ids=input_ids,
+                generation_config=generation_config,
+                return_dict_in_generate=True,
+                output_scores=False,
+                max_new_tokens=max_new_tokens,
+            )
+        s = generation_output.sequences[0]
+        output = tokenizer.decode(s)
+        response = output.split("Response:")[1].strip()
+        yield response
+
+        #messages = [{"role": "system", "content": system_message}]
+
+        #for val in history:
+        #    if val[0]:
+        #        messages.append({"role": "user", "content": val[0]})
+        #    if val[1]:
+        #        messages.append({"role": "assistant", "content": val[1]})
+
+        #    messages.append({"role": "user", "content": message})
+
+        #response = ""
+
+        #for message in client.chat_completion(
+        #    messages,
+        #    max_tokens=max_tokens,
+        #    stream=True,
+        #    temperature=temperature,
+        #    top_p=top_p,
+        #):
+        #    token = message.choices[0].delta.content
+
+        #    response += token
+        #    yield response
+
+
+    """
+    For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+    """
+    demo = gr.ChatInterface(
+        respond,
+        additional_inputs=[
+            gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+            gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.95,
+                step=0.05,
+                label="Top-p (nucleus sampling)",
+            ),
+        ],
+    )
     model.eval()
     demo.launch()
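
Note on the respond() handler as committed: it never uses its message, max_tokens, temperature, or top_p arguments, it calls generate_prompt(instruction, input) with names that are not defined inside the function, and it passes max_new_tokens to model.generate() even though that name only exists as a keyword of GenerationConfig, so the first chat turn would raise a NameError. Below is a minimal sketch (not part of this commit) of how the handler could be wired up, assuming the model, tokenizer, and generate_prompt defined in app.py above and treating the incoming chat message as the instruction with no separate context input:

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Sketch only: uses the chat message as the instruction and the Gradio
    # slider values for sampling; assumes model/tokenizer/generate_prompt above.
    ins_f = generate_prompt(message)
    input_ids = tokenizer(ins_f, return_tensors="pt")["input_ids"].to(model.device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=40,
        do_sample=True,
        num_beams=1,
    )
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            max_new_tokens=max_tokens,  # slider value, not the undefined name
        )
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    yield text.split("Response:")[-1].strip()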
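
The commented-out block kept inside respond() comes from the Gradio ChatInterface template and streams tokens from an InferenceClient, which does not apply to a locally loaded PEFT model. If streaming output is wanted for this app, one possible approach (again a sketch under the same assumptions, not part of this commit) is transformers' TextIteratorStreamer: run model.generate in a background thread and yield the growing text as chunks arrive. The name respond_streaming is hypothetical.

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Hypothetical streaming variant; reuses model, tokenizer, generate_prompt from app.py.
    prompt = generate_prompt(message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()
    partial = ""
    for new_text in streamer:  # yields decoded text chunks as they are generated
        partial += new_text
        yield partial  # gr.ChatInterface re-renders the partial reply on each yield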