mamkkl committed (verified)
Commit a5deebe · Parent(s): eda9136

Update app.py

Files changed (1)
  1. app.py +101 -100
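In short, this commit restructures app.py for ZeroGPU: the base model, the LoRA adapter (via PeftModel), the tokenizer, and the Gradio ChatInterface were previously all built inside `if __name__ == "__main__":`; they now run at import time, `import spaces` is added, the `respond` handler is decorated with `@spaces.GPU`, and only `model.eval()` and `demo.launch()` stay under the main guard. The sketch below shows that general pattern in isolation; it is a hedged illustration rather than the Space's actual code, and the placeholder model ID and prompt handling are assumptions.

```python
# Minimal sketch of the ZeroGPU pattern this commit adopts (hedged illustration,
# not the Space's actual code). Model ID and prompt handling are placeholders.
import gradio as gr
import spaces  # Hugging Face ZeroGPU helper used by the commit
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "sshleifer/tiny-gpt2"  # placeholder; app.py loads `base_model` plus a LoRA adapter

# Weights are loaded at import time, outside any `if __name__ == "__main__":` guard.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

@spaces.GPU  # a GPU is attached to the process only while this handler runs
def respond(message, history):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = tokenizer(message, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

demo = gr.ChatInterface(respond)

if __name__ == "__main__":
    model.eval()
    demo.launch()
```

On a ZeroGPU Space, loading at import time lets the interface start without holding a GPU, and `@spaces.GPU` requests one only for the duration of each call.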
app.py CHANGED
@@ -4,6 +4,7 @@ import transformers
 from transformers import AutoTokenizer,GenerationConfig
 import torch
 from peft import PeftModel
+import spaces
 
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
@@ -43,108 +44,108 @@ PROMPT_DICT = {
         "Instruction:\n{instruction}\n\nResponse:"
     ),
 }
 
-if __name__ == "__main__":
-    from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
-    replace_llama_rope_with_scaled_rope()
-    model = transformers.AutoModelForCausalLM.from_pretrained(
-        base_model,
-        torch_dtype=torch.float16,
-        cache_dir=cache_dir,
-        device_map="auto",
-    )
-
-    model = PeftModel.from_pretrained(
-        model,
-        lora_weights,
-        device_map="auto",
-        cache_dir=cache_dir,
-        torch_dtype=torch.float16,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
-    tokenizer.pad_token = tokenizer.unk_token
-    def generate_prompt(instruction, input=None):
-        if input:
-            return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
-        else:
-            return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
-
-    def respond(
-        message,
-        history: list[tuple[str, str]],
-        system_message,
-        max_tokens,
-        temperature,
-        top_p,
-    ):
-        ins_f = generate_prompt(instruction,input)
-        inputs = tokenizer(ins_f, return_tensors="pt")
-        input_ids = inputs["input_ids"].cuda()
-        generation_config = GenerationConfig(
-            temperature=0.1,
-            top_p=0.75,
-            top_k=40,
-            do_sample=True,
-            num_beams=1,
-            max_new_tokens = 512
-        )
-
-        # Without streaming
-        with torch.no_grad():
-            generation_output = model.generate(
-                input_ids=input_ids,
-                generation_config=generation_config,
-                return_dict_in_generate=True,
-                output_scores=False,
-                max_new_tokens=max_new_tokens,
-            )
-        s = generation_output.sequences[0]
-        output = tokenizer.decode(s)
-        response = output.split("Response:")[1].strip()
-        yield response
-
-        #messages = [{"role": "system", "content": system_message}]
-
-        #for val in history:
-        #    if val[0]:
-        #        messages.append({"role": "user", "content": val[0]})
-        #    if val[1]:
-        #        messages.append({"role": "assistant", "content": val[1]})
-
-        #    messages.append({"role": "user", "content": message})
-
-        #response = ""
-
-        #for message in client.chat_completion(
-        #    messages,
-        #    max_tokens=max_tokens,
-        #    stream=True,
-        #    temperature=temperature,
-        #    top_p=top_p,
-        #):
-        #    token = message.choices[0].delta.content
-
-        #    response += token
-        #    yield response
-
-
-    """
-    For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-    """
-    demo = gr.ChatInterface(
-        respond,
-        additional_inputs=[
-            gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-            gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.95,
-                step=0.05,
-                label="Top-p (nucleus sampling)",
-            ),
-        ],
-    )
+from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope
+replace_llama_rope_with_scaled_rope()
+model = transformers.AutoModelForCausalLM.from_pretrained(
+    base_model,
+    torch_dtype=torch.float16,
+    cache_dir=cache_dir,
+    device_map="auto",
+)
+
+model = PeftModel.from_pretrained(
+    model,
+    lora_weights,
+    device_map="auto",
+    cache_dir=cache_dir,
+    torch_dtype=torch.float16,
+)
+tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
+tokenizer.pad_token = tokenizer.unk_token
+def generate_prompt(instruction, input=None):
+    if input:
+        return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
+    else:
+        return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
+
+@spaces.GPU
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    ins_f = generate_prompt(instruction,input)
+    inputs = tokenizer(ins_f, return_tensors="pt")
+    input_ids = inputs["input_ids"].cuda()
+    generation_config = GenerationConfig(
+        temperature=0.1,
+        top_p=0.75,
+        top_k=40,
+        do_sample=True,
+        num_beams=1,
+        max_new_tokens = 512
+    )
+
+    # Without streaming
+    with torch.no_grad():
+        generation_output = model.generate(
+            input_ids=input_ids,
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+            output_scores=False,
+            max_new_tokens=max_new_tokens,
+        )
+    s = generation_output.sequences[0]
+    output = tokenizer.decode(s)
+    response = output.split("Response:")[1].strip()
+    yield response
+
+    #messages = [{"role": "system", "content": system_message}]
+
+    #for val in history:
+    #    if val[0]:
+    #        messages.append({"role": "user", "content": val[0]})
+    #    if val[1]:
+    #        messages.append({"role": "assistant", "content": val[1]})
+
+    #    messages.append({"role": "user", "content": message})
+
+    #response = ""
+
+    #for message in client.chat_completion(
+    #    messages,
+    #    max_tokens=max_tokens,
+    #    stream=True,
+    #    temperature=temperature,
+    #    top_p=top_p,
+    #):
+    #    token = message.choices[0].delta.content
+
+    #    response += token
+    #    yield response
+
+
+"""
+For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+"""
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+)
+if __name__ == "__main__":
     model.eval()
     demo.launch()
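One thing worth flagging in both the old and new versions of `respond`: the body calls `generate_prompt(instruction, input)` and passes `max_new_tokens=max_new_tokens` to `model.generate`, but `instruction` and `max_new_tokens` are not among the handler's parameters (`message`, `history`, `system_message`, `max_tokens`, `temperature`, `top_p`) and do not appear elsewhere in the diff, so the handler would be expected to raise a `NameError` unless those names are defined outside the shown hunks. The `temperature` and `top_p` sliders are likewise not wired through (the code hardcodes 0.1 and 0.75). Below is a hedged sketch of one way the body could use the ChatInterface inputs; treating `message` as the instruction (with no separate input) and `max_tokens` as the new-token budget are assumptions, not the author's stated intent, and the sketch reuses app.py's module-level `model`, `tokenizer`, `generate_prompt`, `spaces`, `torch`, and `GenerationConfig`.

```python
# Hedged sketch only: assumes app.py's module-level `model`, `tokenizer`,
# `generate_prompt`, `spaces`, `torch`, and `GenerationConfig` are in scope.
@spaces.GPU
def respond(message, history, system_message, max_tokens, temperature, top_p):
    ins_f = generate_prompt(message)          # assumption: the chat message is the instruction
    input_ids = tokenizer(ins_f, return_tensors="pt")["input_ids"].cuda()
    generation_config = GenerationConfig(
        temperature=temperature,              # wire the UI sliders through
        top_p=top_p,
        top_k=40,
        do_sample=True,
        num_beams=1,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=False,
            max_new_tokens=max_tokens,        # assumption: the "Max new tokens" slider
        )
    output = tokenizer.decode(generation_output.sequences[0])
    yield output.split("Response:")[1].strip()
```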