wambugu71 committed on
Commit 750ea06 · verified · 1 Parent(s): 449af10

Create app.py

Files changed (1)
  1. app.py +226 -0
app.py ADDED
@@ -0,0 +1,226 @@
import gradio as gr
import spaces
import torch
from torch.cuda.amp import autocast
import subprocess
from huggingface_hub import InferenceClient
import os

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from accelerate import Accelerator

# Install runtime-only dependencies before importing them.
subprocess.run(
    "pip install psutil",
    shell=True,
)
import psutil

import bitsandbytes as bnb  # bitsandbytes for optional 8-bit quantization (load_in_8bit is commented out below)

from datetime import datetime

# Install flash-attn without building its CUDA extension inside the Space container;
# keep the parent environment so pip stays on PATH.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# pip install 'git+https://github.com/huggingface/transformers.git'

# Read the Hugging Face access token from the Space secret named 'token';
# report only whether it is set, never the secret itself.
token = os.getenv('token')
print('token set =', token is not None)

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers

# model_id = "mistralai/Mistral-7B-v0.3"
model_id = "microsoft/Phi-3-medium-4k-instruct"
# model_id = "microsoft/phi-4"
# model_id = "Qwen/Qwen2-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    # use_fast=False
    token=token,
    trust_remote_code=True,
)

accelerator = Accelerator()

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    # torch_dtype=torch.uint8,
    torch_dtype=torch.bfloat16,
    # load_in_8bit=True,
    attn_implementation="flash_attention_2",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map='cuda',
    # device_map=accelerator.device_map,
)

# Let accelerate wrap the already-loaded model.
model = accelerator.prepare(model)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Alternative: build the pipeline straight from the Hub instead of from the
# locally loaded model.
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="microsoft/phi-4",
#     model_kwargs={"torch_dtype": "auto"},
#     device_map="auto",
# )

# device_map = infer_auto_device_map(model, max_memory={0: "79GB", "cpu": "65GB"})
# Load the model with the inferred device map
# model = load_checkpoint_and_dispatch(model, model_id, device_map=device_map, no_split_module_classes=["GPTJBlock"])
# model.half()

import json


def str_to_json(str_obj):
    """Parse a JSON string into a Python object."""
    json_obj = json.loads(str_obj)
    return json_obj


@spaces.GPU(duration=170)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # model.to(accelerator.device)

    # The chat box is expected to contain a JSON-encoded list of
    # {"role": ..., "content": ...} messages; parse it back into a list.
    json_obj = str_to_json(message)
    print(json_obj)
    messages = json_obj

    # input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(accelerator.device)
    # input_ids2 = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
    # print(f"Converted input_ids dtype: {input_ids.dtype}")
    # input_str = str(input_ids2)
    # print('input str = ', input_str)

    # Note: with do_sample=False decoding is greedy, so temperature has no effect here.
    generation_args = {
        "max_new_tokens": max_tokens,
        "return_full_text": False,
        "temperature": temperature,
        "do_sample": False,
    }

    output = pipe(messages, **generation_args)
    gen_text = output[0]['generated_text']
    print(gen_text)

    # with torch.no_grad():
    #     gen_tokens = model.generate(
    #         input_ids,
    #         max_new_tokens=max_tokens,
    #         # do_sample=True,
    #         temperature=temperature,
    #     )
    # gen_text = tokenizer.decode(gen_tokens[0])
    # print(gen_text)
    # gen_text = gen_text.replace(input_str, '')
    # gen_text = gen_text.replace('<|im_end|>', '')

    yield gen_text

# messages = [
#     {"role": "user", "content": "What is your favourite condiment?"},
#     {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
#     {"role": "user", "content": "Do you have mayonnaise recipes?"}
# ]

# inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
# outputs = model.generate(inputs, max_new_tokens=2000)
# gen_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(gen_text)
# yield gen_text

# for val in history:
#     if val[0]:
#         messages.append({"role": "user", "content": val[0]})
#     if val[1]:
#         messages.append({"role": "assistant", "content": val[1]})
# messages.append({"role": "user", "content": message})

# response = ""
# for message in client.chat_completion(
#     messages,
#     max_tokens=max_tokens,
#     stream=True,
#     temperature=temperature,
#     top_p=top_p,
# ):
#     token = message.choices[0].delta.content
#     response += token
#     yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()
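
Note: `respond` deserializes the incoming chat message with `str_to_json`, so the chat box is expected to receive a JSON-encoded list of chat messages rather than plain text. A minimal sketch of building such a payload follows; the conversation content is illustrative, and only the list-of-role/content structure is implied by the code above.

import json

# Conversation in the chat-template format the text-generation pipeline accepts.
messages = [
    {"role": "system", "content": "You are a friendly Chatbot."},
    {"role": "user", "content": "Summarize this repository in one sentence."},
]

# This JSON string goes into the Gradio chat box; respond() parses it back
# into a list of dicts and forwards it to pipe().
payload = json.dumps(messages)
print(payload)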