gabrielclark3330 committed
Commit 7b8b167 · 1 Parent(s): c941cf9

Trying to work out concurrency

Files changed (1)
  1. app.py +60 -21
app.py CHANGED
@@ -1,4 +1,3 @@
-'''
 import os
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -6,6 +5,7 @@ import torch
 
 model_name_2_7B_instruct = "Zyphra/Zamba2-2.7B-instruct"
 model_name_7B_instruct = "Zyphra/Zamba2-7B-instruct"
+max_context_length = 4096
 
 tokenizer_2_7B_instruct = AutoTokenizer.from_pretrained(model_name_2_7B_instruct)
 model_2_7B_instruct = AutoModelForCausalLM.from_pretrained(
@@ -41,7 +41,16 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
             sample.append({'role': 'assistant', 'content': turn[1]})
     chat_sample = tokenizer_2_7B_instruct.apply_chat_template(sample, tokenize=False)
     input_ids = tokenizer_2_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_2_7B_instruct.device)
-    outputs = model_2_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
+
+    max_new_tokens = int(max_new_tokens)
+    max_input_length = max_context_length - max_new_tokens
+    if input_ids['input_ids'].size(1) > max_input_length:
+        input_ids['input_ids'] = input_ids['input_ids'][:, -max_input_length:]
+        if 'attention_mask' in input_ids:
+            input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
+
+    with torch.no_grad():
+        outputs = model_2_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
     """
     outputs = model_2_7B_instruct.generate(
         input_ids=input_ids,
@@ -59,6 +68,11 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
     """
     generated_text = tokenizer_2_7B_instruct.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
+
+    del input_ids
+    del outputs
+    torch.cuda.empty_cache()
+
     return assistant_response
 
 def generate_response_7B_instruct(chat_history, max_new_tokens):
@@ -70,7 +84,16 @@ def generate_response_7B_instruct(chat_history, max_new_tokens):
             sample.append({'role': 'assistant', 'content': turn[1]})
     chat_sample = tokenizer_7B_instruct.apply_chat_template(sample, tokenize=False)
     input_ids = tokenizer_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_7B_instruct.device)
-    outputs = model_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
+
+    max_new_tokens = int(max_new_tokens)
+    max_input_length = max_context_length - max_new_tokens
+    if input_ids['input_ids'].size(1) > max_input_length:
+        input_ids['input_ids'] = input_ids['input_ids'][:, -max_input_length:]
+        if 'attention_mask' in input_ids:
+            input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
+
+    with torch.no_grad():
+        outputs = model_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
     """
     outputs = model_7B_instruct.generate(
         input_ids=input_ids,
@@ -88,6 +111,11 @@ def generate_response_7B_instruct(chat_history, max_new_tokens):
     """
     generated_text = tokenizer_7B_instruct.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
+
+    del input_ids
+    del outputs
+    torch.cuda.empty_cache()
+
     return assistant_response
 
 with gr.Blocks() as demo:
@@ -222,18 +250,23 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
     if input_ids.size(1) > max_input_length:
         input_ids = input_ids[:, -max_input_length:]  # Truncate from the left (oldest tokens)
 
-    outputs = model_2_7B_instruct.generate(
-        input_ids=input_ids,
-        max_new_tokens=max_new_tokens,
-        return_dict_in_generate=False,
-        output_scores=False,
-        use_cache=True,
-        num_beams=1,
-        do_sample=False
-    )
+    with torch.no_grad():
+        outputs = model_2_7B_instruct.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            return_dict_in_generate=False,
+            output_scores=False,
+            use_cache=True,
+            num_beams=1,
+            do_sample=False
+        )
 
     generated_text = tokenizer_2_7B_instruct.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
+
+    del input_ids
+    del outputs
+    torch.cuda.empty_cache()
     return assistant_response
 
 def generate_response_7B_instruct(chat_history, max_new_tokens):
@@ -253,18 +286,23 @@ def generate_response_7B_instruct(chat_history, max_new_tokens):
     if input_ids.size(1) > max_input_length:
         input_ids = input_ids[:, -max_input_length:]  # Truncate from the left (oldest tokens)
 
-    outputs = model_7B_instruct.generate(
-        input_ids=input_ids,
-        max_new_tokens=max_new_tokens,
-        return_dict_in_generate=False,
-        output_scores=False,
-        use_cache=True,
-        num_beams=1,
-        do_sample=False
-    )
+    with torch.no_grad():
+        outputs = model_7B_instruct.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            return_dict_in_generate=False,
+            output_scores=False,
+            use_cache=True,
+            num_beams=1,
+            do_sample=False
+        )
 
     generated_text = tokenizer_7B_instruct.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
+
+    del input_ids
+    del outputs
+    torch.cuda.empty_cache()
     return assistant_response
 
 with gr.Blocks() as demo:
@@ -335,3 +373,4 @@ with gr.Blocks() as demo:
 
 if __name__ == "__main__":
     demo.queue().launch()
+'''
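
For readers skimming the diff: the new lines in both generate functions budget the prompt so that prompt length plus max_new_tokens stays within the 4096-token context, truncating from the left so the oldest chat turns are dropped first. A minimal sketch of that logic as a standalone helper (the helper name and keyword defaults are illustrative, not part of the repository):

def truncate_to_context(encoded, max_new_tokens, max_context_length=4096):
    # Leave room for the completion: prompt tokens + new tokens must fit the context window.
    max_input_length = max_context_length - int(max_new_tokens)
    if encoded['input_ids'].size(1) > max_input_length:
        # Drop the oldest tokens (left side) and keep the attention mask aligned.
        encoded['input_ids'] = encoded['input_ids'][:, -max_input_length:]
        if 'attention_mask' in encoded:
            encoded['attention_mask'] = encoded['attention_mask'][:, -max_input_length:]
    return encoded

# Usage, mirroring the diff:
# encoded = tokenizer(chat_sample, return_tensors='pt', add_special_tokens=False).to(model.device)
# encoded = truncate_to_context(encoded, max_new_tokens)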
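The commit message is about concurrency, but the diff itself only wraps generation in torch.no_grad() and frees tensors afterwards; two requests can still reach the same model at once through Gradio's queue. One common way to rule that out is to serialize generation with a lock. A hedged sketch of that idea (the lock and the generate_locked wrapper are not in this repository):

import threading
import torch

# Hypothetical guard: only one generation runs at a time, so concurrent Gradio
# requests cannot interleave CUDA work on the shared model weights.
generation_lock = threading.Lock()

def generate_locked(model, tokenizer, encoded, max_new_tokens):
    with generation_lock:
        with torch.no_grad():
            outputs = model.generate(
                **encoded,
                max_new_tokens=int(max_new_tokens),
                use_cache=True,
                num_beams=1,
                do_sample=False,
            )
        text = tokenizer.decode(outputs[0])
    # Mirror the diff's cleanup: drop per-request tensors and return cached GPU memory.
    del encoded, outputs
    torch.cuda.empty_cache()
    return text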
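Gradio's request queue can also cap concurrency at the framework level instead of, or in addition to, a per-model lock. A sketch under the assumption of a Gradio 4.x install, where queue() accepts default_concurrency_limit (older 3.x releases used concurrency_count, so check the installed version's signature):

if __name__ == "__main__":
    # Assumption: Gradio 4.x queue API; the keyword differs in older releases.
    demo.queue(default_concurrency_limit=1, max_size=16).launch()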