VictorSanh committed on
Commit
dfc9234
·
1 Parent(s): 7073167

updated version

Browse files
Files changed (1) hide show
  1. playground.py +83 -101
playground.py CHANGED
@@ -1,50 +1,38 @@
1
  import copy
2
- import hashlib
3
  import os
4
- import re
5
- # import spaces
6
  import subprocess
7
  import torch
8
- import PIL
9
 
10
- from pathlib import Path
11
  from threading import Thread
12
- from typing import List, Optional, Tuple
13
  from urllib.parse import urlparse
14
  from PIL import Image
15
 
16
  import gradio as gr
17
- from gradio import processing_utils
18
  from gradio_client.client import DEFAULT_TEMP_DIR
19
- from transformers import AutoProcessor, AutoModelForCausalLM, TextIteratorStreamer, logging
20
  from transformers.image_utils import to_numpy_array, PILImageResampling, ChannelDimension
21
  from transformers.image_transforms import resize, to_channel_dimension_format
22
 
23
- # subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
24
 
25
  DEVICE = torch.device("cuda")
26
  MODELS = {
27
- "284 - neftune - opt 18'500": AutoModelForCausalLM.from_pretrained(
28
  "HuggingFaceM4/idefics2",
29
  trust_remote_code=True,
30
  torch_dtype=torch.bfloat16,
31
  token=os.environ["HF_AUTH_TOKEN"],
32
- revision="1e05755c1c5cb2077a0f60b83ea1368c22a17282",
 
 
 
 
 
 
 
33
  ).to(DEVICE),
34
- # "279bis - baseline - opt 18'500": AutoModelForCausalLM.from_pretrained(
35
- # "HuggingFaceM4/idefics2",
36
- # trust_remote_code=True,
37
- # torch_dtype=torch.bfloat16,
38
- # token=os.environ["HF_AUTH_TOKEN"],
39
- # revision="5cd3c3a3eb5e0ea664f5ac09e73c9ef42da93a86",
40
- # ).to(DEVICE),
41
- # "286 - mix6 tables - opt 20'000": AutoModelForCausalLM.from_pretrained(
42
- # "HuggingFaceM4/idefics2",
43
- # trust_remote_code=True,
44
- # torch_dtype=torch.bfloat16,
45
- # token=os.environ["HF_AUTH_TOKEN"],
46
- # revision="b473d49caa964991b40b79fe7cb27d51d4d023f6",
47
- # ).to(DEVICE),
48
  # "285 - continued pretraining on text sft - opt 2'000": AutoModelForCausalLM.from_pretrained(
49
  # "HuggingFaceM4/idefics2",
50
  # trust_remote_code=True,
@@ -247,16 +235,16 @@ def format_user_prompt_with_im_history_and_system_conditioning(
247
  return resulting_list
248
 
249
 
250
- # @spaces.GPU(duration=180)
251
  def model_inference(
252
  user_prompt,
253
  chat_history,
 
254
  decoding_strategy,
255
  temperature,
256
  max_new_tokens,
257
  repetition_penalty,
258
  top_p,
259
- model_selector,
260
  ):
261
  if user_prompt["text"].strip() == "" and not user_prompt["files"]:
262
  gr.Error("Please input a query and optionally image(s).")
@@ -276,6 +264,7 @@ def model_inference(
276
  streamer = TextIteratorStreamer(
277
  PROCESSOR.tokenizer,
278
  skip_prompt=True,
 
279
  )
280
 
281
  # Common parameters to all decoding strategies
@@ -302,41 +291,86 @@ def model_inference(
302
 
303
  # Creating model inputs
304
  input_text, images = prompt_list_to_model_input(formated_prompt_list)
305
- print(input_text)
306
  inputs = create_model_inputs([input_text], [images])
307
  inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
308
  generation_args.update(inputs)
309
 
310
- print("1")
 
 
 
 
 
311
  thread = Thread(
312
  target=MODELS[model_selector].generate,
313
  kwargs=generation_args,
314
  )
315
- print("2")
316
  thread.start()
317
- acc_text = ""
318
  print("start generating")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
- for text_token in streamer:
321
- acc_text += text_token
322
- yield acc_text
323
- # last_turn = chat_history.pop(-1)
324
- # last_turn[-1] += acc_text
325
- # if last_turn[-1].endswith("\nUser"):
326
- # # Safeguard: sometimes (rarely), the model won't generate the token `<end_of_utterance>` and will go directly to generating `\nUser:`
327
- # # It will thus stop the generation on `\nUser:`. But when it exits, it will have already generated `\nUser`
328
- # # This post-processing ensures that we don't have an additional `\nUser` wandering around.
329
- # last_turn[-1] = last_turn[-1][:-5]
330
- # chat_history.append(last_turn)
331
- # yield "", None, chat_history
332
- # acc_text = ""
333
-
334
-
335
- with gr.Blocks() as demo:
336
  with gr.Row(elem_id="model_selector_row"):
337
  model_selector = gr.Dropdown(
338
  choices=MODELS.keys(),
339
- value="284 - neftune - opt 18'500",
340
  interactive=True,
341
  show_label=False,
342
  container=False,
@@ -344,57 +378,6 @@ with gr.Blocks() as demo:
344
  visible=True,
345
  )
346
 
347
- # Hyper-parameters for generation
348
- max_new_tokens = gr.Slider(
349
- minimum=8,
350
- maximum=1024,
351
- value=512,
352
- step=1,
353
- interactive=True,
354
- label="Maximum number of new tokens to generate",
355
- visible=False,
356
- )
357
- repetition_penalty = gr.Slider(
358
- minimum=0.01,
359
- maximum=5.0,
360
- value=1.0,
361
- step=0.01,
362
- interactive=True,
363
- label="Repetition penalty",
364
- info="1.0 is equivalent to no penalty",
365
- visible=False,
366
- )
367
- decoding_strategy = gr.Radio(
368
- [
369
- "Greedy",
370
- "Top P Sampling",
371
- ],
372
- value="Greedy",
373
- label="Decoding strategy",
374
- interactive=True,
375
- info="Higher values is equivalent to sampling more low-probability tokens.",
376
- visible=False,
377
- )
378
- temperature = gr.Slider(
379
- minimum=0.0,
380
- maximum=5.0,
381
- value=0.4,
382
- step=0.1,
383
- interactive=True,
384
- visible=False,
385
- label="Sampling temperature",
386
- info="Higher values will produce more diverse outputs.",
387
- )
388
- top_p = gr.Slider(
389
- minimum=0.01,
390
- maximum=0.99,
391
- value=0.8,
392
- step=0.01,
393
- interactive=True,
394
- visible=False,
395
- label="Top P",
396
- info="Higher values is equivalent to sampling more low-probability tokens.",
397
- )
398
  decoding_strategy.change(
399
  fn=lambda selection: gr.Slider(
400
  visible=(
@@ -415,8 +398,7 @@ with gr.Blocks() as demo:
415
  # examples=[{"text": "hello"}, {"text": "hola"}, {"text": "merhaba"}],
416
  title="Echo Bot",
417
  multimodal=True,
418
- additional_inputs=[decoding_strategy, temperature, max_new_tokens, repetition_penalty, top_p, model_selector],
419
  )
420
 
421
-
422
  demo.launch()
 
1
  import copy
 
2
  import os
3
+ import spaces
 
4
  import subprocess
5
  import torch
 
6
 
 
7
  from threading import Thread
8
+ from typing import List, Tuple
9
  from urllib.parse import urlparse
10
  from PIL import Image
11
 
12
  import gradio as gr
 
13
  from gradio_client.client import DEFAULT_TEMP_DIR
14
+ from transformers import AutoProcessor, AutoModelForCausalLM, TextIteratorStreamer
15
  from transformers.image_utils import to_numpy_array, PILImageResampling, ChannelDimension
16
  from transformers.image_transforms import resize, to_channel_dimension_format
17
 
18
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
19
 
20
  DEVICE = torch.device("cuda")
21
  MODELS = {
22
+ "282 - mix1 fixed - opt 23'000": AutoModelForCausalLM.from_pretrained(
23
  "HuggingFaceM4/idefics2",
24
  trust_remote_code=True,
25
  torch_dtype=torch.bfloat16,
26
  token=os.environ["HF_AUTH_TOKEN"],
27
+ revision="a1bc6a2b0f74cde25844144f602dde2808a564d9",
28
+ ).to(DEVICE),
29
+ "286 - mix6 tables - opt 20'000": AutoModelForCausalLM.from_pretrained(
30
+ "HuggingFaceM4/idefics2",
31
+ trust_remote_code=True,
32
+ torch_dtype=torch.bfloat16,
33
+ token=os.environ["HF_AUTH_TOKEN"],
34
+ revision="b473d49caa964991b40b79fe7cb27d51d4d023f6",
35
  ).to(DEVICE),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  # "285 - continued pretraining on text sft - opt 2'000": AutoModelForCausalLM.from_pretrained(
37
  # "HuggingFaceM4/idefics2",
38
  # trust_remote_code=True,
 
235
  return resulting_list
236
 
237
 
238
+ @spaces.GPU(duration=180)
239
  def model_inference(
240
  user_prompt,
241
  chat_history,
242
+ model_selector,
243
  decoding_strategy,
244
  temperature,
245
  max_new_tokens,
246
  repetition_penalty,
247
  top_p,
 
248
  ):
249
  if user_prompt["text"].strip() == "" and not user_prompt["files"]:
250
  gr.Error("Please input a query and optionally image(s).")
 
264
  streamer = TextIteratorStreamer(
265
  PROCESSOR.tokenizer,
266
  skip_prompt=True,
267
+ timeout=5.,
268
  )
269
 
270
  # Common parameters to all decoding strategies
 
291
 
292
  # Creating model inputs
293
  input_text, images = prompt_list_to_model_input(formated_prompt_list)
 
294
  inputs = create_model_inputs([input_text], [images])
295
  inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
296
  generation_args.update(inputs)
297
 
298
+ # # The regular non streaming generation mode
299
+ # _ = generation_args.pop("streamer")
300
+ # generated_ids = MODELS[model_selector].generate(**generation_args)
301
+ # generated_text = PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
302
+ # return generated_text
303
+
304
  thread = Thread(
305
  target=MODELS[model_selector].generate,
306
  kwargs=generation_args,
307
  )
 
308
  thread.start()
309
+
310
  print("start generating")
311
+ acc_text = ""
312
+ try:
313
+ for text_token in streamer:
314
+ acc_text += text_token
315
+ yield acc_text
316
+ except Exception as e:
317
+ print("error")
318
+ gr.Error(e)
319
+ print("success")
320
+
321
+
322
+ # Hyper-parameters for generation
323
+ max_new_tokens = gr.Slider(
324
+ minimum=8,
325
+ maximum=1024,
326
+ value=512,
327
+ step=1,
328
+ interactive=True,
329
+ label="Maximum number of new tokens to generate",
330
+ )
331
+ repetition_penalty = gr.Slider(
332
+ minimum=0.01,
333
+ maximum=5.0,
334
+ value=1.0,
335
+ step=0.01,
336
+ interactive=True,
337
+ label="Repetition penalty",
338
+ info="1.0 is equivalent to no penalty",
339
+ )
340
+ decoding_strategy = gr.Radio(
341
+ [
342
+ "Greedy",
343
+ "Top P Sampling",
344
+ ],
345
+ value="Greedy",
346
+ label="Decoding strategy",
347
+ interactive=True,
348
+ info="Higher values is equivalent to sampling more low-probability tokens.",
349
+ )
350
+ temperature = gr.Slider(
351
+ minimum=0.0,
352
+ maximum=5.0,
353
+ value=0.4,
354
+ step=0.1,
355
+ interactive=True,
356
+ label="Sampling temperature",
357
+ info="Higher values will produce more diverse outputs.",
358
+ )
359
+ top_p = gr.Slider(
360
+ minimum=0.01,
361
+ maximum=0.99,
362
+ value=0.8,
363
+ step=0.01,
364
+ interactive=True,
365
+ label="Top P",
366
+ info="Higher values is equivalent to sampling more low-probability tokens.",
367
+ )
368
 
369
+ with gr.Blocks(fill_height=True) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  with gr.Row(elem_id="model_selector_row"):
371
  model_selector = gr.Dropdown(
372
  choices=MODELS.keys(),
373
+ value=list(MODELS.keys())[0],
374
  interactive=True,
375
  show_label=False,
376
  container=False,
 
378
  visible=True,
379
  )
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  decoding_strategy.change(
382
  fn=lambda selection: gr.Slider(
383
  visible=(
 
398
  # examples=[{"text": "hello"}, {"text": "hola"}, {"text": "merhaba"}],
399
  title="Echo Bot",
400
  multimodal=True,
401
+ additional_inputs=[model_selector, decoding_strategy, temperature, max_new_tokens, repetition_penalty, top_p],
402
  )
403
 
 
404
  demo.launch()