Pectics committed
Commit 1325e72 · verified · 1 parent: 2941c6d

Decrease GPU usage

Files changed (1): app.py (+25 -33)
app.py CHANGED
@@ -1,15 +1,15 @@
-from gradio import ChatInterface, Textbox, Slider
-from spaces import GPU
 from threading import Thread
-from torch import bfloat16
-from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor
+from spaces import GPU
+from gradio import ChatInterface, Textbox, Slider
+
+from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
 from qwen_vl_utils import process_vision_info
 
 model_path = "Pectics/Softie-VL-7B-250123"
 
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
-    torch_dtype=bfloat16,
+    torch_dtype="auto",
     attn_implementation="flash_attention_2",
     device_map="auto",
 )
@@ -18,53 +18,45 @@ max_pixels = 1280 * 28 * 28
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 
 @GPU
-def infer(
-    messages,
+def infer(inputs: BatchFeature, **kwargs) -> None:
+    inputs = inputs.to("cuda")
+    model.generate(**inputs, **kwargs)
+
+def respond(
+    message,
+    history,
+    system_message,
     max_tokens,
     temperature,
     top_p,
 ):
+    messages = [{"role": "system", "content": system_message}]
+    for m in history:
+        messages.append({"role": m["role"], "content": m["content"]})
+    messages.append({"role": "user", "content": message})
     text_inputs = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
-        text=[text_inputs],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to("cuda")
-
+        text = [text_inputs],
+        images = image_inputs,
+        videos = video_inputs,
+        padding = True,
+        return_tensors = "pt",
+    )
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     kwargs = dict(
-        **inputs,
+        inputs=inputs,
         streamer=streamer,
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
     )
-    thread = Thread(target=model.generate, kwargs=kwargs)
-    thread.start()
-
+    Thread(target=infer, kwargs=kwargs).start()
     response = ""
     for token in streamer:
         response += token
         yield response
 
-def respond(
-    message,
-    history,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for m in history:
-        messages.append({"role": m["role"], "content": m["content"]})
-    messages.append({"role": "user", "content": message})
-    for response in infer(messages, max_tokens, temperature, top_p):
-        yield response
-
 app = ChatInterface(
     respond,
     type="messages",