Pectics committed on
Commit dc15a3f · verified · 1 Parent(s): af0c8f0

Update app.py

Files changed (1):
  1. app.py (+22 −15)
app.py CHANGED
@@ -1,26 +1,40 @@
-from gradio import ChatInterface, Textbox, Slider
-from spaces import GPU
 from threading import Thread
-from torch import bfloat16
 from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
+
+from gradio import ChatInterface, Textbox, Slider
+from spaces import GPU
+
 from qwen_vl_utils import process_vision_info
 
 model_path = "Pectics/Softie-VL-7B-250123"
 
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
-    torch_dtype=bfloat16,
-    attn_implementation="flash_attention_2",
+    torch_dtype="auto",
     device_map="auto",
+    attn_implementation="flash_attention_2",
 )
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 
 @GPU
-def infer(inputs: BatchFeature, streamer, kwargs: dict):
+def infer(
+    inputs: BatchFeature,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+):
     inputs = inputs.to("cuda")
-    thread = Thread(target=model.generate, kwargs={**inputs, **kwargs})
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    thread = Thread(target=model.generate, kwargs=kwargs)
     thread.start()
     response = ""
     for token in streamer:
@@ -48,14 +62,7 @@ def respond(
         padding = True,
         return_tensors = "pt",
     )
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    kwargs = dict(
-        streamer=streamer,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-    )
-    for response in infer(inputs, streamer, kwargs):
+    for response in infer(inputs, max_tokens, temperature, top_p):
         yield response
 
 app = ChatInterface(
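
The substance of the commit: the TextIteratorStreamer and the generation kwargs are now built inside the @GPU-decorated infer() rather than in respond(), so everything model.generate touches is constructed where it actually runs, and respond() only forwards the sampling parameters. The underlying idiom is the standard transformers streaming pattern: generate() blocks until completion, so it runs in a background thread while the caller drains the streamer. Below is a minimal sketch of that pattern, not the Space's code; it assumes a tiny stand-in checkpoint ("sshleifer/tiny-gpt2", chosen only so the sketch runs anywhere) in place of the Qwen2-VL model and CUDA setup above.

    # Sketch of the generate-in-a-thread streaming idiom used by infer() above.
    # Assumption: "sshleifer/tiny-gpt2" is a stand-in model for illustration;
    # the Space itself loads Qwen2VLForConditionalGeneration on CUDA.
    from threading import Thread

    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
    model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

    def stream_reply(prompt: str, max_tokens: int = 32):
        inputs = tokenizer(prompt, return_tensors="pt")
        # Streamer created next to generate(), mirroring the commit's move of
        # this setup into infer().
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_tokens)
        # generate() blocks until done, so it runs in a worker thread while
        # this generator yields text as the streamer produces it.
        Thread(target=model.generate, kwargs=kwargs).start()
        response = ""
        for piece in streamer:
            response += piece
            yield response  # cumulative text, the shape gr.ChatInterface expects

    for partial in stream_reply("Hello,"):
        print(partial)

Yielding the accumulated string rather than each fragment matches the response = "" / for token in streamer loop visible in the diff: gr.ChatInterface re-renders the full message on every yield.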