TestAgain committed on
Commit 169d4ae · 1 Parent(s): cdc0025
Files changed (3)
  1. app.py +104 -0
  2. example_images/document.png +0 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,104 @@
+ import gradio as gr
+ from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
+ from transformers.image_utils import load_image
+ from threading import Thread
+ import torch
+ import spaces
+
+ MODEL_ID = "TIGER-Lab/VL-Rethinker-7B"
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model = AutoModelForImageTextToText.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16
+ ).to("cuda").eval()
+
+ @spaces.GPU
+ def model_inference(input_dict, history):
+     text = input_dict["text"]
+     files = input_dict["files"]
+
+     """
+     Create the chat history.
+
+     Example history value:
+     [
+         [('pixel.png',), None],
+         ['ignore this image. just say "hi" and nothing else', 'Hi!'],
+         ['just say "hi" and nothing else', 'Hi!']
+     ]
+     """
+     all_images = []
+     current_message_images = []
+     messages = []
+
+     for val in history:
+         if val[0]:
+             if isinstance(val[0], str):
+                 messages.append({
+                     "role": "user",
+                     "content": [
+                         *[{"type": "image", "image": image} for image in current_message_images],
+                         {"type": "text", "text": val[0]},
+                     ],
+                 })
+                 current_message_images = []
+
+             else:
+                 # Load the images; they are attached to the first user text message that follows.
+                 current_message_images = [load_image(image) for image in val[0]]
+                 all_images += current_message_images
+
+         if val[1]:
+             messages.append({"role": "assistant", "content": val[1]})
+
+     current_message_images = [load_image(image) for image in files]
+     all_images += current_message_images
+     messages.append({
+         "role": "user",
+         "content": [
+             *[{"type": "image", "image": image} for image in current_message_images],
+             {"type": "text", "text": text},
+         ],
+     })
+
+     print(messages)
+
+     """
+     Generate and stream text.
+     """
+     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt],
+         images=all_images if all_images else None,
+         return_tensors="pt",
+         padding=True,
+     ).to("cuda")
+
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         yield buffer
+
+ examples = [
+     [{"text": "Solve this question.", "files": ["example_images/document.png"]}]
+ ]
+
+ demo = gr.ChatInterface(
+     fn=model_inference,
+     description="# **VL-Rethinker-7B**",
+     examples=examples,
+     fill_height=True,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+     stop_btn="Stop Generation",
+     multimodal=True,
+     cache_examples=False,
+ )
+
+ demo.launch(debug=True)
example_images/document.png ADDED
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio_client==1.3.0
+ qwen-vl-utils==0.0.2
+ transformers-stream-generator==0.0.4
+ torch==2.4.0
+ torchvision==0.19.0
+ git+https://github.com/huggingface/transformers.git
+ accelerate
+ av
+ opencv-python