prithivMLmods committed (verified)
Commit d959d44 · 1 Parent(s): d8019dd

Update app.py

Files changed (1): app.py (+160 -160)
app.py CHANGED
@@ -1,161 +1,161 @@
- import os
- import time
- import threading
- import gradio as gr
- import spaces
- import torch
- from PIL import Image
- from transformers import (
-     AutoModelForImageTextToText,
-     AutoProcessor,
-     TextIteratorStreamer,
- )
- from transformers.image_utils import load_image
-
- # Constants for text generation
- MAX_MAX_NEW_TOKENS = 4096
- DEFAULT_MAX_NEW_TOKENS = 1024
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
- # Load LFM2-VL-1.6B
- MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_M,
-     trust_remote_code=True,
-     torch_dtype="bfloat16",
- ).to(device).eval()
-
- # Load LFM2-VL-450M
- MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
- processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
- model_t = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_T,
-     trust_remote_code=True,
-     torch_dtype="bfloat16",
- ).to(device).eval()
-
- @spaces.GPU
- def generate_image(model_name: str, text: str, image: Image.Image,
-                    max_new_tokens: int = 1024,
-                    temperature: float = 0.6,
-                    top_p: float = 0.9,
-                    top_k: int = 50,
-                    repetition_penalty: float = 1.2):
-     """
-     Generate responses using the selected model for image input.
-     """
-     if model_name == "LFM2-VL-1.6B":
-         processor = processor_m
-         model = model_m
-     elif model_name == "LFM2-VL-450M":
-         processor = processor_t
-         model = model_t
-     else:
-         yield "Invalid model selected.", "Invalid model selected."
-         return
-
-     if image is None:
-         yield "Please upload an image.", "Please upload an image."
-         return
-
-     messages = [{
-         "role": "user",
-         "content": [
-             {"type": "image", "image": image},
-             {"type": "text", "text": text},
-         ]
-     }]
-     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = processor(
-         text=[prompt_full],
-         images=[image],
-         return_tensors="pt",
-         padding=True,
-         truncation=False,
-         max_length=MAX_INPUT_TOKEN_LENGTH
-     ).to(device)
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         time.sleep(0.01)
-         yield buffer, buffer
-
- # Define examples for image inference
- image_examples = [
-     ["According to this diagram, where do severe droughts occur?", "images/1.png"],
-     ["Could you describe this image?", "images/2.jpg"],
-     ["Provide a description of this image.", "images/3.jpg"],
-     ["Explain the movie shot in detail.", "images/4.png"],
- ]
-
- # Updated CSS with model choice highlighting
- css = """
- .submit-btn {
-     background-color: #2980b9 !important;
-     color: white !important;
- }
- .submit-btn:hover {
-     background-color: #3498db !important;
- }
- .canvas-output {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
- }
- """
-
- # Create the Gradio Interface
- with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# **LFM2-VL by [LiquidAI](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)**")
-     with gr.Row():
-         with gr.Column():
-             image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
-             image_upload = gr.Image(type="pil", label="Image")
-             image_submit = gr.Button("Submit", elem_classes="submit-btn")
-             gr.Examples(
-                 examples=image_examples,
-                 inputs=[image_query, image_upload]
-             )
-
-             with gr.Accordion("Advanced options", open=False):
-                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-         with gr.Column():
-             with gr.Column(elem_classes="canvas-output"):
-                 gr.Markdown("## Output")
-                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
-                 with gr.Accordion("(Result.md)", open=False):
-                     markdown_output = gr.Markdown(label="(Result.md)")
-
-             model_choice = gr.Dropdown(
-                 choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
-                 label="Select Model",
-                 value="LFM2-VL-1.6B"
-             )
-
-             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
-             gr.Markdown("> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, featuring models with 450M and 1.6B parameters designed for efficient processing of both text and images at native resolutions up to 512×512, ideal for low-latency edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid speeds (2× faster than existing VLMs on GPU)")
-             gr.Markdown("> Competitive accuracy, and dynamic image tokenization for scalable throughput, while supporting 32,768 text tokens and English language generation, and is best fine-tuned for targeted use cases using provided supervised fine-tuning tools, all released under the LFM Open License v1.0 for research and deployment scenarios not requiring safety-critical guarantees.")
-
-     # Define the submit button action
-     image_submit.click(fn=generate_image,
-                        inputs=[
-                            model_choice, image_query, image_upload,
-                            max_new_tokens, temperature, top_p, top_k,
-                            repetition_penalty
-                        ],
-                        outputs=[output, markdown_output])
-
- if __name__ == "__main__":
+ import os
+ import time
+ import threading
+ import gradio as gr
+ import spaces
+ import torch
+ from PIL import Image
+ from transformers import (
+     AutoModelForImageTextToText,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+ from transformers.image_utils import load_image
+
+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 4096
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # Load LFM2-VL-1.6B
+ MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = AutoModelForImageTextToText.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype="bfloat16",
+ ).to(device).eval()
+
+ # Load LFM2-VL-450M
+ MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
+ processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
+ model_t = AutoModelForImageTextToText.from_pretrained(
+     MODEL_ID_T,
+     trust_remote_code=True,
+     torch_dtype="bfloat16",
+ ).to(device).eval()
+
+ @spaces.GPU
+ def generate_image(model_name: str, text: str, image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generate responses using the selected model for image input.
+     """
+     if model_name == "LFM2-VL-1.6B":
+         processor = processor_m
+         model = model_m
+     elif model_name == "LFM2-VL-450M":
+         processor = processor_t
+         model = model_t
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if image is None:
+         yield "Please upload an image.", "Please upload an image."
+         return
+
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": text},
+         ]
+     }]
+     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt_full],
+         images=[image],
+         return_tensors="pt",
+         padding=True,
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer, buffer
+
+ # Define examples for image inference
+ image_examples = [
+     ["Explain the movie shot in detail.", "images/4.png"],
+     ["According to this diagram, where do severe droughts occur?", "images/1.png"],
+     ["Could you describe this image?", "images/2.jpg"],
+     ["Provide a description of this image.", "images/3.jpg"],
+ ]
+
+ # Updated CSS with model choice highlighting
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .canvas-output {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+ }
+ """
+
+ # Create the Gradio Interface
+ with gr.Blocks(css=css, theme="gstaff/sketch") as demo:
+     gr.Markdown("# **LFM2-VL by [LiquidAI](https://huggingface.co/LiquidAI)**")
+     with gr.Row():
+         with gr.Column():
+             image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
+             image_upload = gr.Image(type="pil", label="Image")
+             image_submit = gr.Button("Submit", elem_classes="submit-btn")
+             gr.Examples(
+                 examples=image_examples,
+                 inputs=[image_query, image_upload]
+             )
+
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
+         with gr.Column():
+             with gr.Column(elem_classes="canvas-output"):
+                 gr.Markdown("## Output")
+                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+                 with gr.Accordion("(Result.md)", open=False):
+                     markdown_output = gr.Markdown(label="(Result.md)")
+
+             model_choice = gr.Dropdown(
+                 choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
+                 label="Select Model",
+                 value="LFM2-VL-1.6B"
+             )
+
+             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
+             gr.Markdown("> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, featuring models with 450M and 1.6B parameters designed for efficient processing of both text and images at native resolutions up to 512×512, ideal for low-latency edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid speeds (2× faster than existing VLMs on GPU)")
+             gr.Markdown("> Competitive accuracy, and dynamic image tokenization for scalable throughput, while supporting 32,768 text tokens and English language generation, and is best fine-tuned for targeted use cases using provided supervised fine-tuning tools, all released under the LFM Open License v1.0 for research and deployment scenarios not requiring safety-critical guarantees.")
+
+     # Define the submit button action
+     image_submit.click(fn=generate_image,
+                        inputs=[
+                            model_choice, image_query, image_upload,
+                            max_new_tokens, temperature, top_p, top_k,
+                            repetition_penalty
+                        ],
+                        outputs=[output, markdown_output])
+
+ if __name__ == "__main__":
      demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
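
One detail worth noting in both sides of the diff: generate_image accepts temperature, top_p, top_k, and repetition_penalty (and the Advanced-options sliders are wired to them), but generation_kwargs is built from only the streamer and max_new_tokens, so the sliders never influence decoding. Below is a minimal sketch, not the committed code, of how those values could be forwarded using the standard transformers generate() sampling kwargs; the helper name is hypothetical, and do_sample=True is an assumption (temperature/top-p/top-k are no-ops under greedy decoding).

import threading
from transformers import TextIteratorStreamer

def start_streaming_generation(model, processor, inputs, max_new_tokens,
                               temperature, top_p, top_k, repetition_penalty):
    # Hypothetical drop-in for the generation_kwargs/thread lines inside
    # generate_image: same background-thread streaming setup, but the
    # sampling parameters are actually passed through to model.generate().
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,  # assumed; required for the knobs below to take effect
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    threading.Thread(target=model.generate, kwargs=generation_kwargs).start()
    return streamer

Inside generate_image, the two kwargs/thread lines would then collapse to a single call, e.g. streamer = start_streaming_generation(model, processor, inputs, max_new_tokens, temperature, top_p, top_k, repetition_penalty), with the streaming loop unchanged.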
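
Similarly, MAX_INPUT_TOKEN_LENGTH is read from the environment but has no effect as written: the processor is called with truncation=False, and Hugging Face tokenizers apply max_length only when truncation is enabled. A sketch of the same call with the cap actually enforced (only the truncation flag changes) follows; note that truncating a multimodal prompt can cut image placeholder tokens, which may be why the app leaves truncation off.

inputs = processor(
    text=[prompt_full],
    images=[image],
    return_tensors="pt",
    padding=True,
    truncation=True,  # enable so max_length below is honored
    max_length=MAX_INPUT_TOKEN_LENGTH,
).to(device)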