prithivMLmods committed on
Commit
d8019dd
·
verified ·
1 Parent(s): 8f681fc
Files changed (7) hide show
  1. .gitattributes +2 -0
  2. app.py +161 -0
  3. images/1.png +3 -0
  4. images/2.jpg +0 -0
  5. images/3.jpg +3 -0
  6. images/4.png +0 -0
  7. requirements.txt +16 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/1.png filter=lfs diff=lfs merge=lfs -text
37
+ images/3.jpg filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import threading
4
+ import gradio as gr
5
+ import spaces
6
+ import torch
7
+ from PIL import Image
8
+ from transformers import (
9
+ AutoModelForImageTextToText,
10
+ AutoProcessor,
11
+ TextIteratorStreamer,
12
+ )
13
+ from transformers.image_utils import load_image
14
+
15
# Limits for text generation (the input-length cap can be overridden via env var)
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Use the first CUDA device when one is available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _load_checkpoint(model_id):
    """Load the processor and model for ``model_id``.

    The model is loaded in bfloat16, moved to ``device`` and put in eval mode.

    Returns:
        A ``(processor, model)`` pair.
    """
    loaded_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    loaded_model = AutoModelForImageTextToText.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype="bfloat16",
    )
    return loaded_processor, loaded_model.to(device).eval()


# LFM2-VL-1.6B
MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
processor_m, model_m = _load_checkpoint(MODEL_ID_M)

# LFM2-VL-450M
MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
processor_t, model_t = _load_checkpoint(MODEL_ID_T)
39
+
40
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """Stream a response from the selected model for one image and a text query.

    Args:
        model_name: Either "LFM2-VL-1.6B" or "LFM2-VL-450M".
        text: The user's query about the image.
        image: The uploaded PIL image, or None when nothing was uploaded.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k cutoff forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Yields:
        ``(buffer, buffer)`` tuples holding the text accumulated so far; the
        same string is emitted twice because the caller wires two outputs
        (raw textbox and Markdown view). An error message is yielded once,
        in the same shape, for an unknown model or a missing image.
    """
    # Built per call so the module-level processors/models are looked up lazily.
    available = {
        "LFM2-VL-1.6B": (processor_m, model_m),
        "LFM2-VL-450M": (processor_t, model_t),
    }
    if model_name not in available:
        yield "Invalid model selected.", "Invalid model selected."
        return
    processor, model = available[model_name]

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): truncation is disabled, so max_length presumably does not
    # shorten the prompt here — confirm before relying on it as an input cap.
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # Forward the sampling controls; previously they were accepted but never
    # reached generate(), so the "Advanced options" sliders had no effect.
    # Slider values may arrive as floats, hence the explicit casts.
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_new_tokens),
        "do_sample": True,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "top_k": int(top_k),
        "repetition_penalty": float(repetition_penalty),
    }
    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer and yields partial text.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer
    # The streamer is exhausted once generation ends; reap the worker thread.
    thread.join()
89
+
90
# Example (query, image path) pairs offered below the inputs in the UI
_EXAMPLE_PAIRS = (
    ("According to this diagram, where do severe droughts occur?", "images/1.png"),
    ("Could you describe this image?", "images/2.jpg"),
    ("Provide a description of this image.", "images/3.jpg"),
    ("Explain the movie shot in detail.", "images/4.png"),
)
image_examples = [list(pair) for pair in _EXAMPLE_PAIRS]

# Custom CSS: colours for the submit button and a bordered output panel
css = """
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
.canvas-output {
border: 2px solid #4682B4;
border-radius: 10px;
padding: 20px;
}
"""
113
+
114
# Static Markdown shown in the UI, named here to keep the layout code readable
_TITLE_MD = "# **LFM2-VL by [LiquidAI](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)**"
_INFO_LINKS_MD = "**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)"
_MODEL_BLURB_MD = "> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, featuring models with 450M and 1.6B parameters designed for efficient processing of both text and images at native resolutions up to 512×512, ideal for low-latency edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid speeds (2× faster than existing VLMs on GPU)"
_LICENSE_BLURB_MD = "> Competitive accuracy, and dynamic image tokenization for scalable throughput, while supporting 32,768 text tokens and English language generation, and is best fine-tuned for targeted use cases using provided supervised fine-tuning tools, all released under the LFM Open License v1.0 for research and deployment scenarios not requiring safety-critical guarantees."

# Build the Gradio interface.
# NOTE(review): the nesting below was reconstructed from a flattened view of
# the file — confirm the column/accordion layout against the real app.py.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(_TITLE_MD)
    with gr.Row():
        # Left column: query, image, submit button, examples, sampling options
        with gr.Column():
            image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            gr.Examples(examples=image_examples, inputs=[image_query, image_upload])

            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=1,
                    maximum=MAX_MAX_NEW_TOKENS,
                    step=1,
                    value=DEFAULT_MAX_NEW_TOKENS,
                )
                temperature = gr.Slider(
                    label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9
                )
                top_k = gr.Slider(
                    label="Top-k", minimum=1, maximum=1000, step=1, value=50
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2
                )

        # Right column: streamed output, model selector, model information
        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
                with gr.Accordion("(Result.md)", open=False):
                    markdown_output = gr.Markdown(label="(Result.md)")

            model_choice = gr.Dropdown(
                choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
                label="Select Model",
                value="LFM2-VL-1.6B",
            )

            gr.Markdown(_INFO_LINKS_MD)
            gr.Markdown(_MODEL_BLURB_MD)
            gr.Markdown(_LICENSE_BLURB_MD)

    # Wire the submit button: inputs are passed positionally to generate_image,
    # whose yielded pairs fill the raw textbox and the Markdown view.
    submit_inputs = [
        model_choice,
        image_query,
        image_upload,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    ]
    submit_outputs = [output, markdown_output]
    image_submit.click(fn=generate_image, inputs=submit_inputs, outputs=submit_outputs)

if __name__ == "__main__":
    queued_demo = demo.queue(max_size=50)
    queued_demo.launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
images/1.png ADDED

Git LFS Details

  • SHA256: 443e28cba26ab4a08e2d4bcc311129c5818608ff8d4976c444bfcdd9918225ca
  • Pointer size: 131 Bytes
  • Size of remote file: 310 kB
images/2.jpg ADDED
images/3.jpg ADDED

Git LFS Details

  • SHA256: 0bb7318e890a7527f3c900531850d3f3b4786c6ae2c43939970e6884553e57ba
  • Pointer size: 131 Bytes
  • Size of remote file: 870 kB
images/4.png ADDED
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ av
2
+ peft
3
+ torch
4
+ spaces
5
+ gradio
6
+ pillow
7
+ requests
8
+ accelerate
9
+ safetensors
10
+ torchvision
11
+ transformers
12
+ huggingface_hub
13
+ opencv-python
14
+ sentencepiece
15
+ qwen-vl-utils
16
+ transformers-stream-generator