Spaces: Running on Zero
Update app.py
app.py CHANGED
```diff
@@ -16,7 +16,6 @@ import cv2
 from transformers import (
     Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
-    Gemma3nForConditionalGeneration,
     AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
```
```diff
@@ -57,11 +56,12 @@ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-
-
-
-
-
+
+# Load Gemma3n-E4B-it
+MODEL_ID_G = "google/gemma-3n-E4B-it"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
+model_g = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID_G,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
```
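This hunk swaps the model-specific `Gemma3nForConditionalGeneration` loader (removed from the imports above) for the generic Auto classes, which resolve the architecture from the checkpoint config. A minimal standalone sketch of the same load pattern, assuming a transformers release with Gemma 3n support and using the model ID from the diff:

```python
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

# AutoModelForImageTextToText picks the concrete class (here Gemma 3n) from
# the checkpoint's config.json, so no architecture-specific import is needed.
MODEL_ID_G = "google/gemma-3n-E4B-it"
processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
model_g = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_G,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()
```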
```diff
@@ -103,12 +103,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "DREX-062225-7B-exp":
         processor = processor_x
         model = model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor = processor_o
-        model = model_o
     elif model_name == "Typhoon-OCR-3B":
         processor = processor_t
         model = model_t
+    elif model_name == "Gemma3n-E4B-it":
+        processor = processor_g
+        model = model_g
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
```
```diff
@@ -117,24 +117,27 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
-    messages = [{
-
-
-
-
-
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
+    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
         return_tensors="pt",
-        padding=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
```
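For context, the generation path above is transformers' standard threaded-streaming pattern: `generate` blocks until completion, so it runs on a worker thread while the main thread consumes the `TextIteratorStreamer` incrementally. A self-contained sketch of that pattern with a small text-only model (the Space itself uses its vision-language checkpoints and a processor instead of a tokenizer):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# gpt2 is a stand-in purely to keep the sketch small and runnable.
tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate runs on a background thread; the streamer yields decoded text
# chunks to the main thread as tokens are produced.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32},
)
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    print(buffer)  # in the Space, this buffer is `yield`ed to Gradio instead
thread.join()
```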
```diff
@@ -159,12 +162,12 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "DREX-062225-7B-exp":
         processor = processor_x
         model = model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor = processor_o
-        model = model_o
     elif model_name == "Typhoon-OCR-3B":
         processor = processor_t
         model = model_t
+    elif model_name == "Gemma3n-E4B-it":
+        processor = processor_g
+        model = model_g
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
```
```diff
@@ -174,14 +177,16 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
-
-
-
-
-
-
-
-
+    content = [{"type": "text", "text": text}]
+    if model_name == "Gemma3n-E4B-it":
+        for frame, _ in frames:
+            content.append({"type": "image", "image": frame})
+    else:
+        for frame in frames:
+            image, timestamp = frame
+            content.append({"type": "text", "text": f"Frame {timestamp}:"})
+            content.append({"type": "image", "image": image})
+    messages = [{"role": "user", "content": content}]
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
```
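`downsample_video` is defined outside the hunks shown, but from the call sites above it must return `(frame, timestamp)` pairs whose frames the chat template can consume as images. A hypothetical sketch consistent with that contract, using the file's cv2 import (the Space's actual helper may sample frames differently):

```python
import cv2
from PIL import Image


def downsample_video(video_path: str, num_frames: int = 10):
    """Hypothetical helper: sample num_frames evenly spaced frames and
    return them as (PIL.Image, timestamp_in_seconds) pairs."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = []
    for i in range(num_frames):
        idx = int(i * max(total - 1, 0) / max(num_frames - 1, 1))
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            continue
        # cv2 decodes to BGR; convert to RGB before wrapping in PIL.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append((Image.fromarray(frame), round(idx / fps, 2)))
    cap.release()
    return frames
```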
```diff
@@ -276,7 +281,7 @@ with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
         markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
         model_choice = gr.Radio(
-            choices=["DREX-062225-7B-exp", "
+            choices=["DREX-062225-7B-exp", "VIREX-062225-7B-exp", "Typhoon-OCR-3B", "Gemma3n-E4B-it"],
             label="Select Model",
             value="DREX-062225-7B-exp"
         )
```
```diff
@@ -285,7 +290,7 @@ with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
     gr.Markdown("> [DREX-062225-7B-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
     gr.Markdown("> [VIREX-062225-7B-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
     gr.Markdown("> [Typhoon-OCR-3B](https://huggingface.co/scb10x/typhoon-ocr-3b): a bilingual document parsing model built specifically for real-world documents in thai and english, inspired by models like olmocr, based on qwen2.5-vl-instruction. this model is intended to be used with a specific prompt only.")
-    gr.Markdown("> [
+    gr.Markdown("> [Gemma3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it): A multimodal model capable of processing images and videos for various tasks.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
```
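The diff cuts off mid-call at `image_submit.click(`, so the actual argument list isn't shown. As a rough illustration only, the standard Gradio pattern for wiring a streaming generator to two outputs looks like the sketch below; every component name and the stub function here are hypothetical stand-ins, not the Space's real ones:

```python
import gradio as gr


def generate_image(model_name, text, image):
    # Stub standing in for the Space's real streaming generator: Gradio
    # pushes each yielded (raw, markdown) pair into the two outputs in turn.
    yield f"[{model_name}] {text}", f"**[{model_name}]** {text}"


with gr.Blocks() as demo:
    model_choice = gr.Radio(
        choices=["DREX-062225-7B-exp", "Gemma3n-E4B-it"],
        label="Select Model",
        value="DREX-062225-7B-exp",
    )
    image_query = gr.Textbox(label="Query")
    image_upload = gr.Image(type="pil", label="Image")
    output = gr.Textbox(label="Raw Result")
    markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
    image_submit = gr.Button("Submit")

    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload],
        outputs=[output, markdown_output],
    )

demo.launch()
```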