prithivMLmods committed
Commit f5d475f · verified · 1 Parent(s): 1e727a6

Update app.py

Files changed (1)
  1. app.py +40 -35

app.py CHANGED
@@ -16,7 +16,6 @@ import cv2
 from transformers import (
     Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
-    Gemma3nForConditionalGeneration,
     AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
@@ -57,11 +56,12 @@ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load olmOCR-7B-0225-preview
-MODEL_ID_O = "google/gemma-3n-E4B-it"
-processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
-model_o = AutoModelForImageTextToText.from_pretrained(
-    MODEL_ID_O,
+
+# Load Gemma3n-E4B-it
+MODEL_ID_G = "google/gemma-3n-E4B-it"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
+model_g = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID_G,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
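
For context, a minimal standalone sketch of the new loading and inference path. Assumptions not in the diff: a CUDA device, a transformers build with Gemma 3n support, and a hypothetical local image `sample.png`.

```python
# Minimal sketch: load Gemma3n-E4B-it via the Auto classes and run one
# image + text prompt. Assumes CUDA and a transformers release with
# Gemma 3n support; "sample.png" is a hypothetical local file.
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID_G = "google/gemma-3n-E4B-it"
processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
model_g = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_G,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda").eval()

messages = [{"role": "user", "content": [
    {"type": "image", "image": Image.open("sample.png")},
    {"type": "text", "text": "Describe this image."},
]}]
inputs = processor_g.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True,
    return_dict=True, return_tensors="pt",
).to("cuda")
with torch.inference_mode():
    output_ids = model_g.generate(**inputs, max_new_tokens=128)
# Strip the prompt tokens before decoding the reply.
print(processor_g.decode(output_ids[0][inputs["input_ids"].shape[1]:],
                         skip_special_tokens=True))
```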
@@ -103,12 +103,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "DREX-062225-7B-exp":
         processor = processor_x
         model = model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor = processor_o
-        model = model_o
     elif model_name == "Typhoon-OCR-3B":
         processor = processor_t
         model = model_t
+    elif model_name == "Gemma3n-E4B-it":
+        processor = processor_g
+        model = model_g
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
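
An aside on this dispatch (the same `elif` chain appears again in `generate_video` below): a lookup table would keep it in one place as models come and go. A possible refactor, not part of this commit; the `processor_m`/`model_m` names for VIREX are assumed, since the diff never shows them.

```python
# Possible refactor (not in this commit): map model names to their
# (processor, model) pairs so adding a model is a one-line change.
# processor_m/model_m for VIREX is an assumed name, not shown in the diff.
MODEL_REGISTRY = {
    "DREX-062225-7B-exp": (processor_x, model_x),
    "VIREX-062225-7B-exp": (processor_m, model_m),
    "Typhoon-OCR-3B": (processor_t, model_t),
    "Gemma3n-E4B-it": (processor_g, model_g),
}

pair = MODEL_REGISTRY.get(model_name)
if pair is None:
    yield "Invalid model selected.", "Invalid model selected."
    return
processor, model = pair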
@@ -117,24 +117,27 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": text},
-        ]
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
+    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
         return_tensors="pt",
-        padding=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
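
Two things change here: preprocessing collapses into a single `apply_chat_template(..., tokenize=True, return_dict=True)` call, and the rewritten `generation_kwargs` finally wires the UI sampling sliders (`temperature`, `top_p`, `top_k`, `repetition_penalty`) into `generate`, where before only `max_new_tokens` was passed. The streaming pattern itself is unchanged. A minimal text-only sketch of that pattern, using a small placeholder checkpoint purely for illustration:

```python
# Minimal sketch of the TextIteratorStreamer pattern used above, shown
# text-only with a small placeholder checkpoint ("gpt2") for illustration.
from threading import Thread
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          TextIteratorStreamer)

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("Streaming works by", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
# generate() blocks until done, so run it on a worker thread and read
# the streamer on the main thread as tokens arrive.
thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 40})
thread.start()
buffer = ""
for chunk in streamer:
    buffer += chunk
    print(buffer)  # the app yields this buffer to the Gradio UI instead
thread.join()
```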
@@ -159,12 +162,12 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "DREX-062225-7B-exp":
         processor = processor_x
         model = model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor = processor_o
-        model = model_o
     elif model_name == "Typhoon-OCR-3B":
         processor = processor_t
         model = model_t
+    elif model_name == "Gemma3n-E4B-it":
+        processor = processor_g
+        model = model_g
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -174,14 +177,16 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
+    content = [{"type": "text", "text": text}]
+    if model_name == "Gemma3n-E4B-it":
+        for frame, _ in frames:
+            content.append({"type": "image", "image": frame})
+    else:
+        for frame in frames:
+            image, timestamp = frame
+            content.append({"type": "text", "text": f"Frame {timestamp}:"})
+            content.append({"type": "image", "image": image})
+    messages = [{"role": "user", "content": content}]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
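
Note the new branch: Gemma 3n receives the frames as bare images, while the Qwen-based models keep their per-frame `Frame {timestamp}:` text labels. `downsample_video` itself is defined earlier in app.py and is not shown in this diff; for context, a sketch of what such a helper typically returns, `(PIL image, timestamp)` pairs, under the assumption of evenly spaced frames:

```python
# Sketch of a downsample_video-style helper (the real one is defined
# earlier in app.py and not shown in this diff): grab N evenly spaced
# frames and return (PIL image, timestamp-in-seconds) pairs.
import cv2
import numpy as np
from PIL import Image

def downsample_video_sketch(video_path: str, num_frames: int = 10):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # guard against 0 fps metadata
    frames = []
    for idx in np.linspace(0, total - 1, num_frames, dtype=int):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if not ok:
            continue
        # OpenCV decodes BGR; PIL expects RGB.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append((image, round(idx / fps, 2)))
    cap.release()
    return frames
```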
@@ -276,7 +281,7 @@ with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
     model_choice = gr.Radio(
-        choices=["DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "VIREX-062225-7B-exp", "Typhoon-OCR-3B"],
+        choices=["DREX-062225-7B-exp", "VIREX-062225-7B-exp", "Typhoon-OCR-3B", "Gemma3n-E4B-it"],
         label="Select Model",
         value="DREX-062225-7B-exp"
     )
@@ -285,7 +290,7 @@ with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
     gr.Markdown("> [DREX-062225-7B-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
     gr.Markdown("> [VIREX-062225-7B-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
     gr.Markdown("> [Typhoon-OCR-3B](https://huggingface.co/scb10x/typhoon-ocr-3b): a bilingual document parsing model built specifically for real-world documents in thai and english, inspired by models like olmocr, based on qwen2.5-vl-instruction. this model is intended to be used with a specific prompt only.")
-    gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
+    gr.Markdown("> [Gemma3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it): A multimodal model capable of processing images and videos for various tasks.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
 