prithivMLmods commited on
Commit
acfc9dc
·
verified ·
1 Parent(s): 147a2d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -14
app.py CHANGED
@@ -16,8 +16,7 @@ import cv2
16
  from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
  Qwen2VLForConditionalGeneration,
19
- Gemma3ForConditionalGeneration,
20
- LlavaForConditionalGeneration,
21
  AutoProcessor,
22
  TextIteratorStreamer,
23
  )
@@ -30,10 +29,10 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
30
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
 
33
- # Load gemma-3-4b-it
34
- MODEL_ID_M = "google/gemma-3-4b-it"
35
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
36
- model_m = Gemma3ForConditionalGeneration.from_pretrained(
37
  MODEL_ID_M,
38
  trust_remote_code=True,
39
  torch_dtype=torch.float16
@@ -49,14 +48,25 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
49
  ).to(device).eval()
50
 
51
  # Load Relaxed
52
- MODEL_ID_Z = "fancyfeast/llama-joycaption-beta-one-hf-llava"
53
  processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
54
- model_z = LlavaForConditionalGeneration.from_pretrained(
55
  MODEL_ID_Z,
56
  trust_remote_code=True,
57
  torch_dtype=torch.float16
58
  ).to(device).eval()
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  def downsample_video(video_path):
61
  """
62
  Downsamples the video to evenly spaced frames.
@@ -88,15 +98,18 @@ def generate_image(model_name: str, text: str, image: Image.Image,
88
  """
89
  Generates responses using the selected model for image input.
90
  """
91
- if model_name == "gemma-3-4b-it":
92
  processor = processor_m
93
  model = model_m
94
  elif model_name == "coreOCR-7B-050325-preview":
95
  processor = processor_x
96
  model = model_x
97
- elif model_name == "llama-joycaption":
98
  processor = processor_z
99
  model = model_z
 
 
 
100
  else:
101
  yield "Invalid model selected."
102
  return
@@ -141,15 +154,18 @@ def generate_video(model_name: str, text: str, video_path: str,
141
  """
142
  Generates responses using the selected model for video input.
143
  """
144
- if model_name == "gemma-3-4b-it":
145
  processor = processor_m
146
  model = model_m
147
  elif model_name == "coreOCR-7B-050325-preview":
148
  processor = processor_x
149
  model = model_x
150
- elif model_name == "llama-joycaption":
151
  processor = processor_z
152
  model = model_z
 
 
 
153
  else:
154
  yield "Invalid model selected."
155
  return
@@ -247,9 +263,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
247
  with gr.Column():
248
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
249
  model_choice = gr.Radio(
250
- choices=["gemma-3-4b-it", "coreOCR-7B-050325-preview", "llama-joycaption"],
251
  label="Select Model",
252
- value="gemma-3-4b-it"
253
  )
254
 
255
  image_submit.click(
@@ -264,4 +280,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
264
  )
265
 
266
  if __name__ == "__main__":
267
- demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
16
  from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
  Qwen2VLForConditionalGeneration,
19
+ Llama4ForConditionalGeneration,
 
20
  AutoProcessor,
21
  TextIteratorStreamer,
22
  )
 
29
 
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
 
32
+ # Load meta-llama/Llama-Guard-4-12B
33
+ MODEL_ID_M = "meta-llama/Llama-Guard-4-12B"
34
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
35
+ model_m = Llama4ForConditionalGeneration.from_pretrained(
36
  MODEL_ID_M,
37
  trust_remote_code=True,
38
  torch_dtype=torch.float16
 
48
  ).to(device).eval()
49
 
50
  # Load Relaxed
51
+ MODEL_ID_Z = "Qwen/Qwen2.5-VL-3B-Instruct"
52
  processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
53
+ model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
54
  MODEL_ID_Z,
55
  trust_remote_code=True,
56
  torch_dtype=torch.float16
57
  ).to(device).eval()
58
 
59
+ # Load ImageScope
60
+ MODEL_ID_T = "prithivMLmods/Imgscope-OCR-2B-0527"
61
+ processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
62
+ model_t = Qwen2VLForConditionalGeneration.from_pretrained(
63
+ MODEL_ID_T,
64
+ trust_remote_code=True,
65
+ torch_dtype=torch.float16
66
+ ).to(device).eval()
67
+
68
+
69
+
70
  def downsample_video(video_path):
71
  """
72
  Downsamples the video to evenly spaced frames.
 
98
  """
99
  Generates responses using the selected model for image input.
100
  """
101
+ if model_name == "Llama-4":
102
  processor = processor_m
103
  model = model_m
104
  elif model_name == "coreOCR-7B-050325-preview":
105
  processor = processor_x
106
  model = model_x
107
+ elif model_name == "Qwen2.5-VL-3B":
108
  processor = processor_z
109
  model = model_z
110
+ elif model_name == "Imgscope-OCR-2B":
111
+ processor = processor_t
112
+ model = model_t
113
  else:
114
  yield "Invalid model selected."
115
  return
 
154
  """
155
  Generates responses using the selected model for video input.
156
  """
157
+ if model_name == "Llama-4":
158
  processor = processor_m
159
  model = model_m
160
  elif model_name == "coreOCR-7B-050325-preview":
161
  processor = processor_x
162
  model = model_x
163
+ elif model_name == "Qwen2.5-VL-3B":
164
  processor = processor_z
165
  model = model_z
166
+ elif model_name == "Imgscope-OCR-2B":
167
+ processor = processor_t
168
+ model = model_t
169
  else:
170
  yield "Invalid model selected."
171
  return
 
263
  with gr.Column():
264
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
265
  model_choice = gr.Radio(
266
+ choices=["Llama-4", "coreOCR-7B-050325-preview", "Imgscope-OCR-2B", "Qwen2.5-VL-3B"],
267
  label="Select Model",
268
+ value="Llama-4"
269
  )
270
 
271
  image_submit.click(
 
280
  )
281
 
282
  if __name__ == "__main__":
283
+ demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)