prithivMLmods committed on
Commit
e88e40e
·
verified ·
1 Parent(s): 30432e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -31,7 +31,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
 
33
  # Load text-only model and tokenizer
34
- model_id = "prithivMLmods/Pocket-Llama2-3.2-3B-Instruct"
35
  tokenizer = AutoTokenizer.from_pretrained(model_id)
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_id,
@@ -40,8 +40,8 @@ model = AutoModelForCausalLM.from_pretrained(
40
  )
41
  model.eval()
42
 
43
- # Load multimodal processor and model (Callisto OCR3)
44
- MODEL_ID = "prithivMLmods/Callisto-OCR3-2B-Instruct"
45
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
46
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
47
  MODEL_ID,
@@ -130,7 +130,7 @@ def generate(input_dict: dict, chat_history: list[dict],
130
  Generates chatbot responses with support for multimodal input, video processing,
131
  and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
132
  Special command:
133
- - "@video-infer": triggers video processing using Callisto OCR3.
134
  """
135
  text = input_dict["text"]
136
  files = input_dict.get("files", [])
@@ -191,7 +191,7 @@ def generate(input_dict: dict, chat_history: list[dict],
191
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
192
  thread.start()
193
  buffer = ""
194
- yield progress_bar_html("Processing video with Callisto OCR3")
195
  for new_text in streamer:
196
  buffer += new_text
197
  buffer = buffer.replace("<|im_end|>", "")
@@ -229,7 +229,7 @@ def generate(input_dict: dict, chat_history: list[dict],
229
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
230
  thread.start()
231
  buffer = ""
232
- yield progress_bar_html("Processing image with Callisto OCR3")
233
  for new_text in streamer:
234
  buffer += new_text
235
  buffer = buffer.replace("<|im_end|>", "")
@@ -259,7 +259,7 @@ def generate(input_dict: dict, chat_history: list[dict],
259
  t = Thread(target=model.generate, kwargs=generation_kwargs)
260
  t.start()
261
  outputs = []
262
- yield progress_bar_html("Processing With Pocket Llama 3B")
263
  for new_text in streamer:
264
  outputs.append(new_text)
265
  yield "".join(outputs)
@@ -288,7 +288,7 @@ demo = gr.ChatInterface(
288
  [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}]
289
  ],
290
  cache_examples=False,
291
- description="# **Callisto OCR**",
292
  type="messages",
293
  fill_height=True,
294
  textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
 
33
  # Load text-only model and tokenizer
34
+ model_id = "prithivMLmods/Galactic-Qwen-14B-Exp2"
35
  tokenizer = AutoTokenizer.from_pretrained(model_id)
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_id,
 
40
  )
41
  model.eval()
42
 
43
+ # Load multimodal processor and model
44
+ MODEL_ID = "prithivMLmods/Imgscope-OCR-2B-0527"
45
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
46
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
47
  MODEL_ID,
 
130
  Generates chatbot responses with support for multimodal input, video processing,
131
  and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
132
  Special command:
133
+ - "@video-infer": triggers video processing using Imgscope-OCR
134
  """
135
  text = input_dict["text"]
136
  files = input_dict.get("files", [])
 
191
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
192
  thread.start()
193
  buffer = ""
194
+ yield progress_bar_html("Processing video with Imgscope-OCR")
195
  for new_text in streamer:
196
  buffer += new_text
197
  buffer = buffer.replace("<|im_end|>", "")
 
229
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
230
  thread.start()
231
  buffer = ""
232
+ yield progress_bar_html("Processing image with Imgscope-OCR")
233
  for new_text in streamer:
234
  buffer += new_text
235
  buffer = buffer.replace("<|im_end|>", "")
 
259
  t = Thread(target=model.generate, kwargs=generation_kwargs)
260
  t.start()
261
  outputs = []
262
+ yield progress_bar_html("Processing With Galactic Qwen")
263
  for new_text in streamer:
264
  outputs.append(new_text)
265
  yield "".join(outputs)
 
288
  [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}]
289
  ],
290
  cache_examples=False,
291
+ description="# **Imgscope-OCR**",
292
  type="messages",
293
  fill_height=True,
294
  textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),