prithivMLmods committed on
Commit
e88e40e
·
verified ·
1 Parent(s): 30432e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -31,7 +31,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
 
33
  # Load text-only model and tokenizer
34
- model_id = "prithivMLmods/Pocket-Llama2-3.2-3B-Instruct"
35
  tokenizer = AutoTokenizer.from_pretrained(model_id)
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_id,
@@ -40,8 +40,8 @@ model = AutoModelForCausalLM.from_pretrained(
40
  )
41
  model.eval()
42
 
43
- # Load multimodal processor and model (Callisto OCR3)
44
- MODEL_ID = "prithivMLmods/Callisto-OCR3-2B-Instruct"
45
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
46
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
47
  MODEL_ID,
@@ -130,7 +130,7 @@ def generate(input_dict: dict, chat_history: list[dict],
130
  Generates chatbot responses with support for multimodal input, video processing,
131
  and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
132
  Special command:
133
- - "@video-infer": triggers video processing using Callisto OCR3.
134
  """
135
  text = input_dict["text"]
136
  files = input_dict.get("files", [])
@@ -191,7 +191,7 @@ def generate(input_dict: dict, chat_history: list[dict],
191
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
192
  thread.start()
193
  buffer = ""
194
- yield progress_bar_html("Processing video with Callisto OCR3")
195
  for new_text in streamer:
196
  buffer += new_text
197
  buffer = buffer.replace("<|im_end|>", "")
@@ -229,7 +229,7 @@ def generate(input_dict: dict, chat_history: list[dict],
229
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
230
  thread.start()
231
  buffer = ""
232
- yield progress_bar_html("Processing image with Callisto OCR3")
233
  for new_text in streamer:
234
  buffer += new_text
235
  buffer = buffer.replace("<|im_end|>", "")
@@ -259,7 +259,7 @@ def generate(input_dict: dict, chat_history: list[dict],
259
  t = Thread(target=model.generate, kwargs=generation_kwargs)
260
  t.start()
261
  outputs = []
262
- yield progress_bar_html("Processing With Pocket Llama 3B")
263
  for new_text in streamer:
264
  outputs.append(new_text)
265
  yield "".join(outputs)
@@ -288,7 +288,7 @@ demo = gr.ChatInterface(
288
  [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}]
289
  ],
290
  cache_examples=False,
291
- description="# **Callisto OCR**",
292
  type="messages",
293
  fill_height=True,
294
  textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
 
33
  # Load text-only model and tokenizer
34
+ model_id = "prithivMLmods/Galactic-Qwen-14B-Exp2"
35
  tokenizer = AutoTokenizer.from_pretrained(model_id)
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_id,
 
40
  )
41
  model.eval()
42
 
43
+ # Load multimodal processor and model
44
+ MODEL_ID = "prithivMLmods/Imgscope-OCR-2B-0527"
45
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
46
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
47
  MODEL_ID,
 
130
  Generates chatbot responses with support for multimodal input, video processing,
131
  and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
132
  Special command:
133
+ - "@video-infer": triggers video processing using Imgscope-OCR
134
  """
135
  text = input_dict["text"]
136
  files = input_dict.get("files", [])
 
191
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
192
  thread.start()
193
  buffer = ""
194
+ yield progress_bar_html("Processing video with Imgscope-OCR")
195
  for new_text in streamer:
196
  buffer += new_text
197
  buffer = buffer.replace("<|im_end|>", "")
 
229
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
230
  thread.start()
231
  buffer = ""
232
+ yield progress_bar_html("Processing image with Imgscope-OCR")
233
  for new_text in streamer:
234
  buffer += new_text
235
  buffer = buffer.replace("<|im_end|>", "")
 
259
  t = Thread(target=model.generate, kwargs=generation_kwargs)
260
  t.start()
261
  outputs = []
262
+ yield progress_bar_html("Processing With Galactic Qwen")
263
  for new_text in streamer:
264
  outputs.append(new_text)
265
  yield "".join(outputs)
 
288
  [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}]
289
  ],
290
  cache_examples=False,
291
+ description="# **Imgscope-OCR**",
292
  type="messages",
293
  fill_height=True,
294
  textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),