prithivMLmods committed (verified)
Commit: aad98bd · Parent: cb3f55e

Update app.py

Files changed (1): app.py (+33 -9)

app.py CHANGED
@@ -16,7 +16,6 @@ import cv2
 
 from transformers import (
     Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
     AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
@@ -28,12 +27,13 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
+# Determine device
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Load VIREX-062225-exp
 MODEL_ID_M = "prithivMLmods/VIREX-062225-exp"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
@@ -42,13 +42,13 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Load DREX-062225-exp
 MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load Gemma3n-E4B-it
+# Load Gemma3n-E4B-it (Placeholder: Adjust model class if incorrect)
 MODEL_ID_G = "google/gemma-3n-E4B-it"
 processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
 model_g = AutoModelForImageTextToText.from_pretrained(
@@ -57,7 +57,7 @@ model_g = AutoModelForImageTextToText.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load Gemma3n-E2B-it
+# Load Gemma3n-E2B-it (Placeholder: Adjust model class if incorrect)
 MODEL_ID_N = "google/gemma-3n-E2B-it"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = AutoModelForImageTextToText.from_pretrained(
@@ -128,7 +128,7 @@ def generate_image(model_name: str, text: str, image_path: str,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
-            truncation=False,
+            truncation=True,  # Enable truncation to prevent overflow
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
     else:
@@ -138,10 +138,16 @@ def generate_image(model_name: str, text: str, image_path: str,
             images=[image_path],
             return_tensors="pt",
             padding=True,
-            truncation=False,
+            truncation=True,  # Enable truncation to prevent overflow
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
 
+    # Check input token length
+    input_length = inputs["input_ids"].shape[1]
+    if input_length > MAX_INPUT_TOKEN_LENGTH:
+        yield f"Input too long. Max {MAX_INPUT_TOKEN_LENGTH} tokens. Got {input_length} tokens.", ""
+        return
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -153,6 +159,12 @@ def generate_image(model_name: str, text: str, image_path: str,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
+
+    # Ensure all tensors are on the correct device
+    for key in generation_kwargs:
+        if isinstance(generation_kwargs[key], torch.Tensor):
+            generation_kwargs[key] = generation_kwargs[key].to(device)
+
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
@@ -205,7 +217,7 @@ def generate_video(model_name: str, text: str, video_path: str,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
-            truncation=False,
+            truncation=True,  # Enable truncation to prevent overflow
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
     else:
@@ -216,10 +228,16 @@ def generate_video(model_name: str, text: str, video_path: str,
             images=images,
             return_tensors="pt",
             padding=True,
-            truncation=False,
+            truncation=True,  # Enable truncation to prevent overflow
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
 
+    # Check input token length
+    input_length = inputs["input_ids"].shape[1]
+    if input_length > MAX_INPUT_TOKEN_LENGTH:
+        yield f"Input too long. Max {MAX_INPUT_TOKEN_LENGTH} tokens. Got {input_length} tokens.", ""
+        return
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -231,6 +249,12 @@ def generate_video(model_name: str, text: str, video_path: str,
         "top_k": top_k,
        "repetition_penalty": repetition_penalty,
     }
+
+    # Ensure all tensors are on the correct device
+    for key in generation_kwargs:
+        if isinstance(generation_kwargs[key], torch.Tensor):
+            generation_kwargs[key] = generation_kwargs[key].to(device)
+
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
 