DeepDiveDev committed
Commit 2653a83 · verified · 1 Parent(s): d1bb7e2

Update app.py

Files changed (1)
app.py +6 -6
app.py CHANGED
@@ -15,7 +15,7 @@ model2 = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwri
 # Function to extract text using both models
 def extract_text(image):
     try:
-        # Ensure the input is a PIL image
+        # Convert NumPy array to PIL Image if needed
         if isinstance(image, np.ndarray):
             if len(image.shape) == 2:  # Grayscale (H, W), convert to RGB
                 image = np.stack([image] * 3, axis=-1)
@@ -23,17 +23,17 @@ def extract_text(image):
         else:
             image = Image.open(image).convert("RGB")  # Ensure RGB mode
 
-        # Resize for better accuracy
-        image = image.resize((640, 640))
+        # Maintain aspect ratio while resizing
+        image.thumbnail((640, 640))
 
         # Process with the primary model
-        pixel_values = processor1(images=image, return_tensors="pt").pixel_values
+        pixel_values = processor1(images=image, return_tensors="pt").pixel_values.to(torch.float32)
         generated_ids = model1.generate(pixel_values)
         extracted_text = processor1.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         # If output seems incorrect, use the fallback model
         if len(extracted_text.strip()) < 2:
-            inputs = processor2(images=image, return_tensors="pt").pixel_values
+            inputs = processor2(images=image, return_tensors="pt").pixel_values.to(torch.float32)
             generated_ids = model2.generate(inputs)
             extracted_text = processor2.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
@@ -51,4 +51,4 @@ iface = gr.Interface(
     description="Upload a handwritten document and get the extracted text.",
 )
 
-iface.launch()
+iface.launch()
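
A note on the resize change: PIL's Image.resize((640, 640)) forces the image to exactly 640x640 and distorts the aspect ratio of the handwriting, while Image.thumbnail((640, 640)) shrinks the image in place so it fits within a 640x640 box, preserving proportions and never upscaling. A minimal standalone sketch of the difference (the sample dimensions are illustrative, not from app.py):

from PIL import Image

img = Image.new("RGB", (1280, 720))   # illustrative 16:9 input, not from app.py

stretched = img.resize((640, 640))    # old behavior: exact 640x640, strokes get distorted
print(stretched.size)                 # (640, 640)

img.thumbnail((640, 640))             # new behavior: in-place, aspect ratio preserved
print(img.size)                       # (640, 360) -- fits inside the 640x640 box

Because thumbnail mutates the image and returns None, the diff correctly drops the image = ... assignment that resize required.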
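
The hunks patch the standard TrOCR pattern from the transformers library; the hunk header shows model2 loading what appears to be a microsoft/trocr-base-handwritten checkpoint (the name is truncated in the header). A minimal sketch of the primary-model path as it reads after this commit, assuming both the processor and model come from that checkpoint:

import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

image = Image.open("sample.png").convert("RGB")   # hypothetical input path
image.thumbnail((640, 640))                       # the commit's aspect-preserving resize

# The commit adds .to(torch.float32); the processor already returns float32
# tensors by default, so this reads as a defensive cast rather than a fix.
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch.float32)
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

One thing to verify: the new lines reference torch.float32, so app.py needs import torch at the top; the diff does not show whether it is already there.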
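
The final hunk's removed and added iface.launch() lines are textually identical, which usually points to a whitespace or trailing-newline change. For context, the surrounding Gradio setup likely resembles the sketch below; everything except the description string and the launch call is an assumption, since the diff shows only the tail of the gr.Interface(...) call:

import gradio as gr

iface = gr.Interface(
    fn=extract_text,    # assumption: wires up the function patched above
    inputs=gr.Image(),  # assumption: the actual input component is not visible in the diff
    outputs="text",     # assumption: output component not shown
    description="Upload a handwritten document and get the extracted text.",
)

iface.launch()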