Spaces:

MLBench
/

OCR_Term_Exctraction

Sleeping

App Files Files Community

ArslanRobo committed on Dec 6, 2024

Commit

fdb32ca

verified ·

1 Parent(s): 4f5259b

Upload 2 files

Browse files

Files changed (2) hide show

app.py +191 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,191 @@

+# import os
+# import cv2
+# import re
+# import numpy as np
+# from PIL import Image, ImageDraw, ImageFont
+# from paddleocr import PaddleOCR
+# from pdf2image import convert_from_path
+# import gradio as gr
+# # Specify the path to the Poppler bin directory
+# poppler_path = r"C:\\poppler\\poppler-24.08.0\\Library\\bin"
+# # Function to check proximity of bounding boxes
+# def are_boxes_close(box1, box2, y_threshold=50):
+#     y1_center = (box1[0][1] + box1[2][1]) / 2
+#     y2_center = (box2[0][1] + box2[2][1]) / 2
+#     return abs(y1_center - y2_center) <= y_threshold
+# # Function to extract terms with specific rules
+# def extract_specific_terms(ocr_results):
+#     extracted_terms = []
+#     for line in ocr_results[0]:
+#         detected_text = line[1][0]  # Extracted text
+#         box = line[0]  # Bounding box of the detected text
+#         if re.match(r"Bill of Lading:\s*\d+", detected_text):
+#             extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})
+#         elif re.match(r"Page:\s*\w+", detected_text):
+#             extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})
+#         elif detected_text in ["Shipper", "Receiver", "Carrier"]:
+#             extracted_terms.append({'detected_text': detected_text + " Signature", 'bounding_box': box})
+#         elif detected_text == "Signature":
+#             extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})
+#     return extracted_terms
+# # Function to annotate image with detected terms
+# def annotate_image_with_terms(image, terms):
+#     pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+#     draw = ImageDraw.Draw(pil_image)
+#     font_size = 40
+#     try:
+#         font = ImageFont.truetype("arial.ttf", font_size)
+#     except IOError:
+#         font = ImageFont.load_default()
+#     for term in terms:
+#         box = term['bounding_box']
+#         detected_text = term['detected_text']
+#         points = [(int(x[0]), int(x[1])) for x in box]
+#         draw.polygon(points, outline="blue", width=2)
+#         position = (points[0][0], points[0][1] - font_size - 5)
+#         draw.text(position, detected_text, fill="red", font=font)
+#     return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
+# # Main processing function
+# def process_file(file):
+#     ocr = PaddleOCR(lang='en')
+#     extracted_terms = []
+#     if file.name.endswith(".pdf"):
+#         images = convert_from_path(file.name, poppler_path=poppler_path)
+#         processed_images = []
+#         for image in images:
+#             image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+#             ocr_results = ocr.ocr(image_np, cls=True)
+#             extracted_terms = extract_specific_terms(ocr_results)
+#             annotated_image = annotate_image_with_terms(image_np, extracted_terms)
+#             processed_images.append(annotated_image)
+#         return [Image.fromarray(img) for img in processed_images]
+#     else:
+#         image = cv2.imread(file.name)
+#         ocr_results = ocr.ocr(image, cls=True)
+#         extracted_terms = extract_specific_terms(ocr_results)
+#         annotated_image = annotate_image_with_terms(image, extracted_terms)
+#         return Image.fromarray(annotated_image)
+# # Gradio Interface
+# def gradio_interface(file):
+#     result = process_file(file)
+#     if isinstance(result, list):
+#         return result[0]  # Display only the first page
+#     return result
+# iface = gr.Interface(
+#     fn=gradio_interface,
+#     inputs=gr.File(label="Upload an Image or PDF", file_types=[".pdf", ".png", ".jpg", ".jpeg"]),
+#     outputs="image",
+#     live=True,
+#     title="OCR Term Extraction",
+#     description="Upload an image or PDF containing text to detect and annotate terms such as 'Bill of Lading', 'Page', and signatures.",
+#     allow_flagging="never"
+# )
+# iface.launch()
+import os
+import cv2
+import re
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+from paddleocr import PaddleOCR
+import gradio as gr
+# Function to check proximity of bounding boxes
def are_boxes_close(box1, box2, y_threshold=50):
    """Tell whether two boxes lie at roughly the same vertical position.

    Each box is indexed as ``box[i][1]`` for the y coordinate of corner
    ``i``; only corners 0 and 2 are read, and their y values are averaged
    to get the box's vertical centre.

    Args:
        box1: First box (sequence of points, each indexable as ``[x, y]``).
        box2: Second box, same layout as ``box1``.
        y_threshold: Largest allowed distance between the two centres.

    Returns:
        True when the centres differ by at most ``y_threshold``.
    """
    def _vertical_centre(box):
        return (box[0][1] + box[2][1]) / 2

    gap = abs(_vertical_centre(box1) - _vertical_centre(box2))
    return gap <= y_threshold
+# Function to extract terms with specific rules
def extract_specific_terms(ocr_results):
    """Pick out the bill-of-lading terms from one page of OCR output.

    Only the first entry of ``ocr_results`` is inspected. Each line in it is
    read as ``line[0]`` (bounding box) and ``line[1][0]`` (recognised text).

    A line is kept when its text:
      * starts with ``Bill of Lading: <digits>`` or ``Page: <word>``
        (kept verbatim),
      * is exactly ``Shipper``, ``Receiver`` or ``Carrier``
        (kept with `` Signature`` appended), or
      * is exactly ``Signature`` (kept verbatim).

    Args:
        ocr_results: OCR output whose first element is the list of lines for
            the page, or ``None``/empty when nothing was recognised.

    Returns:
        A list of ``{'detected_text': str, 'bounding_box': box}`` dicts in
        the order the lines appear; empty when there is nothing to scan.
    """
    # The OCR engine can hand back ``[None]`` (or nothing at all) for a page
    # with no detected text; iterating that would raise instead of simply
    # yielding no terms.
    if not ocr_results or not ocr_results[0]:
        return []

    extracted_terms = []
    for line in ocr_results[0]:
        box = line[0]
        detected_text = line[1][0]

        if re.match(r"Bill of Lading:\s*\d+", detected_text):
            label = detected_text
        elif re.match(r"Page:\s*\w+", detected_text):
            label = detected_text
        elif detected_text in ("Shipper", "Receiver", "Carrier"):
            # Party names mark signature fields on the form.
            label = detected_text + " Signature"
        elif detected_text == "Signature":
            label = detected_text
        else:
            continue

        extracted_terms.append({'detected_text': label, 'bounding_box': box})
    return extracted_terms
+# Function to annotate image with detected terms
def annotate_image_with_terms(image, terms):
    """Outline every term on the image and write its label above the box.

    Args:
        image: Image array in OpenCV's BGR channel order (it is converted
            with ``cv2.COLOR_BGR2RGB`` before drawing).
        terms: Iterable of dicts holding a ``'bounding_box'`` (sequence of
            points indexable as ``[x, y]``) and a ``'detected_text'`` label.

    Returns:
        A new BGR array with a blue polygon around each box and the label
        in red next to the box's first corner.
    """
    font_size = 20
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        # Arial is not installed on every host; use Pillow's built-in font.
        font = ImageFont.load_default()

    canvas = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(canvas)

    for term in terms:
        points = [(int(pt[0]), int(pt[1])) for pt in term['bounding_box']]
        draw.polygon(points, outline="blue", width=2)

        # Put the label just above the first corner, but never above the top
        # edge: a negative y would draw the text off-canvas and hide it.
        label_x = points[0][0]
        label_y = max(0, points[0][1] - font_size - 5)
        draw.text((label_x, label_y), term['detected_text'], fill="red", font=font)

    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)
+# Main processing function
def process_file(file):
    """Run OCR on an uploaded image and return it with the terms annotated.

    Args:
        file: Upload object exposing the image path as ``file.name``, or
            ``None`` when no file is currently selected.

    Returns:
        A PIL image in RGB order with the matched terms outlined and
        labelled, or ``None`` when ``file`` is ``None``.

    Raises:
        ValueError: If OpenCV cannot decode the file as an image.
    """
    # With a live interface the handler also fires when the upload is
    # cleared; there is nothing to process in that case.
    if file is None:
        return None

    # Handle image files (PNG, JPG, JPEG)
    image = cv2.imread(file.name)
    if image is None:
        # cv2.imread signals an unreadable/unsupported file by returning None.
        raise ValueError(f"Could not read image file: {file.name}")

    ocr = PaddleOCR(lang='en')
    ocr_results = ocr.ocr(image, cls=True)
    extracted_terms = extract_specific_terms(ocr_results)
    annotated_image = annotate_image_with_terms(image, extracted_terms)

    # The annotated array is BGR, while PIL interprets arrays as RGB; convert
    # first so red and blue are not swapped in the displayed result.
    return Image.fromarray(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))
+# Gradio Interface
def gradio_interface(file):
    """Gradio callback: hand the uploaded file to ``process_file``.

    Args:
        file: The value Gradio passes for the file input.

    Returns:
        Whatever ``process_file`` returns for that file.
    """
    return process_file(file)
# Single file-upload input restricted to common raster image formats.
_upload_input = gr.File(
    label="Upload an Image",
    file_types=[".png", ".jpg", ".jpeg"],
)

# live=True re-runs the callback whenever the input changes, so no submit
# click is needed; flagging is turned off.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=_upload_input,
    outputs="image",
    live=True,
    title="OCR Term Extraction",
    description="Upload an image containing text to detect and annotate terms such as 'Bill of Lading', 'Page', and signatures.",
    allow_flagging="never",
)

iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

opencv-python
numpy
Pillow
paddlepaddle
# app.py imports PaddleOCR from the paddleocr package; paddlepaddle alone
# does not provide it.
paddleocr
# pdf2image
gradio
# poppler-utils