Devesh Pant committed on
Commit 1b870f4
1 Parent(s): 589dd63
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
- title: Newspapers OCR Demo
- emoji: 👀
- colorFrom: purple
+ title: Newspaper Demo
+ emoji: 🐨
+ colorFrom: yellow
  colorTo: green
  sdk: streamlit
  sdk_version: 1.29.0
app.py ADDED
@@ -0,0 +1,102 @@
+ import cv2
+ import numpy as np
+ import streamlit as st
+ from run_yolo import get_layout_results
+ from order_text_blocks import get_ordered_data
+ from run_ocr import OCR
+ from main import driver
+ import json
+ import pandas as pd
+
+ colors = {
+     'Articles': [0, 0, 0],           # Black
+     'Advertisement': [0, 255, 0],    # Green
+     'Headlines': [0, 0, 255],        # Blue
+     'Sub-headlines': [255, 255, 0],  # Yellow
+     'Graphics': [255, 0, 255],       # Magenta
+     'Images': [128, 0, 128],         # Purple
+     'Tables': [0, 255, 255],         # Cyan
+     'Header': [0, 0, 0],             # Black
+     'Text Block': [255, 0, 0]
+ }
+
+ try:
+     st.set_page_config(layout="wide", page_title="Newspaper Layout Detection and OCR Demo")
+     st.markdown("<h1 style='text-align: center; color: #333;'>Newspaper Layout Detection and OCR Demo</h1>", unsafe_allow_html=True)
+
+     # Streamlit UI for user input
+     uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+     language_name = st.text_input("Enter the language name (hin, en, mal, tel, tam, kan)")
+     submit_button = st.button("Submit")
+
+     # Check if the user clicked the submit button
+     if submit_button:
+         # Check if image and language are provided
+         if uploaded_image is not None and language_name:
+             # Convert the Streamlit upload to an OpenCV image
+             image_bytes = uploaded_image.read()
+             nparr = np.frombuffer(image_bytes, np.uint8)
+             img_ori = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+             img = img_ori.copy()
+             st.markdown("<p style='text-align: center; color: red'>Image Uploaded Successfully!</p>", unsafe_allow_html=True)
+             # Run layout detection, reading-order sorting and OCR
+             output_dict, article_ocr_dict = driver(img_ori, language_name, st)
+
+             # Create a list to store dictionaries for OCR results
+             image_data = []
+
+             # Visualizing results
+             itr = 1
+             for art_key in article_ocr_dict:
+                 art_coords = art_key.split('_')
+                 art_x1, art_y1, art_x2, art_y2 = int(art_coords[0]), int(art_coords[1]), int(art_coords[2]), int(art_coords[3])
+
+                 # Mark the article bounding box in black
+                 img_ori = cv2.rectangle(img, (art_x1, art_y1), (art_x2, art_y2), (0, 0, 0), 4)
+                 # Put the article number on the image in a large font
+                 img_ori = cv2.putText(img_ori, str(itr), (art_x1, art_y1), cv2.FONT_HERSHEY_SIMPLEX, 3, (0, 0, 255), 3, cv2.LINE_AA)
+                 ocr_dict = article_ocr_dict[art_key]
+
+                 # Initialize variables to store OCR text for each type
+                 headlines_text = ''
+                 subheadlines_text = ''
+                 textblocks_text = ''
+
+                 for obj_key in ocr_dict:
+                     # obj_key is one of Headlines, Sub-headlines, Text Block
+                     obj_list = ocr_dict[obj_key]
+                     for obj_dict in obj_list:
+                         for key in obj_dict:
+                             coords = key.split('_')
+                             x1, y1, x2, y2 = int(coords[0]), int(coords[1]), int(coords[2]), int(coords[3])
+                             # Mark the bounding box with the color corresponding to the object type
+                             img = cv2.rectangle(img, (x1, y1), (x2, y2), colors[obj_key], 2)
+
+                             # Add the OCR text to the corresponding variable
+                             if obj_key == 'Headlines':
+                                 headlines_text += obj_dict[key] + '\n'
+                             elif obj_key == 'Sub-headlines':
+                                 subheadlines_text += obj_dict[key] + '\n'
+                             elif obj_key == 'Text Block':
+                                 textblocks_text += obj_dict[key] + '\n'
+
+                 # Add a dictionary to the list for the current article
+                 image_data.append({'Article': itr, 'Headlines': headlines_text, 'Subheadlines': subheadlines_text, 'Textblocks': textblocks_text})
+
+                 itr += 1
+
+             # Create a DataFrame from the list of dictionaries
+             image_df = pd.DataFrame(image_data)
+
+             # Use Streamlit columns to display the image and results side by side
+             col1, col2 = st.columns(2)
+
+             # Display the image with marked bounding boxes in the left column
+             col1.image(img_ori, use_column_width=True, channels="BGR", caption="Image with Marked Bounding Boxes")
+
+             # Display the DataFrame with OCR results for the whole image in the right column
+             col2.table(image_df.set_index('Article').style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}]))
+
+
+ except Exception as e:
+     st.exception(e)
best.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64d3fc9d769f8b215e17a4cdf067e47063c1a1b98668e97c6440e5e020c1988f
+ size 146238371
cropped/1.png ADDED
cropped/10.png ADDED
cropped/11.png ADDED
cropped/12.png ADDED
cropped/13.png ADDED
cropped/14.png ADDED
cropped/15.png ADDED
cropped/16.png ADDED
cropped/17.png ADDED
cropped/18.png ADDED
cropped/19.png ADDED
cropped/2.png ADDED
cropped/20.png ADDED
cropped/21.png ADDED
cropped/22.png ADDED
cropped/23.png ADDED
cropped/24.png ADDED
cropped/25.png ADDED
cropped/26.png ADDED
cropped/27.png ADDED
cropped/28.png ADDED
cropped/29.png ADDED
cropped/3.png ADDED
cropped/4.png ADDED
cropped/5.png ADDED
cropped/6.png ADDED
cropped/7.png ADDED
cropped/8.png ADDED
cropped/9.png ADDED
main.py ADDED
@@ -0,0 +1,52 @@
+ import cv2
+ from run_yolo import get_layout_results
+ from order_text_blocks import get_ordered_data
+ from run_ocr import OCR
+ from tqdm import tqdm
+ import time
+
+
+ def driver(img, language_name, st):
+     onnx_path = "./best.onnx"
+     img_ori = img.copy()
+     labels = get_layout_results(img_ori, onnx_path)
+     output_dict = get_ordered_data(labels, img)
+     st.markdown("<p style='text-align: center; color: red'>Layout Analysis Completed!</p>", unsafe_allow_html=True)
+     article_wise_ocr = {}
+     h, w = img.shape[:2]
+
+     with st.spinner('Performing OCR...'):
+         # Crop and OCR the headline, sub-headline and text-block boxes of every article
+         for itr, article in tqdm(enumerate(output_dict['Articles'])):
+             ocr_dict = {}
+             article_key = ""
+             for key in article:
+
+                 if article[key] == []:
+                     continue
+
+                 if key == 'Articles':
+                     x1, y1, x2, y2 = int(article[key][0][0]), int(article[key][0][1]), int(article[key][0][2]), int(article[key][0][3])
+                     article_key = '_'.join([str(x1), str(y1), str(x2), str(y2)])
+
+                 if key == 'Headlines' or key == 'Sub-headlines' or key == 'Text Block':
+                     for coord in article[key]:
+                         x1, y1, x2, y2 = int(coord[0]), int(coord[1]), int(coord[2]), int(coord[3])
+                         # Skip boxes whose coordinates fall outside the image dimensions
+                         if x1 < 0 or x2 < 0 or y1 < 0 or y2 < 0 or x1 > w or x2 > w or y1 > h or y2 > h:
+                             continue
+
+                         crop = img[int(coord[1]):int(coord[3]), int(coord[0]):int(coord[2])]
+                         output_text = OCR(crop, lang=language_name)
+
+                         box_key = "_".join([str(int(coord[0])), str(int(coord[1])), str(int(coord[2])), str(int(coord[3]))])
+                         if key not in ocr_dict:
+                             ocr_dict[key] = [{box_key: output_text}]
+                         else:
+                             ocr_dict[key].append({box_key: output_text})
+
+             article_wise_ocr[article_key] = ocr_dict
+
+     st.markdown("<p style='text-align: center; color: red'>OCR Completed!</p>", unsafe_allow_html=True)
+     return output_dict, article_wise_ocr
+
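For reference, a minimal sketch of how driver() can be exercised outside Streamlit, inferred from the code above; the stub UI object and the page image path are assumptions, not part of the commit.

    import contextlib
    import cv2
    from main import driver

    class StubUI:
        """Minimal stand-in for the streamlit module expected by driver()."""
        def markdown(self, *args, **kwargs):
            pass
        def spinner(self, *args, **kwargs):
            return contextlib.nullcontext()

    img = cv2.imread("page.png")  # hypothetical newspaper page scan (BGR)
    output_dict, article_wise_ocr = driver(img, "hin", StubUI())

    # article_wise_ocr is keyed by the article box "x1_y1_x2_y2"; each value maps
    # 'Headlines' / 'Sub-headlines' / 'Text Block' to lists of {"x1_y1_x2_y2": text}.
    for art_key, ocr_dict in article_wise_ocr.items():
        x1, y1, x2, y2 = map(int, art_key.split('_'))
        print((x1, y1, x2, y2), list(ocr_dict.keys()))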
order_text_blocks.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import cv2
+ import json
+ import numpy as np
+
+ class_id_to_name = {
+     0: "Articles",
+     1: "Advertisement",
+     2: "Headlines",
+     3: "Sub-headlines",
+     4: "Graphics",
+     5: "Images",
+     6: "Tables",
+     7: "Text Block",
+     8: "Header"
+ }
+
+
+ def findIntersection(box1, box2):
+     """
+     args:
+         box1: [x1, y1, x2, y2]
+         box2: [x1, y1, x2, y2]
+
+     returns:
+         iou: float -- intersection area divided by the area of box1
+     """
+     x1, y1, w1, h1 = box1[0], box1[1], box1[2] - box1[0], box1[3] - box1[1]
+     x2, y2, w2, h2 = box2[0], box2[1], box2[2] - box2[0], box2[3] - box2[1]
+
+     xA = max(x1, x2)
+     yA = max(y1, y2)
+
+     xB = min(x1 + w1, x2 + w2)
+     yB = min(y1 + h1, y2 + h2)
+
+     # Calculate the intersection area
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+
+     # Divide by the area of box1 instead of the union
+
+     if w1 * h1 == 0:
+         return 0
+     else:
+         iou = interArea / (w1 * h1)
+
+     return iou
+
+ def get_hierarchy(labels):
+     class_list = [[] for _ in range(len(class_id_to_name))]
+     object_dict = {}
+     article_list = []
+
+     for itr, label in enumerate(labels):
+         x1, y1, x2, y2 = label[0][0], label[0][1], label[0][2], label[0][3]
+         conf = label[1]
+         class_id = label[2]
+         class_list[int(class_id)].append([x1, y1, x2, y2])
+         obj_key = int(class_id) * 1000000 + len(class_list[int(class_id)]) - 1
+         object_dict[obj_key] = [0, [x1, y1, x2, y2], int(class_id)]
+
+     # For each article, find all the objects that belong to it
+     cou = 0
+     for article in class_list[0]:
+         article_dict = {'Articles': [], 'Headlines': [], 'Sub-headlines': [], 'Graphics': [], 'Images': [], 'Text Block': [], "Advertisement": [], "Tables": [], "Header": []}
+         for class_id in range(9):
+             IoUThresh = 0.70; max_article = None
+             for obj in class_list[class_id]:
+                 IoU = findIntersection(obj, article)
+                 if IoU > IoUThresh:
+                     key = class_id_to_name[class_id]
+
+                     article_dict[key].append(obj)
+                     obj_key = class_id * 1000000 + class_list[class_id].index(obj)
+                     if obj_key in object_dict:
+                         cou += 1
+                         object_dict[obj_key][0] = 1
+
+
+         article_list.append(article_dict)
+
+     return article_list, object_dict
+
+ def textblock_ordering(textblocks, img):
+     coords = []
+     ori_coords = {}
+     for obj in textblocks:
+         x1, y1, x2, y2 = map(float, obj)
+         ori_coords[tuple([x1, y1, x2, y2])] = [x1, y1, x2, y2]
+         coords.append([x1, y1, x2, y2])
+
+     # Sort the text blocks by x1
+     coords.sort(key=lambda x: x[0])
+     # Create vertical buckets whose horizontal width is 15% of the image width
+     buckets = []
+     bucket_size = int(0.15 * img.shape[1])
+     # Put the text blocks in the buckets
+     for coord in coords:
+         if len(buckets) == 0:
+             buckets.append([coord])
+         else:
+             for bucket in buckets:
+                 if abs(bucket[0][0] - coord[0]) < bucket_size:
+                     bucket.append(coord)
+                     break
+             else:
+                 buckets.append([coord])
+
+     # Sort each bucket by y1
+     for bucket in buckets:
+         bucket.sort(key=lambda x: x[1])
+
+     # Visualize the buckets one by one, each with a different color
+     # for bucket in buckets:
+     #     color = (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))
+     #     for coord in bucket:
+     #         img = cv2.rectangle(img, (coord[0], coord[1]), (coord[2], coord[3]), color, 5)
+     #     cv2.imshow('img', img)
+
+     # Change bucket coords back to the original coords
+     for bucket in buckets:
+         for i in range(len(bucket)):
+             bucket[i] = ori_coords[tuple(bucket[i])]
+
+     # Merge all the buckets into one list
+     buckets = [item for sublist in buckets for item in sublist]
+     return buckets
+
+ def get_ordered_data(labels, img):
+     article_list, object_dict = get_hierarchy(labels)
+     for article in article_list:
+         sorted_buckets = textblock_ordering(article['Text Block'], img)
+         article['Text Block'] = sorted_buckets
+
+     # Collect the results in a dict
+     # Data structure:
+     # {'Articles': [{'Headlines': [obj1, obj2, ...], 'Sub-headlines': [...], ...}, ...], 'Extra': [...]}
+     json_dict = {}
+     json_dict['Articles'] = article_list
+     json_dict['Extra'] = []
+     # Add the objects that were not assigned to any article
+     for key in object_dict:
+         if object_dict[key][0] == 0:
+             print("Extra: ", key)
+             json_dict['Extra'].append({class_id_to_name[object_dict[key][2]]: [object_dict[key][1]]})
+
+     return json_dict
+
+ # if __name__ == '__main__':
+ #     label_path = '/Users/deveshpant/Work/WadhwaniAI/IDSP/eNewspaperPDFs/Language_wise/Results/pred/Hindi2/labels/_Jansatta-Delhi 15-11_5.txt'
+ #     img_path = '/Users/deveshpant/Work/WadhwaniAI/IDSP/eNewspaperPDFs/Language_wise/Language_wise_imgs/Hindi/_Jansatta-Delhi 15-11_5.png'
+ #     json_dict = get_ordered_data(label_path, img_path)
+
+ #     # dump the json
+ #     with open('json_dict.json', 'w') as f:
+ #         json.dump(json_dict, f)
+
+ #     # read the json
+ #     with open('json_dict.json', 'r') as f:
+ #         json_dict = json.load(f)
+
+ #     # visualize the results
+ #     img = cv2.imread(img_path)
+
+
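The column-bucket reading order in textblock_ordering is easiest to see on toy coordinates; a small illustrative sketch follows (the numbers are made up for illustration, not taken from the commit).

    import numpy as np
    from order_text_blocks import textblock_ordering

    page = np.zeros((1400, 1000, 3), dtype=np.uint8)  # dummy page; only its width (1000 px) is used
    blocks = [
        [520, 100, 900, 400],   # right column, top
        [100, 700, 480, 1000],  # left column, bottom
        [110, 100, 480, 600],   # left column, top
    ]
    # bucket_size = 0.15 * 1000 = 150 px, so the two left-column blocks share a bucket
    # and are sorted by y1, giving the order: left-top, left-bottom, right-top.
    print(textblock_ordering(blocks, page))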
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ tesseract-ocr-all
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ numpy==1.26.2
+ onnxruntime==1.15.1
+ onnxruntime_gpu==1.16.3
+ opencv_contrib_python==4.8.1.78
+ opencv_python==4.8.1.78
+ pandas==2.1.4
+ Pillow
+ pytesseract==0.3.10
+ Requests==2.31.0
+ streamlit==1.24.0
+ torch==2.0.1
+ torchvision==0.15.2
+ tqdm==4.65.0
run_ocr.py ADDED
@@ -0,0 +1,43 @@
+ import cv2
+ import numpy
+ import argparse
+ from pytesseract import *
+ from PIL import Image, ImageFont, ImageDraw
+ import numpy as np
+
+
+
+ # def preprocess_image(image):
+
+
+
+ def OCR(img, lang='hin', min_conf=0.25):
+     rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     # preprocessed_image = preprocess_image(rgb)
+     # Run Tesseract word-level OCR on the RGB crop
+     results = pytesseract.image_to_data(rgb, output_type=Output.DICT, lang=lang)
+     out_text = ""
+     for i in range(0, len(results["text"])):
+
+         # Extract the bounding box coordinates of the
+         # text region from the current result
+         x = results["left"][i]
+         y = results["top"][i]
+         w = results["width"][i]
+         h = results["height"][i]
+
+         # Also extract the OCR text itself along
+         # with the confidence of the text localization
+         text = results["text"][i]
+         conf = int(results["conf"][i])
+
+         # Filter out weak-confidence text localizations
+         if conf > min_conf:
+             # Keep the detected word: strip any
+             # surrounding whitespace and append it,
+             # separated by a space, to the running
+             # output string for this crop
+             text = "".join(text).strip()
+             out_text += text + " "
+
+     return out_text
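A minimal usage sketch of OCR(): it assumes the Tesseract language packs pulled in by packages.txt are installed, and whether cropped/1.png actually contains Hindi text is an assumption.

    import cv2
    from run_ocr import OCR

    crop = cv2.imread("cropped/1.png")           # one of the text-block crops added in this commit
    text = OCR(crop, lang="hin", min_conf=0.25)  # space-joined words above the confidence threshold
    print(text)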
run_yolo.py ADDED
@@ -0,0 +1,248 @@
+ import cv2
+ import time
+ import requests
+ import random
+ import numpy as np
+ from PIL import Image
+ from pathlib import Path
+ from collections import OrderedDict, namedtuple
+ import onnxruntime as ort
+ import torch
+ import torchvision
+ import math
+
+ def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
+     # Returns the IoU of box1 to box2. box1 is 4, box2 is nx4
+     box2 = box2.T
+
+     # Get the coordinates of bounding boxes
+     if x1y1x2y2:  # x1, y1, x2, y2 = box1
+         b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
+         b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
+     else:  # transform from xywh to xyxy
+         b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
+         b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
+         b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
+         b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
+
+     # Intersection area
+     inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
+             (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
+
+     # Union Area
+     w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+     w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+     union = w1 * h1 + w2 * h2 - inter + eps
+
+     iou = inter / union
+
+     if GIoU or DIoU or CIoU:
+         cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)  # convex (smallest enclosing box) width
+         ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)  # convex height
+         if CIoU or DIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
+             c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
+             rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
+                     (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center distance squared
+             if DIoU:
+                 return iou - rho2 / c2  # DIoU
+             elif CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
+                 v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / (h2 + eps)) - torch.atan(w1 / (h1 + eps)), 2)
+                 with torch.no_grad():
+                     alpha = v / (v - iou + (1 + eps))
+                 return iou - (rho2 / c2 + v * alpha)  # CIoU
+         else:  # GIoU https://arxiv.org/pdf/1902.09630.pdf
+             c_area = cw * ch + eps  # convex area
+             return iou - (c_area - union) / c_area  # GIoU
+     else:
+         return iou  # IoU
+
+
+ def xywh2xyxy(x):
+     # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+     y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+     y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
+     y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
+     y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
+     y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
+     return y
+
+
+ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
+                         labels=()):
+     """Runs Non-Maximum Suppression (NMS) on inference results
+
+     Returns:
+          list of detections, one (n,6) tensor per image [xyxy, conf, cls]
+     """
+
+     nc = prediction.shape[2] - 5  # number of classes
+     xc = prediction[..., 4] > conf_thres  # candidates
+
+     # Settings
+     min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
+     max_det = 300  # maximum number of detections per image
+     max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
+     time_limit = 10.0  # seconds to quit after
+     redundant = True  # require redundant detections
+     multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
+     merge = False  # use merge-NMS
+
+     t = time.time()
+     output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
+     for xi, x in enumerate(prediction):  # image index, image inference
+         # Apply constraints
+         # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
+         x = x[xc[xi]]  # confidence
+
+         # Cat apriori labels if autolabelling
+         if labels and len(labels[xi]):
+             l = labels[xi]
+             v = torch.zeros((len(l), nc + 5), device=x.device)
+             v[:, :4] = l[:, 1:5]  # box
+             v[:, 4] = 1.0  # conf
+             v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
+             x = torch.cat((x, v), 0)
+
+         # If none remain process next image
+         if not x.shape[0]:
+             continue
+
+         # Compute conf
+         if nc == 1:
+             x[:, 5:] = x[:, 4:5]  # for models with one class, cls_loss is 0 and cls_conf is always 0.5,
+                                   # so there is no need to multiply
+         else:
+             x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
+
+         # Box (center x, center y, width, height) to (x1, y1, x2, y2)
+         box = xywh2xyxy(x[:, :4])
+
+         # Detections matrix nx6 (xyxy, conf, cls)
+         if multi_label:
+             i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
+             x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
+         else:  # best class only
+             conf, j = x[:, 5:].max(1, keepdim=True)
+             x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
+
+         # Filter by class
+         if classes is not None:
+             x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+
+         # Apply finite constraint
+         # if not torch.isfinite(x).all():
+         #     x = x[torch.isfinite(x).all(1)]
+
+         # Check shape
+         n = x.shape[0]  # number of boxes
+         if not n:  # no boxes
+             continue
+         elif n > max_nms:  # excess boxes
+             x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
+
+         # Batched NMS
+         c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
+         boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
+         i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
+         if i.shape[0] > max_det:  # limit detections
+             i = i[:max_det]
+         if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
+             # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+             iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
+             weights = iou * scores[None]  # box weights
+             x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
+             if redundant:
+                 i = i[iou.sum(1) > 1]  # require redundancy
+
+         output[xi] = x[i]
+         if (time.time() - t) > time_limit:
+             print(f'WARNING: NMS time limit {time_limit}s exceeded')
+             break  # time limit exceeded
+
+     return output
+
+
+ def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
+     # Resize and pad image while meeting stride-multiple constraints
+     shape = im.shape[:2]  # current shape [height, width]
+     if isinstance(new_shape, int):
+         new_shape = (new_shape, new_shape)
+
+     # Scale ratio (new / old)
+     r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+     if not scaleup:  # only scale down, do not scale up (for better val mAP)
+         r = min(r, 1.0)
+
+     # Compute padding
+     new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+     dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+
+     if auto:  # minimum rectangle
+         dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
+
+     dw /= 2  # divide padding into 2 sides
+     dh /= 2
+
+     if shape[::-1] != new_unpad:  # resize
+         im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+     top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+     left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+     im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
+     return im, r, (dw, dh)
+
+
+ def get_layout_results(img, onnx_path):
+     providers = ['CPUExecutionProvider']
+     session = ort.InferenceSession(onnx_path, providers=providers)
+     names = ['Articles', 'Advertisement', 'Headlines', 'Sub-headlines', 'Graphics', 'Images', 'Tables', 'Text Block', 'Header']
+     # colors = {name: [random.randint(0, 255) for _ in range(3)] for i, name in enumerate(names)}
+     # Instead of random colors, use specific, easily distinguishable colors for each class
+     colors = {
+         'Articles': [255, 0, 0],         # Red
+         'Advertisement': [0, 255, 0],    # Green
+         'Headlines': [0, 0, 255],        # Blue
+         'Sub-headlines': [255, 255, 0],  # Yellow
+         'Graphics': [255, 0, 255],       # Magenta
+         'Images': [128, 0, 128],         # Purple
+         'Tables': [0, 255, 255],         # Cyan
+         'Text Block': [0, 128, 128],     # Teal
+         'Header': [0, 0, 0]              # Black
+     }
+
+     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     image = img.copy()
+     image, ratio, dwdh = letterbox(image, auto=False)
+     image = image.transpose((2, 0, 1))
+     image = np.expand_dims(image, 0)
+     image = np.ascontiguousarray(image)
+     im = image.astype(np.float32)
+     im /= 255.0
+     outname = [i.name for i in session.get_outputs()]
+     inname = [i.name for i in session.get_inputs()]
+     inp = {inname[0]: im}
+
+     # ONNX inference
+     outputs = session.run(outname, inp)[0]
+     # Convert to a torch tensor
+     outputs = torch.from_numpy(outputs)
+     det = non_max_suppression(outputs, 0.25, 0.45, classes=None, agnostic=False)[0]  # conf_thres=0.25, iou_thres=0.45
+     results = []
+     # Postprocess the output: undo the letterbox padding and scaling
+     for i, (x0, y0, x1, y1, score, cls_id) in enumerate(det):
+         box = np.array([x0, y0, x1, y1])
+         box -= np.array(dwdh * 2)
+         box /= ratio
+         box = box.round().astype(np.int32).tolist()
+         cls_id = int(cls_id)
+         score = round(float(score), 3)
+         name = names[cls_id]
+         color = colors[name]
+         results.append([box, score, cls_id, color])
+
+     return results
+
+ if __name__ == '__main__':
+     onnx_path = "/home/ubuntu/devesh/yolov7/runs/train/yolov7-custom9/weights/best.onnx"
+     img_ori = cv2.imread('/home/ubuntu/devesh/yolov7/Language_wise_imgs/Hindi/_Dainik_Navajyoti_-_04-11-2023_3.png')
+     lines = get_layout_results(img_ori, onnx_path)
+     print(lines[0])
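A minimal sketch of calling the detector with the weights shipped in this commit; the input image path is hypothetical, and each detection is a [box, score, class id, color] list as built above.

    import cv2
    from run_yolo import get_layout_results

    img = cv2.imread("page.png")                         # hypothetical newspaper page (BGR)
    detections = get_layout_results(img, "./best.onnx")  # ONNX weights added in this commit
    for (x1, y1, x2, y2), score, cls_id, color in detections:
        print(cls_id, score, (x1, y1, x2, y2))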
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.8.7