Spaces:

pantatwiai
/

Newspapers-OCR-Demo

Sleeping

File size: 1,384 Bytes

1b870f4

import cv2
import numpy
import argparse
from pytesseract import*
from PIL import Image, ImageFont, ImageDraw
import numpy as np



# def preprocess_image(image):
    


def OCR(img, lang='hin', min_conf=0.25):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        img: Image array in OpenCV's BGR channel order; it is converted to
            RGB before being handed to Tesseract.
        lang: Tesseract language code forwarded to ``image_to_data``.
        min_conf: A word is kept only when its confidence, truncated to an
            integer, is strictly greater than this value.
            NOTE(review): Tesseract reports confidence on a 0-100 scale
            (with -1 for rows that are not words), so the default of 0.25
            only rejects rows whose confidence is 0 or below. Confirm
            whether a 0-1 scale was intended by the caller.

    Returns:
        The accepted words in the order Tesseract reported them, each
        stripped of surrounding whitespace and followed by a single space
        (a non-empty result therefore ends with a space). An empty string
        is returned when no word passes the confidence filter.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_data(rgb, output_type=Output.DICT, lang=lang)

    pieces = []
    for text, raw_conf in zip(results["text"], results["conf"]):
        # Depending on the Tesseract/pytesseract versions the confidence can
        # arrive as an int, a float, or a decimal string such as "96.283";
        # going through float() first avoids int("96.283") raising.
        try:
            conf = int(float(raw_conf))
        except (TypeError, ValueError):
            # A confidence that cannot be parsed is treated as "not confident".
            continue

        # Filter out weak-confidence text localizations.
        if conf > min_conf:
            # str() guards against all-digit words that pytesseract may have
            # already coerced to int.
            pieces.append(str(text).strip() + " ")

    return "".join(pieces)