Update app.py
Browse files
app.py
CHANGED
|
@@ -33,7 +33,7 @@ from PIL import Image
|
|
| 33 |
from PyPDF2 import PdfFileReader
|
| 34 |
from pdf2image import convert_from_bytes
|
| 35 |
import pdfplumber
|
| 36 |
-
from line_cor import mark_region
|
| 37 |
import pdf2image
|
| 38 |
|
| 39 |
|
|
@@ -48,6 +48,42 @@ import pytesseract
|
|
| 48 |
|
| 49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
| 50 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
@st.experimental_singleton
|
| 52 |
def read_pdf(file):
|
| 53 |
images=pdf2image.convert_from_path(file)
|
|
|
|
| 33 |
from PyPDF2 import PdfFileReader
|
| 34 |
from pdf2image import convert_from_bytes
|
| 35 |
import pdfplumber
|
| 36 |
+
#from line_cor import mark_region
|
| 37 |
import pdf2image
|
| 38 |
|
| 39 |
|
|
|
|
| 48 |
|
| 49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
| 50 |
from PIL import Image
|
| 51 |
+
def mark_region(im):
    """Outline candidate text regions on a page image and collect their boxes.

    The image is converted to grayscale, blurred, adaptively thresholded
    (inverted) and dilated so that neighbouring marks merge into blobs. The
    bounding box of every external contour is then filtered by position and
    area; accepted boxes are drawn onto ``im`` and recorded.

    Args:
        im: Image array accepted by ``cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)``
            (presumably a BGR page render -- confirm against the caller).
            Rectangles are drawn directly onto this array.

    Returns:
        A tuple ``(image, line_items_coordinates)`` where ``image`` is the
        annotated image and ``line_items_coordinates`` is a list of
        ``[(x, y), (2200, y + h)]`` corner pairs, one per accepted region.
        When no region qualifies, ``image`` is the unmodified input and the
        list is empty.
    """
    # The caller passes an already-loaded image; nothing is read from disk here.
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        30,
    )

    # Dilate to combine adjacent text contours.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs.
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns a 2-tuple or a 3-tuple depending on the OpenCV
    # version; the contour list sits at a different index in each.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Start from the input so the return value is defined even when no
    # contour passes the filters below (previously an UnboundLocalError).
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, w, h = cv2.boundingRect(c)

        # Pixel thresholds (600/1000/2400/2000/2200) are presumably tuned for
        # one specific page resolution -- TODO confirm before reusing elsewhere.
        if y >= 600 and x <= 1000:
            if area > 10000:
                image = cv2.rectangle(
                    im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
                )
                line_items_coordinates.append([(x, y), (2200, y + h)])

            # NOTE(review): the nesting of this check was reconstructed from a
            # view without indentation; it is kept inside the outer position
            # filter -- confirm. A contour that passes both this test and the
            # area test above is recorded twice.
            if y >= 2400 and x <= 2000:
                image = cv2.rectangle(
                    im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
                )
                line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
@st.experimental_singleton
|
| 88 |
def read_pdf(file):
|
| 89 |
images=pdf2image.convert_from_path(file)
|