Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Soumen committed on Nov 25, 2022

Commit

f4332f9

1 Parent(s): 5aeb295

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -1

app.py CHANGED Viewed

@@ -33,7 +33,7 @@ from PIL import Image
 from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_bytes
 import pdfplumber
-from line_cor import mark_region
 import pdf2image
@@ -48,6 +48,42 @@ import pytesseract
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
 def read_pdf(file):
     images=pdf2image.convert_from_path(file)

 from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_bytes
 import pdfplumber
+#from line_cor import mark_region
 import pdf2image
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
+def mark_region(im):
+    #im = cv2.imread(image_path)
+    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    blur = cv2.GaussianBlur(gray, (9,9), 0)
+    thresh = cv2.adaptiveThreshold(blur,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,11,30)
+    # Dilate to combine adjacent text contours
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
+    dilate = cv2.dilate(thresh, kernel, iterations=4)
+    # Find contours, highlight text areas, and extract ROIs
+    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
+    line_items_coordinates = []
+    for c in cnts:
+        area = cv2.contourArea(c)
+        x,y,w,h = cv2.boundingRect(c)
+        if y >= 600 and x <= 1000:
+            if area > 10000:
+                image = cv2.rectangle(im, (x,y), (2200, y+h), color=(255,0,255), thickness=3)
+                line_items_coordinates.append([(x,y), (2200, y+h)])
+        if y >= 2400 and x<= 2000:
+            image = cv2.rectangle(im, (x,y), (2200, y+h), color=(255,0,255), thickness=3)
+            line_items_coordinates.append([(x,y), (2200, y+h)])
+    return image, line_items_coordinates
 @st.experimental_singleton
 def read_pdf(file):
     images=pdf2image.convert_from_path(file)