Nassiraaa committed on
Commit
19f2047
·
verified ·
1 Parent(s): 09acf34

Create yolo_text_extraction.py

Browse files
Files changed (1) hide show
  1. yolo_text_extraction.py +98 -0
yolo_text_extraction.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ultralytics import YOLO
2
+ from PIL import Image,ImageDraw
3
+ import numpy as np
4
+ from PIL import ImageFilter
5
+
6
+ from dotenv import load_dotenv
7
+
8
+ import numpy as np
9
+ from ocr_functions import paddle_ocr,textract_ocr,tesseract_ocr
10
+ from pdf2image import convert_from_path
11
+
12
+
13
# Shared YOLO model, loaded once at import time from the local weights file.
model = YOLO("yolo_model/best.pt")
14
+
15
+
16
def check_intersection(bbox1, bbox2):
    """Return True when two ``(x1, y1, x2, y2)`` boxes overlap or touch.

    Boxes that merely share an edge or a corner count as intersecting,
    because only strict separation on an axis rules an overlap out.
    """
    a_left, a_top, a_right, a_bottom = bbox1
    b_left, b_top, b_right, b_bottom = bbox2

    # Separated along the x axis: bbox2 lies fully right or fully left of bbox1.
    if b_left > a_right or b_right < a_left:
        return False
    # Separated along the y axis: bbox2 lies fully below or fully above bbox1.
    if b_top > a_bottom or b_bottom < a_top:
        return False
    return True
21
+
22
def check_inclusion(bbox1, bbox2):
    """Return a truthy value when ``bbox1`` lies entirely inside ``bbox2``.

    Both boxes are ``(x1, y1, x2, y2)``. Boxes with identical coordinates
    count as included, since the comparisons are non-strict.
    """
    inner_left, inner_top, inner_right, inner_bottom = bbox1
    outer_left, outer_top, outer_right, outer_bottom = bbox2

    # The top-left corner of bbox1 must not fall before that of bbox2 ...
    starts_inside = inner_left >= outer_left and inner_top >= outer_top
    # ... and its bottom-right corner must not extend past that of bbox2.
    ends_inside = inner_right <= outer_right and inner_bottom <= outer_bottom
    return starts_inside and ends_inside
27
+
28
def union_bbox(bbox1, bbox2):
    """Return the smallest ``[x1, y1, x2, y2]`` box enclosing both inputs."""
    # Top-left corner: the smaller coordinate on each axis.
    top_left = [min(bbox1[axis], bbox2[axis]) for axis in (0, 1)]
    # Bottom-right corner: the larger coordinate on each axis.
    bottom_right = [max(bbox1[axis], bbox2[axis]) for axis in (2, 3)]
    return top_left + bottom_right
35
+
36
def filter_bboxes(bboxes):
    """Merge overlapping bounding boxes into a pairwise-disjoint list.

    Each incoming ``(x1, y1, x2, y2)`` box is unioned with every box kept
    so far that it intersects. A box contained in another also intersects
    it, so containment is handled by the same merge. Boxes that overlap
    nothing are kept unchanged.

    The previous implementation computed the union of two intersecting
    boxes but then discarded it, so every later overlapping detection was
    dropped and the area it covered was lost.

    Args:
        bboxes: iterable of ``(x1, y1, x2, y2)`` boxes.

    Returns:
        A list of boxes in which no two boxes intersect. A merged box is
        placed at the position of the latest box that contributed to it.
    """
    merged_bboxes = []
    for bbox in bboxes:
        current = bbox
        # Growing `current` can make it reach boxes that were disjoint from
        # it a moment ago, so rescan until a full pass merges nothing.
        while True:
            untouched = []
            grew = False
            for kept in merged_bboxes:
                if check_intersection(current, kept):
                    current = union_bbox(current, kept)
                    grew = True
                else:
                    untouched.append(kept)
            merged_bboxes = untouched
            if not grew:
                break
        merged_bboxes.append(current)
    return merged_bboxes
55
+
56
+
57
+
58
+
59
def draw_bboxes(image, bboxes):
    """Outline every box in ``bboxes`` on ``image`` in red, in place.

    Coordinates are truncated to integers before drawing. Nothing is
    returned; the image passed in is modified.
    """
    canvas = ImageDraw.Draw(image)
    red = (255, 0, 0)
    for left, top, right, bottom in bboxes:
        corners = [(int(left), int(top)), (int(right), int(bottom))]
        canvas.rectangle(corners, outline=red, width=2)
66
+
67
+
68
+
69
def extract_image(image, box):
    """Crop ``image`` to ``box`` and return the cropped region.

    Args:
        image: object exposing a ``crop((x1, y1, x2, y2))`` method
            (a PIL image in this module).
        box: sequence of exactly four coordinates ``x1, y1, x2, y2``.

    Returns:
        Whatever ``image.crop`` returns for the requested region. The
        previous version computed the crop but never returned it, so
        callers always received ``None``.
    """
    x1, y1, x2, y2 = box
    return image.crop((x1, y1, x2, y2))
72
+
73
+
74
def text_image(image):
    """Detect regions on a page image with YOLO and OCR each region.

    The image is converted to RGB and median-filtered, then passed to the
    module-level ``model``. Detected boxes are merged with
    ``filter_bboxes`` and each remaining box is sent to ``textract_ocr``.
    An annotated copy of the page is written to ``output.png``; every call
    overwrites that file.

    Args:
        image: a PIL image of one page.

    Returns:
        The OCR results joined by a ``------section-------`` separator
        line, with one leading separator. If nothing is detected, the
        result is just that single separator.
    """
    separator = "\n------section-------\n"

    image = image.convert("RGB")
    image = image.filter(ImageFilter.MedianFilter(3))
    result = model.predict(source=np.array(image), conf=0.10, save=False)

    # Move the tensor to host memory first: .numpy() raises on CUDA tensors.
    data = result[0].boxes.data.cpu().numpy()
    bboxes_filter = filter_bboxes(data[:, 0:4].tolist())

    # Column 5 holds the class id. NOTE(review): class 11 is presumably the
    # picture/figure class of this model -- confirm against its label map.
    image_box = data[data[:, 5] == 11]
    # Guard against pages with no class-11 detection, which used to raise
    # IndexError. NOTE(review): the crop result is not used here.
    if len(image_box) > 0:
        extract_image(image, image_box[0, 0:4])

    # Draw on a copy so the red outlines never end up in the OCR input.
    annotated = image.copy()
    draw_bboxes(annotated, bboxes_filter)
    annotated.save("output.png")

    # NOTE(review): textract_ocr is assumed to return a str per box, since
    # its results are joined below -- confirm in ocr_functions.
    texts = [textract_ocr(image, bbox) for bbox in bboxes_filter]
    return separator + separator.join(texts)
90
+
91
+
92
+
93
def pdf_to_text(pdf_file):
    """Return the text of every page of ``pdf_file``, one page after another.

    The PDF is converted to page images with ``convert_from_path`` and
    each page is processed by ``text_image``. Every page's text is
    followed by a newline.
    """
    pages = convert_from_path(pdf_file)
    return "".join(text_image(page) + "\n" for page in pages)