Nassiraaa committed on
Commit
52a1b2f
·
verified ·
1 Parent(s): cbc2d14

Delete ocr_extractor.py

Browse files
Files changed (1) hide show
  1. ocr_extractor.py +0 -139
ocr_extractor.py DELETED
@@ -1,139 +0,0 @@
1
- import sys
2
- import importlib
3
- from PIL import Image
4
- import boto3
5
- import os
6
- from doctr.io import DocumentFile
7
- from doctr.models import ocr_predictor
8
- import easyocr
9
- from shapely.geometry import Polygon
10
- from paddleocr import PaddleOCR
11
- import langid
12
- import json
13
- import PyPDF2
14
-
15
# Fail fast when python-bidi is missing.
# NOTE(review): presumably one of the OCR engines imported above needs the
# ``bidi`` package at runtime -- confirm which one, and whether its own import
# would already have failed before this check is reached.
#
# ``importlib.util`` is a submodule; a bare ``import importlib`` does not
# guarantee it is loaded, so it is imported explicitly before use.
import importlib.util

if importlib.util.find_spec("bidi") is None:
    # Fatal diagnostics go to stderr so they never mix with regular output.
    print(
        "Error: python-bidi is not installed. Please install it using pip install python-bidi",
        file=sys.stderr,
    )
    sys.exit(1)
19
-
20
# Build the local OCR engines
def load_models(language):
    """Instantiate the three local OCR engines for ``language``.

    Args:
        language: Language code passed unchanged to ``easyocr.Reader`` (as a
            one-element list) and to ``PaddleOCR`` (as ``lang``). The docTR
            predictor does not receive it.

    Returns:
        A ``(doctr_model, easyocr_reader, paddleocr_reader)`` tuple.
    """
    # Tuple elements are evaluated left to right, so the engines are created
    # in the order docTR, EasyOCR, PaddleOCR.
    return (
        ocr_predictor(pretrained=True),
        easyocr.Reader([language]),
        PaddleOCR(use_angle_cls=True, lang=language),
    )
26
-
27
# Shared AWS Textract client: created once when the module is imported and
# pinned to the us-west-2 region. extract_text_aws() sends every request
# through this object.
textract_client = boto3.client(
    'textract',
    region_name='us-west-2',
)
29
-
30
def extract_text_aws(image_bytes):
    """Run AWS Textract text detection on raw document bytes.

    Args:
        image_bytes: Encoded document content, sent to Textract as
            ``Document={'Bytes': image_bytes}``.

    Returns:
        A list of ``(text, bounding_box, confidence)`` tuples, one for each
        response block whose ``BlockType`` is ``'WORD'``. ``bounding_box`` is
        the block's ``Geometry['BoundingBox']`` value. If anything raises, the
        error is printed and an empty list is returned.
    """
    try:
        response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
        words = []
        for block in response['Blocks']:
            # Skip everything that is not an individual word.
            if block['BlockType'] != 'WORD':
                continue
            words.append(
                (block['Text'], block['Geometry']['BoundingBox'], block['Confidence'])
            )
        return words
    except Exception as e:
        print(f"Error in AWS Textract: {str(e)}")
        return []
38
-
39
def extract_text_doctr(image_path, doctr_model):
    """Run a docTR predictor on one image and flatten its word predictions.

    Args:
        image_path: Path handed to ``DocumentFile.from_images``.
        doctr_model: Callable predictor applied to the loaded document.

    Returns:
        A list of ``(value, geometry, confidence)`` tuples for every word on
        the first page of the result, in block -> line -> word order. If
        anything raises, the error is printed and an empty list is returned.
    """
    try:
        document = DocumentFile.from_images(image_path)
        prediction = doctr_model(document)
        words = []
        # Only the first page of the prediction is read.
        for block in prediction.pages[0].blocks:
            for line in block.lines:
                for word in line.words:
                    words.append((word.value, word.geometry, word.confidence))
        return words
    except Exception as e:
        print(f"Error in Doctr OCR: {str(e)}")
        return []
48
-
49
def extract_text_easyocr(image_path, easyocr_reader):
    """Run EasyOCR on an image and reorder each detection.

    Args:
        image_path: Image location passed to ``easyocr_reader.readtext``.
        easyocr_reader: Object exposing ``readtext(image_path)``.

    Returns:
        A list of ``(text, bbox, confidence)`` tuples. Each detection from the
        reader is indexed as ``[bbox, text, confidence]`` and re-emitted with
        the text first. If anything raises, the error is printed and an empty
        list is returned.
    """
    try:
        words = []
        for detection in easyocr_reader.readtext(image_path):
            bbox, text, score = detection[0], detection[1], detection[2]
            words.append((text, bbox, score))
        return words
    except Exception as e:
        print(f"Error in EasyOCR: {str(e)}")
        return []
56
-
57
def extract_text_paddleocr(image_path, paddleocr_reader):
    """Run PaddleOCR on an image and flatten the first page's detections.

    Args:
        image_path: Image location passed to ``paddleocr_reader.ocr``.
        paddleocr_reader: Object exposing ``ocr(image_path, cls=True)``.

    Returns:
        A list of ``(text, bbox, confidence)`` tuples. Each line of the first
        result page is indexed as ``[bbox, (text, confidence)]``. An empty
        list is returned when no text was detected. If anything raises, the
        error is printed and an empty list is returned.
    """
    try:
        result = paddleocr_reader.ocr(image_path, cls=True)
        # A page without any detected text is a normal outcome, not an error.
        # NOTE(review): some PaddleOCR releases report such a page as ``None``
        # (i.e. ``[None]``) -- handle it quietly instead of letting iteration
        # fail and print a misleading error message.
        if not result or not result[0]:
            return []
        return [(line[1][0], line[0], line[1][1]) for line in result[0]]
    except Exception as e:
        print(f"Error in PaddleOCR: {str(e)}")
        return []
64
-
65
def bbox_to_polygon(bbox):
    """Convert a bounding box in one of three layouts to a shapely ``Polygon``.

    Supported layouts, checked in this order:

    * a dict with ``Left``/``Top``/``Width``/``Height`` keys (AWS format);
    * a sequence of four points, each a list or tuple (EasyOCR format),
      passed to ``Polygon`` unchanged;
    * a sequence of two points read as opposite rectangle corners
      (Doctr format).

    Args:
        bbox: Bounding box in one of the layouts above.

    Returns:
        A ``Polygon`` whose rectangle corners are listed top-left, top-right,
        bottom-right, bottom-left for the dict and two-point layouts.

    Raises:
        ValueError: If ``bbox`` matches none of the layouts.
    """
    # AWS format: origin plus extent.
    if isinstance(bbox, dict):
        left, top = bbox['Left'], bbox['Top']
        right = left + bbox['Width']
        bottom = top + bbox['Height']
        return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])

    # EasyOCR format: already a list of four corner points.
    if len(bbox) == 4 and all(isinstance(p, (list, tuple)) for p in bbox):
        return Polygon(bbox)

    # Doctr format: two opposite corners.
    if len(bbox) == 2:
        first, second = bbox[0], bbox[1]
        x, y = first[0], first[1]
        # The far corner is rebuilt as origin + extent (not read directly).
        width = second[0] - x
        height = second[1] - y
        right, bottom = x + width, y + height
        return Polygon([(x, y), (right, y), (right, bottom), (x, bottom)])

    raise ValueError(f"Unsupported bbox format: {bbox}")
78
-
79
def combine_ocr_results(results, weights):
    """Merge per-engine word lists into one space-separated string.

    Every word is converted to ``(text, polygon, confidence * weight)``, where
    the weight is looked up by engine name. Words are then consumed front to
    back: the head word is grouped with every remaining word whose polygon
    intersects it, the highest-scoring member of the group is kept (the
    earliest one on ties), and the whole group is dropped from the pool.

    Args:
        results: Mapping of engine name to an iterable of
            ``(word, bbox, confidence)`` tuples.
        weights: Mapping of engine name to a numeric multiplier.

    Returns:
        The kept words joined with single spaces; ``''`` when there are none.
    """
    candidates = []
    for method, words in results.items():
        for word, bbox, confidence in words:
            try:
                candidates.append(
                    (word, bbox_to_polygon(bbox), float(confidence) * weights[method])
                )
            except Exception as e:
                # A bad box, confidence or missing weight skips just this word.
                print(f"Error processing word '{word}' from {method}: {str(e)}")

    final_words = []
    while candidates:
        head, rest = candidates[0], candidates[1:]

        # Split the remaining words into those touching the head and the rest,
        # keeping their relative order.
        overlapping, remaining = [], []
        for other in rest:
            if head[1].intersects(other[1]):
                overlapping.append(other)
            else:
                remaining.append(other)

        if overlapping:
            winner = max([head] + overlapping, key=lambda x: x[2])
        else:
            winner = head
        final_words.append(winner[0])
        candidates = remaining

    return ' '.join(final_words)
102
-
103
def detect_language(text):
    """Return the language label that ``langid.classify`` assigns to ``text``.

    The score that ``classify`` returns alongside the label is discarded.
    """
    return langid.classify(text)[0]
106
-
107
def _read_image_size(file_path):
    """Return ``(width, height)`` of the image in pixels, or ``None`` on failure."""
    try:
        with Image.open(file_path) as image:
            return image.size
    except Exception as e:
        print(f"Error reading image size: {str(e)}")
        return None


def _pixel_boxes_to_relative(words, size):
    """Rescale four-point pixel boxes to page-relative coordinates.

    ``words`` is returned unchanged when ``size`` is unknown or degenerate.
    """
    if not size or not size[0] or not size[1]:
        return words
    width, height = size
    return [
        (text, [(point[0] / width, point[1] / height) for point in box], confidence)
        for text, box, confidence in words
    ]


def process_file(file_path, weights_file):
    """Extract text from a PDF or an image file.

    PDFs are read with PyPDF2 only: the text of every page is concatenated,
    each page followed by a newline, and ``weights_file`` is not opened.

    Any other extension is treated as an image. AWS Textract runs first; its
    first ten words are used to detect the language, the local OCR engines
    are loaded for that language, and all four result sets are merged with
    ``combine_ocr_results`` using the weights from ``weights_file``.

    Args:
        file_path: Path of the PDF or image to process.
        weights_file: Path of a JSON file mapping the engine names ``aws``,
            ``doctr``, ``easyocr`` and ``paddleocr`` to numeric weights.

    Returns:
        The extracted text as a single string.
    """
    _, file_extension = os.path.splitext(file_path)

    if file_extension.lower() == '.pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # join() avoids quadratic string growth; ``or ""`` tolerates a
            # page for which no text could be extracted.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

    # Anything else is assumed to be an image file.
    with open(weights_file, 'r') as f:
        weights = json.load(f)

    with open(file_path, 'rb') as image_file:
        image_bytes = image_file.read()

    # Detect language using a sample of text from AWS Textract. If Textract
    # returned nothing the sample is an empty string.
    aws_results = extract_text_aws(image_bytes)
    sample_text = ' '.join([item[0] for item in aws_results[:10]])
    detected_language = detect_language(sample_text)

    doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)

    # Per the libraries' documentation, Textract and docTR report boxes as
    # fractions of the page, while EasyOCR and PaddleOCR report pixel corner
    # points. The overlap test in combine_ocr_results only makes sense when
    # all boxes share one coordinate space, so the pixel boxes are rescaled
    # to page-relative coordinates here.
    image_size = _read_image_size(file_path)

    # NOTE(review): Textract confidences are documented as 0-100 whereas the
    # other engines use 0-1 -- confirm the weights file compensates for that.
    results = {
        "aws": aws_results,
        "doctr": extract_text_doctr(file_path, doctr_model),
        "easyocr": _pixel_boxes_to_relative(
            extract_text_easyocr(file_path, easyocr_reader), image_size
        ),
        "paddleocr": _pixel_boxes_to_relative(
            extract_text_paddleocr(file_path, paddleocr_reader), image_size
        ),
    }

    return combine_ocr_results(results, weights)