Nassiraaa committed on
Commit
a4d1b09
·
verified ·
1 Parent(s): f311de3

Update ocr_extractor.py

Browse files
Files changed (1) hide show
  1. ocr_extractor.py +88 -1
ocr_extractor.py CHANGED
@@ -49,4 +49,91 @@ def extract_text_doctr(image_path, doctr_model):
49
  def extract_text_easyocr(image_path, easyocr_reader):
50
  try:
51
  result = easyocr_reader.readtext(image_path)
52
- return [(detection[1], detection[0], detection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def extract_text_easyocr(image_path, easyocr_reader):
    """Run EasyOCR on an image and reorder each detection's fields.

    Args:
        image_path: Value passed straight to ``easyocr_reader.readtext``.
        easyocr_reader: Object exposing a ``readtext(image_path)`` method.

    Returns:
        A list with one tuple per detection, built from the detection's
        first three items as ``(item[1], item[0], item[2])``.
        ``combine_ocr_results`` unpacks these as ``(word, bbox, confidence)``.
        An empty list is returned if anything raises along the way.
    """
    reordered = []
    try:
        for entry in easyocr_reader.readtext(image_path):
            reordered.append((entry[1], entry[0], entry[2]))
    except Exception as e:
        # Best effort: report the failure and let the caller carry on
        # with whatever the other OCR engines produced.
        print(f"Error in EasyOCR: {str(e)}")
        return []
    return reordered
56
+
def extract_text_paddleocr(image_path, paddleocr_reader):
    """Run PaddleOCR on an image and reorder each detected line's fields.

    Args:
        image_path: Value passed straight to ``paddleocr_reader.ocr``.
        paddleocr_reader: Object exposing ``ocr(image_path, cls=True)``.

    Returns:
        A list with one tuple per entry of ``result[0]``, built as
        ``(line[1][0], line[0], line[1][1])``. ``combine_ocr_results``
        unpacks these as ``(word, bbox, confidence)``. An empty list is
        returned when nothing was detected or when anything raises.
    """
    try:
        result = paddleocr_reader.ocr(image_path, cls=True)
        # An image without text can come back as None, [] or [None].
        # That is a normal empty result, so return quietly instead of
        # letting the indexing below fail and be reported as an error.
        if not result or not result[0]:
            return []
        return [(line[1][0], line[0], line[1][1]) for line in result[0]]
    except Exception as e:
        # Best effort: report the failure and let the caller carry on
        # with whatever the other OCR engines produced.
        print(f"Error in PaddleOCR: {str(e)}")
        return []
64
+
def bbox_to_polygon(bbox):
    """Convert one of the supported bounding-box layouts into a ``Polygon``.

    Supported layouts, checked in this order:
        * dict with ``'Left'``, ``'Top'``, ``'Width'``, ``'Height'`` keys
          (AWS format);
        * sequence of four list/tuple points (EasyOCR format), passed to
          ``Polygon`` unchanged;
        * sequence of two points read as opposite corners (Doctr format).

    Raises:
        ValueError: If ``bbox`` matches none of the layouts above.
    """
    if isinstance(bbox, dict):  # AWS format
        left, top = bbox['Left'], bbox['Top']
        right = bbox['Left'] + bbox['Width']
        bottom = bbox['Top'] + bbox['Height']
        return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])

    point_count = len(bbox)
    if point_count == 4 and all(isinstance(pt, (list, tuple)) for pt in bbox):  # EasyOCR format
        return Polygon(bbox)

    if point_count != 2:
        raise ValueError(f"Unsupported bbox format: {bbox}")

    # Doctr format: two corner points, expanded into four rectangle corners.
    near, far = bbox[0], bbox[1]
    x, y = near[0], near[1]
    w = far[0] - near[0]
    h = far[1] - near[1]
    return Polygon([(x, y), (x+w, y), (x+w, y+h), (x, y+h)])
78
+
def combine_ocr_results(results, weights):
    """Merge word detections from several OCR methods into one string.

    Args:
        results: Mapping of method name to an iterable of
            ``(word, bbox, confidence)`` triples.
        weights: Mapping of method name to the factor its confidences are
            multiplied by.

    Returns:
        The selected words joined by single spaces. Words are taken in
        collection order. Each word is grouped with every later word whose
        polygon intersects it, and only the highest weighted-confidence
        word of that group is kept.

    A word whose bbox conversion or weighting raises is reported with
    ``print`` and left out.
    """
    pending = []
    for method, words in results.items():
        for word, bbox, confidence in words:
            try:
                shape = bbox_to_polygon(bbox)
                score = float(confidence) * weights[method]
            except Exception as e:
                print(f"Error processing word '{word}' from {method}: {str(e)}")
            else:
                pending.append((word, shape, score))

    chosen = []
    while pending:
        head, rest = pending[0], pending[1:]
        # Split what is left into words overlapping the head and words
        # that still have to be processed on a later pass.
        rivals, pending = [], []
        for candidate in rest:
            bucket = rivals if head[1].intersects(candidate[1]) else pending
            bucket.append(candidate)
        # The head comes first, so it wins ties on the weighted score.
        winner = max([head] + rivals, key=lambda item: item[2])
        chosen.append(winner[0])

    return ' '.join(chosen)
102
+
def detect_language(text):
    """Return the language that ``langid.classify`` reports for ``text``.

    ``langid.classify`` yields a two-item result; only the first item is
    returned and the second is discarded.
    """
    classification = langid.classify(text)
    label, _unused = classification
    return label
106
+
def process_file(file_path, weights_file):
    """Extract text from a PDF or, for any other extension, an image.

    Args:
        file_path: Path of the file to read. A ``.pdf`` extension (any
            letter case) selects the PDF branch; everything else is
            treated as an image.
        weights_file: Path of a JSON file holding the per-method weights
            handed to ``combine_ocr_results``. Only read for images.

    Returns:
        For a PDF, the text of every page with a newline after each one.
        For an image, the string produced by ``combine_ocr_results`` from
        the AWS, Doctr, EasyOCR and PaddleOCR extractors.
    """
    extension = os.path.splitext(file_path)[1]

    if extension.lower() == '.pdf':
        with open(file_path, 'rb') as file:
            pages = PyPDF2.PdfReader(file).pages
            return "".join(page.extract_text() + "\n" for page in pages)

    # Assume it's an image file
    with open(weights_file, 'r') as f:
        weights = json.load(f)

    with open(file_path, 'rb') as image_file:
        image_bytes = image_file.read()

    # Detect language using a sample of text from AWS Textract:
    # the first item of up to ten AWS results, joined by spaces.
    aws_results = extract_text_aws(image_bytes)
    detected_language = detect_language(
        ' '.join(entry[0] for entry in aws_results[:10])
    )

    doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)

    # The AWS pass is reused; the other engines run in this order.
    results = {"aws": aws_results}
    results["doctr"] = extract_text_doctr(file_path, doctr_model)
    results["easyocr"] = extract_text_easyocr(file_path, easyocr_reader)
    results["paddleocr"] = extract_text_paddleocr(file_path, paddleocr_reader)

    return combine_ocr_results(results, weights)