Nassiraaa committed on
Commit
6f9afdd
·
verified ·
1 Parent(s): 34a8c90

Delete ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +0 -114
ocr_utils.py DELETED
@@ -1,114 +0,0 @@
1
- import sys
2
- import importlib
3
- from PIL import Image
4
- import boto3
5
- from doctr.io import DocumentFile
6
- from doctr.models import ocr_predictor
7
- import easyocr
8
- from shapely.geometry import Polygon
9
- from paddleocr import PaddleOCR
10
- import langid
11
- import numpy as np
12
- import logging
13
-
14
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Check if python-bidi is installed.
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import it explicitly before calling find_spec.
import importlib.util  # noqa: E402

if importlib.util.find_spec("bidi") is None:
    logging.error("Error: python-bidi is not installed. Please add it to requirements.txt")
    sys.exit(1)

# AWS Textract client, shared by extract_text_aws
textract_client = boto3.client('textract', region_name='us-west-2')
24
-
25
def load_models(language):
    """Instantiate the docTR, EasyOCR and PaddleOCR engines.

    Args:
        language: Language code passed to ``easyocr.Reader`` (as a
            one-element list) and to ``PaddleOCR`` (as ``lang``).

    Returns:
        A ``(doctr_model, easyocr_reader, paddleocr_reader)`` tuple, or
        ``(None, None, None)`` if constructing any engine raises; the
        error is logged in that case.
    """
    try:
        engines = (
            ocr_predictor(pretrained=True),
            easyocr.Reader([language]),
            PaddleOCR(use_angle_cls=True, lang=language),
        )
    except Exception as exc:
        logging.error(f"Error loading models: {exc}")
        return None, None, None
    return engines
34
-
35
def extract_text_aws(file_content):
    """Run AWS Textract text detection on raw document bytes.

    Args:
        file_content: Bytes sent as ``Document={'Bytes': ...}``.

    Returns:
        A list of ``(text, bounding_box, confidence)`` tuples, one per
        response block whose ``BlockType`` is ``'WORD'``. Any exception is
        logged and an empty list is returned.
    """
    try:
        response = textract_client.detect_document_text(Document={'Bytes': file_content})
        words = []
        for block in response['Blocks']:
            if block['BlockType'] != 'WORD':
                continue
            words.append((block['Text'], block['Geometry']['BoundingBox'], block['Confidence']))
        return words
    except Exception as exc:
        logging.error(f"Error in AWS Textract: {exc}")
        return []
43
-
44
def extract_text_doctr(image, model):
    """Run a docTR predictor on a single image.

    Args:
        image: Input handed to ``DocumentFile.from_images`` inside a
            one-element list.
        model: Callable predictor applied to the loaded document.

    Returns:
        A list of ``(value, geometry, confidence)`` tuples for every word
        on the first page of the result. Any exception is logged and an
        empty list is returned.
    """
    try:
        prediction = model(DocumentFile.from_images([image]))
        words = []
        # Only the first page is read: exactly one image is submitted.
        for block in prediction.pages[0].blocks:
            for line in block.lines:
                for word in line.words:
                    words.append((word.value, word.geometry, word.confidence))
        return words
    except Exception as exc:
        logging.error(f"Error in Doctr OCR: {exc}")
        return []
53
-
54
def extract_text_easyocr(image, reader):
    """Run an EasyOCR-style reader on an image.

    Args:
        image: Anything ``np.array`` accepts; it is converted before being
            passed to ``reader.readtext``.
        reader: Object exposing ``readtext(array)``.

    Returns:
        A list of ``(text, bbox, confidence)`` tuples, reordered from the
        reader's items (index 1, index 0, index 2). Any exception is
        logged and an empty list is returned.
    """
    try:
        words = []
        for detection in reader.readtext(np.array(image)):
            box, text, score = detection[0], detection[1], detection[2]
            words.append((text, box, score))
        return words
    except Exception as exc:
        logging.error(f"Error in EasyOCR: {exc}")
        return []
61
-
62
def extract_text_paddleocr(image, reader):
    """Run a PaddleOCR-style reader on an image.

    Args:
        image: Input passed straight to ``reader.ocr``.
        reader: Object exposing ``ocr(image, cls=True)``.

    Returns:
        A list of ``(text, bbox, confidence)`` tuples built from the first
        page of the result. An empty list is returned when nothing was
        detected, and also (after logging) when any exception is raised.
    """
    try:
        result = reader.ocr(image, cls=True)
        # With no detections the result may be None, [] or [None]; treat all
        # of them as "no words" instead of failing on iteration and logging
        # a misleading error.
        page = result[0] if result else None
        if not page:
            return []
        return [(line[1][0], line[0], line[1][1]) for line in page]
    except Exception as e:
        logging.error(f"Error in PaddleOCR: {str(e)}")
        return []
69
-
70
def bbox_to_polygon(bbox):
    """Convert a bounding box in one of three layouts to a shapely Polygon.

    Supported layouts:
      * dict with ``Left``/``Top``/``Width``/``Height`` keys (AWS format);
      * sequence of four list/tuple points (EasyOCR format);
      * sequence of two corner points ``(min, max)`` (Doctr format).

    Raises:
        ValueError: If ``bbox`` matches none of the layouts above.

    NOTE(review): no rescaling happens here, so polygons from different
    engines are only comparable if they share a coordinate system — confirm
    against the callers.
    """
    # AWS format
    if isinstance(bbox, dict):
        left, top = bbox['Left'], bbox['Top']
        right = left + bbox['Width']
        bottom = top + bbox['Height']
        return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])

    # EasyOCR format: four explicit corner points
    if len(bbox) == 4 and all(isinstance(point, (list, tuple)) for point in bbox):
        return Polygon(bbox)

    # Doctr format: two opposite corners
    if len(bbox) == 2:
        first_corner, second_corner = bbox[0], bbox[1]
        x, y = first_corner[0], first_corner[1]
        w = second_corner[0] - x
        h = second_corner[1] - y
        return Polygon([(x, y), (x + w, y), (x + w, y + h), (x, y + h)])

    raise ValueError(f"Unsupported bbox format: {bbox}")
83
-
84
def combine_ocr_results(results, weights):
    """Merge word detections from several OCR engines into one string.

    Args:
        results: Mapping of method name to an iterable of
            ``(word, bbox, confidence)`` tuples.
        weights: Mapping of method name to the factor applied to that
            method's confidences.

    Returns:
        The selected words joined with single spaces.

    Each word's bbox is converted with ``bbox_to_polygon`` and its
    confidence multiplied by the method weight; words for which this fails
    are logged and skipped. The remaining words are consumed front to back:
    the head word and every later word whose polygon intersects it form a
    group, the text with the highest weighted score in the group is kept,
    and the whole group is discarded.
    """
    candidates = []
    for method, words in results.items():
        for word, bbox, confidence in words:
            try:
                scored = (word, bbox_to_polygon(bbox), float(confidence) * weights[method])
            except Exception as exc:
                logging.error(f"Error processing word '{word}' from {method}: {exc}")
            else:
                candidates.append(scored)

    chosen = []
    while candidates:
        head = candidates.pop(0)
        rivals = [other for other in candidates if head[1].intersects(other[1])]
        # With no rivals the group is just the head, so max() returns it.
        winner = max([head, *rivals], key=lambda entry: entry[2])
        chosen.append(winner[0])
        for rival in rivals:
            candidates.remove(rival)

    return ' '.join(chosen)
107
-
108
def detect_language(text):
    """Return the language code that ``langid.classify`` assigns to ``text``.

    Any exception is logged and ``'en'`` is returned as the default.
    """
    try:
        code, _score = langid.classify(text)
    except Exception as exc:
        logging.error(f"Error in language detection: {exc}")
        return 'en'  # Default to English
    return code