Spaces:

capitaletech
/

cv_quality

Sleeping

App Files Files Community

Nassiraaa committed on Aug 4, 2024

Commit

a4d1b09

verified ·

1 Parent(s): f311de3

Update ocr_extractor.py

Browse files

Files changed (1) hide show

ocr_extractor.py +88 -1

ocr_extractor.py CHANGED Viewed

@@ -49,4 +49,91 @@ def extract_text_doctr(image_path, doctr_model):
 def extract_text_easyocr(image_path, easyocr_reader):
     try:
         result = easyocr_reader.readtext(image_path)
-        return [(detection[1], detection[0], detection

 def extract_text_easyocr(image_path, easyocr_reader):
+    """Run EasyOCR on an image and return ``(text, bbox, confidence)`` tuples.
+
+    Every detection produced by ``easyocr_reader.readtext(image_path)`` is
+    read by index (0, 1, 2) and reordered so that element 1 comes first,
+    followed by element 0 and element 2.
+
+    Any exception raised while reading is printed and an empty list is
+    returned instead, so a failing engine never aborts the caller.
+    """
+    words = []
     try:
+        for detection in easyocr_reader.readtext(image_path):
+            bbox, text, confidence = detection[0], detection[1], detection[2]
+            words.append((text, bbox, confidence))
+    except Exception as err:
+        print(f"Error in EasyOCR: {str(err)}")
+        return []
+    return words
+def extract_text_paddleocr(image_path, paddleocr_reader):
+    """Run PaddleOCR on an image and return ``(text, bbox, confidence)`` tuples.
+
+    Only the first entry of the value returned by
+    ``paddleocr_reader.ocr(image_path, cls=True)`` is read. Each line in it
+    is indexed as ``[bbox, (text, confidence)]``.
+
+    Returns an empty list when nothing was detected. Any exception is printed
+    and also results in an empty list.
+    """
+    try:
+        result = paddleocr_reader.ocr(image_path, cls=True)
+        # An image without text can come back as an empty result or with a
+        # None first page; treat that as "no words" rather than letting the
+        # comprehension fail and log a misleading error.
+        if not result or not result[0]:
+            return []
+        return [(line[1][0], line[0], line[1][1]) for line in result[0]]
+    except Exception as e:
+        print(f"Error in PaddleOCR: {str(e)}")
+        return []
+def bbox_to_polygon(bbox):
+    """Convert a bounding box in one of three layouts into a ``Polygon``.
+
+    Supported layouts:
+      * a dict with 'Left', 'Top', 'Width' and 'Height' keys (AWS format);
+      * a sequence of four list/tuple points (EasyOCR format);
+      * a pair of corner points ``(bbox[0], bbox[1])`` (Doctr format).
+
+    Raises:
+        ValueError: if ``bbox`` matches none of the layouts above.
+    """
+    if isinstance(bbox, dict):  # AWS format
+        left, top = bbox['Left'], bbox['Top']
+        right = left + bbox['Width']
+        bottom = top + bbox['Height']
+        return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])
+
+    if len(bbox) == 4 and all(isinstance(pt, (list, tuple)) for pt in bbox):  # EasyOCR format
+        return Polygon(bbox)
+
+    if len(bbox) == 2:  # Doctr format
+        first_corner, second_corner = bbox[0], bbox[1]
+        x, y = first_corner[0], first_corner[1]
+        # Width/height are derived first and added back, exactly as before,
+        # so floating-point results stay unchanged.
+        w = second_corner[0] - x
+        h = second_corner[1] - y
+        right, bottom = x + w, y + h
+        return Polygon([(x, y), (right, y), (right, bottom), (x, bottom)])
+
+    raise ValueError(f"Unsupported bbox format: {bbox}")
+def combine_ocr_results(results, weights):
+    """Merge word detections from several OCR methods into a single string.
+
+    Args:
+        results: mapping of method name to an iterable of
+            ``(word, bbox, confidence)`` tuples.
+        weights: mapping of method name to the factor applied to that
+            method's confidences.
+
+    Returns:
+        The selected words joined with single spaces. For each group of
+        overlapping boxes only the word with the highest weighted confidence
+        is kept.
+
+    Words whose box cannot be converted, whose confidence is not numeric, or
+    whose method has no weight are reported with ``print`` and skipped.
+    """
+    combined_words = []
+    for method, words in results.items():
+        for word, bbox, confidence in words:
+            try:
+                polygon = bbox_to_polygon(bbox)
+                combined_words.append((word, polygon, float(confidence) * weights[method]))
+            except Exception as e:
+                print(f"Error processing word '{word}' from {method}: {str(e)}")
+
+    # NOTE(review): overlap tests assume every method reports boxes in the
+    # same coordinate space (pixels vs. normalised) -- TODO confirm.
+    # NOTE(review): grouping is greedy around the first pending word, so a
+    # word that overlaps only another member of the group is not merged.
+    final_words = []
+    pending = combined_words
+    while pending:
+        current_word = pending[0]
+        # Split the rest in one order-preserving pass instead of calling
+        # list.remove per overlap, which rescans the list and compares
+        # tuples (and therefore geometries) by equality.
+        overlapping, remaining = [], []
+        for candidate in pending[1:]:
+            if current_word[1].intersects(candidate[1]):
+                overlapping.append(candidate)
+            else:
+                remaining.append(candidate)
+        # max() returns the first of equally scored words, so the current
+        # word wins ties against later overlaps.
+        best_word = max([current_word] + overlapping, key=lambda w: w[2])
+        final_words.append(best_word[0])
+        pending = remaining
+    return ' '.join(final_words)
+def detect_language(text):
+    """Return the language that ``langid.classify`` reports for ``text``.
+
+    ``langid.classify`` returns a pair; only its first element is returned,
+    the second is discarded.
+    """
+    detected, _unused = langid.classify(text)
+    return detected
+def process_file(file_path, weights_file):
+    """Extract the text of a PDF or image file.
+
+    Args:
+        file_path: path of the file to read. A '.pdf' extension
+            (case-insensitive) selects direct PDF text extraction; anything
+            else is treated as an image and sent through the OCR engines.
+        weights_file: path of a JSON file mapping OCR method names to
+            confidence weights. It is only read for image input.
+
+    Returns:
+        For PDFs, the text of every page, each followed by a newline.
+        For images, the string produced by ``combine_ocr_results``.
+    """
+    _, file_extension = os.path.splitext(file_path)
+    if file_extension.lower() == '.pdf':
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            # Join once instead of growing a string page by page.
+            return "".join(page.extract_text() + "\n" for page in pdf_reader.pages)
+
+    # Anything that is not a PDF is assumed to be an image file.
+    # JSON is UTF-8 by specification; do not depend on the locale default.
+    with open(weights_file, 'r', encoding='utf-8') as f:
+        weights = json.load(f)
+    with open(file_path, 'rb') as image_file:
+        image_bytes = image_file.read()
+
+    # Detect the language from a sample of the AWS Textract output: the text
+    # (first element) of at most the first ten items.
+    aws_results = extract_text_aws(image_bytes)
+    sample_text = ' '.join([item[0] for item in aws_results[:10]])
+    detected_language = detect_language(sample_text)
+
+    # The remaining models are loaded for the detected language.
+    doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)
+    results = {
+        "aws": aws_results,
+        "doctr": extract_text_doctr(file_path, doctr_model),
+        "easyocr": extract_text_easyocr(file_path, easyocr_reader),
+        "paddleocr": extract_text_paddleocr(file_path, paddleocr_reader),
+    }
+    return combine_ocr_results(results, weights)