Spaces:

capitaletech
/

cv_quality

Sleeping

App Files Files Community

Nassiraaa committed on Aug 3, 2024

Commit

6f9afdd

verified ·

1 Parent(s): 34a8c90

Delete ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +0 -114

ocr_utils.py DELETED Viewed

@@ -1,114 +0,0 @@
-import sys
-import importlib
-from PIL import Image
-import boto3
-from doctr.io import DocumentFile
-from doctr.models import ocr_predictor
-import easyocr
-from shapely.geometry import Polygon
-from paddleocr import PaddleOCR
-import langid
-import numpy as np
-import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Check if python-bidi is installed; abort at import time when it is missing.
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import it explicitly before using `find_spec`.
import importlib.util

if importlib.util.find_spec("bidi") is None:
    logging.error("Error: python-bidi is not installed. Please add it to requirements.txt")
    sys.exit(1)

# AWS Textract client (shared by extract_text_aws)
textract_client = boto3.client('textract', region_name='us-west-2')
def load_models(language):
    """Instantiate the three local OCR engines for ``language``.

    Args:
        language: Language code handed unchanged to both ``easyocr.Reader``
            and ``PaddleOCR``.
            NOTE(review): the two libraries may not accept the same set of
            language codes -- confirm against the callers.

    Returns:
        A ``(doctr_model, easyocr_reader, paddleocr_reader)`` tuple, or
        ``(None, None, None)`` when any constructor raises (the error is
        logged, not propagated).
    """
    try:
        # Tuple elements are evaluated left to right: Doctr, EasyOCR, PaddleOCR.
        engines = (
            ocr_predictor(pretrained=True),
            easyocr.Reader([language]),
            PaddleOCR(use_angle_cls=True, lang=language),
        )
    except Exception as e:
        logging.error(f"Error loading models: {str(e)}")
        return None, None, None
    return engines
def extract_text_aws(file_content):
    """Run AWS Textract on a document and collect its WORD blocks.

    Args:
        file_content: Raw document bytes, sent as ``Document={'Bytes': ...}``.

    Returns:
        A list of ``(text, bounding_box, confidence)`` tuples, one per block
        whose ``BlockType`` is ``'WORD'``; ``bounding_box`` is the block's
        ``Geometry['BoundingBox']`` value. Returns ``[]`` on any error (the
        error is logged).

    NOTE(review): Textract confidence is presumably on a 0-100 scale while the
    other engines in this module likely report 0-1 -- confirm before mixing
    the scores.
    """
    try:
        response = textract_client.detect_document_text(Document={'Bytes': file_content})
        words = []
        for block in response['Blocks']:
            if block['BlockType'] != 'WORD':
                continue
            words.append((block['Text'], block['Geometry']['BoundingBox'], block['Confidence']))
        return words
    except Exception as e:
        logging.error(f"Error in AWS Textract: {str(e)}")
        return []
def extract_text_doctr(image, model):
    """Run a Doctr predictor on one image and flatten the words of page 0.

    Args:
        image: Image input wrapped in a one-element list for
            ``DocumentFile.from_images``.
        model: Callable Doctr predictor; called with the loaded document.

    Returns:
        A list of ``(value, geometry, confidence)`` tuples for every word on
        the first page, in block -> line -> word order. Returns ``[]`` on any
        error (the error is logged).
    """
    try:
        document = DocumentFile.from_images([image])
        first_page = model(document).pages[0]
        words = []
        for block in first_page.blocks:
            for line in block.lines:
                for word in line.words:
                    words.append((word.value, word.geometry, word.confidence))
        return words
    except Exception as e:
        logging.error(f"Error in Doctr OCR: {str(e)}")
        return []
def extract_text_easyocr(image, reader):
    """Run an EasyOCR reader on an image and reorder its detections.

    Args:
        image: Image convertible with ``np.array``.
        reader: Object exposing ``readtext(ndarray)``.

    Returns:
        A list of ``(text, box, confidence)`` tuples built from each
        detection's items 1, 0 and 2 respectively. Returns ``[]`` on any
        error (the error is logged).
    """
    try:
        pixels = np.array(image)
        words = []
        for detection in reader.readtext(pixels):
            # Detections arrive as (box, text, confidence); emit text first.
            words.append((detection[1], detection[0], detection[2]))
        return words
    except Exception as e:
        logging.error(f"Error in EasyOCR: {str(e)}")
        return []
def extract_text_paddleocr(image, reader):
    """Run a PaddleOCR reader on an image and flatten the first page's lines.

    Args:
        image: Image input passed unchanged to ``reader.ocr``.
        reader: Object exposing ``ocr(image, cls=True)``.

    Returns:
        A list of ``(text, box, confidence)`` tuples, where each result line
        is read as ``[box, (text, confidence)]``. Returns ``[]`` when nothing
        was detected or on any error (errors are logged).
    """
    try:
        result = reader.ocr(image, cls=True)
        # A page without any detected text comes back as None (or the result
        # list is empty). That is a normal outcome, not a failure, so return
        # early instead of letting iteration raise and log a spurious error.
        if not result or result[0] is None:
            return []
        return [(line[1][0], line[0], line[1][1]) for line in result[0]]
    except Exception as e:
        logging.error(f"Error in PaddleOCR: {str(e)}")
        return []
def bbox_to_polygon(bbox):
    """Convert one of the supported bounding-box layouts to a ``Polygon``.

    Supported layouts:
        * dict with ``Left``/``Top``/``Width``/``Height`` keys (AWS format);
        * sequence of four list/tuple points (EasyOCR format);
        * sequence of two corner points, top-left then bottom-right
          (Doctr format).

    Raises:
        ValueError: If ``bbox`` matches none of the layouts above.
    """
    if isinstance(bbox, dict):  # AWS format
        left, top = bbox['Left'], bbox['Top']
        right = left + bbox['Width']
        bottom = top + bbox['Height']
        return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])

    point_count = len(bbox)
    if point_count == 4 and all(isinstance(p, (list, tuple)) for p in bbox):  # EasyOCR format
        return Polygon(bbox)

    if point_count == 2:  # Doctr format
        x, y = bbox[0][0], bbox[0][1]
        # Keep the width/height round trip so corner values are computed
        # with the same arithmetic as before.
        w = bbox[1][0] - bbox[0][0]
        h = bbox[1][1] - bbox[0][1]
        return Polygon([(x, y), (x+w, y), (x+w, y+h), (x, y+h)])

    raise ValueError(f"Unsupported bbox format: {bbox}")
def combine_ocr_results(results, weights):
    """Merge word detections from several OCR engines into one string.

    Each detection's confidence is multiplied by its engine's weight. The
    candidates are then consumed in insertion order: the head candidate is
    grouped with every remaining candidate whose polygon intersects it, and
    the highest-scoring word of that group is kept (the earliest wins ties).

    Args:
        results: Mapping of engine name to an iterable of
            ``(word, bbox, confidence)`` tuples.
        weights: Mapping of engine name to a numeric weight.

    Returns:
        The kept words joined by single spaces (``''`` when there are none).
        Detections whose bbox cannot be converted, whose confidence is not
        numeric, or whose engine has no weight are logged and skipped.
    """
    candidates = []
    for method, words in results.items():
        for word, bbox, confidence in words:
            try:
                polygon = bbox_to_polygon(bbox)
                candidates.append((word, polygon, float(confidence) * weights[method]))
            except Exception as e:
                logging.error(f"Error processing word '{word}' from {method}: {str(e)}")

    final_words = []
    while candidates:
        current = candidates[0]
        # Split the rest in a single pass instead of repeated list.remove()
        # calls, which rescanned the list and matched entries by equality.
        overlapping = []
        remaining = []
        for candidate in candidates[1:]:
            if current[1].intersects(candidate[1]):
                overlapping.append(candidate)
            else:
                remaining.append(candidate)
        # With no overlaps the group is just `current`, so max() returns it.
        best_word = max([current] + overlapping, key=lambda x: x[2])
        final_words.append(best_word[0])
        candidates = remaining
    return ' '.join(final_words)
def detect_language(text):
    """Return the language code ``langid`` assigns to ``text``.

    Returns:
        The first element of ``langid.classify(text)``, or ``'en'`` when
        classification raises (the error is logged).
    """
    try:
        code, _score = langid.classify(text)
    except Exception as e:
        logging.error(f"Error in language detection: {str(e)}")
        return 'en'  # Default to English
    return code