Spaces:

capitaletech
/

cv_quality

Sleeping

App Files Files Community

Nassiraaa committed on Aug 7, 2024

Commit

dc649a8

verified ·

1 Parent(s): 6d4869c

Update ocr_functions.py

Browse files

Files changed (1) hide show

ocr_functions.py +59 -69

ocr_functions.py CHANGED Viewed

@@ -1,75 +1,65 @@
-import json
-import re
-from hf_utils import get_ai_response
-from cv_prompt import get_personal_info_prompt
-from cv_quality import CV
-# Per-field point values consumed by calculate_score; read once when the module is imported.
-with open('personal_info_scores.json', 'r') as scores_file:
-    score_data = json.load(scores_file)
-def extract_email(text):
-    """Return the first e-mail address found in ``text``, or ``None`` if there is none."""
-    # The top-level-domain class is [A-Za-z]; spelling it [A-Z|a-z] would also
-    # accept a literal '|' and let strings such as "user@host.c|m" through.
-    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
-    match = re.search(email_pattern, text)
-    return match.group(0) if match else None
-def extract_phone(text):
-    """Return the first North-American-style phone number in ``text``, or ``None``.
-    Accepted shapes: an optional ``+1``/``1`` prefix, a three-digit area code with
-    or without parentheses, then 3 + 4 digits, separated by '-', '.', whitespace
-    or nothing.
-    """
-    # A leading \b cannot match in front of '(' or '+' when the previous character
-    # is whitespace or the start of the string, so "(555) 123-4567" would be missed.
-    # The look-behind only requires that the number is not glued to a preceding
-    # word character.
-    phone_pattern = r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
-    match = re.search(phone_pattern, text)
-    return match.group(0) if match else None
-def extract_location(text):
-    """Ask the AI helper whether ``text`` mentions a city and a country.
-    Returns a ``(city_present, country_present)`` pair of booleans; both are
-    ``False`` when the helper returns nothing or its reply is not valid JSON.
-    """
-    prompt = get_personal_info_prompt(text)
-    reply = get_ai_response([{"role": "user", "content": prompt}])
-    if not reply:
-        return False, False
-    try:
-        parsed = json.loads(reply)
-    except json.JSONDecodeError:
-        print("Failed to parse JSON from response")
-        return False, False
-    # Each of 'city' / 'country' is read as a mapping and counts as present
-    # when at least one of its values is truthy.
-    has_city = any(parsed.get('city', {}).values())
-    has_country = any(parsed.get('country', {}).values())
-    return has_city, has_country
-def calculate_score(email_exists, phone_exists, city_exists, country_exists):
-    """Add up the ``score_data`` points of every personal-info field that is present."""
-    found = (
-        ('email', email_exists),
-        ('phone', phone_exists),
-        ('city', city_exists),
-        ('country', country_exists),
-    )
-    # Only fields flagged as present are looked up in score_data.
-    return sum(score_data[field] for field, present in found if present)
-def analyze_personal_info(file_path):
-    """Build the personal-info report for the CV stored at ``file_path``.
-    The report holds one boolean per field (email, phone, city, country) plus
-    the combined ``personal_info_score``.
-    """
-    cv_text = CV(file_path).get_cv_text()
-    has_email = extract_email(cv_text) is not None
-    has_phone = extract_phone(cv_text) is not None
-    has_city, has_country = extract_location(cv_text)
-    return {
-        "email": has_email,
-        "phone": has_phone,
-        "city": has_city,
-        "country": has_country,
-        "personal_info_score": calculate_score(has_email, has_phone, has_city, has_country),
-    }

+from dotenv import load_dotenv
+import io
+import boto3
+from paddleocr import PaddleOCR
+import os
+import pytesseract
+from PIL import ImageFilter
+import numpy as np
+def textract_ocr(image, box):
+    """Run AWS Textract on the ``box`` region of ``image`` and return the detected lines.
+    Args:
+        image: Image object exposing ``crop``/``convert``/``save`` (assumed to be a PIL image).
+        box: ``(x1, y1, x2, y2)`` crop rectangle.
+    Returns:
+        The text of every LINE block, each followed by a newline ("" if there are none).
+    """
+    load_dotenv()  # lets os.getenv below pick up values from a .env file
+    x1, y1, x2, y2 = box
+    grayscale_crop = image.crop((x1, y1, x2, y2)).convert("L")
+    # Textract receives the crop as in-memory PNG bytes.
+    buffer = io.BytesIO()
+    grayscale_crop.save(buffer, format='PNG')
+    client = boto3.client(
+        'textract',
+        region_name='eu-west-3',
+        aws_access_key_id=os.getenv("aws_access_key_id"),
+        aws_secret_access_key=os.getenv('aws_secret_access_key'),
+    )
+    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})
+    # Only LINE blocks contribute to the output; build it with a single join
+    # rather than repeated string concatenation.
+    return "".join(
+        block['Text'] + "\n"
+        for block in response['Blocks']
+        if block['BlockType'] == 'LINE'
+    )
+def paddle_ocr(image, box):
+    """Run PaddleOCR on the ``box`` region of ``image`` and return the recognised text.
+    Args:
+        image: Image object exposing ``crop`` (assumed to be a PIL image).
+        box: ``(x1, y1, x2, y2)`` crop rectangle.
+    Returns:
+        The recognised lines joined with newlines, or "" when nothing is detected.
+    """
+    x1, y1, x2, y2 = box
+    crop_pixels = np.array(image.crop((x1, y1, x2, y2)))
+    ocr = PaddleOCR(use_angle_cls=False, lang='latin')
+    result = ocr.ocr(crop_pixels, cls=False)
+    # result[0] carries the detections for the single input image; treat a
+    # missing, None or empty entry as "no text" instead of indexing into it.
+    if not result or not result[0]:
+        return ""
+    # Detections are kept in the order PaddleOCR returns them. Sorting the
+    # outer, one-element ``result`` list cannot reorder lines, so no sort is done.
+    # Each detection is (box, (text, confidence)); only the text is kept.
+    return "\n".join(detection[1][0] for detection in result[0])
+def tesseract_ocr(image, box):
+    """Run Tesseract on the ``box`` region of ``image`` and return the recognised text.
+    The crop is converted to grayscale, median-filtered and thresholded at 180
+    before being handed to Tesseract with ``--psm 6``.
+    Args:
+        image: Image object exposing ``crop`` (assumed to be a PIL image).
+        box: ``(x1, y1, x2, y2)`` crop rectangle.
+    """
+    x1, y1, x2, y2 = box
+    grayscale_crop = image.crop((x1, y1, x2, y2)).convert("L")
+    # NOTE(review): the crop is OCR'd at its native resolution; no DPI-based
+    # rescaling is applied. Confirm whether upscaling toward 300 DPI is wanted.
+    denoised = grayscale_crop.filter(ImageFilter.MedianFilter())
+    # Pixels brighter than 180 become white (255), everything else black (0).
+    binarized_image = denoised.point(lambda p: 255 if p > 180 else 0)
+    return pytesseract.image_to_string(binarized_image, config="--psm 6")