Spaces:

capitaletech
/

cv_quality

Sleeping

App Files Files Community

Nassiraaa committed on Aug 7, 2024

Commit

436d369

verified ·

1 Parent(s): 4ad4d2b

Update ocr_functions.py

Browse files

Files changed (1) hide show

ocr_functions.py +71 -55

ocr_functions.py CHANGED Viewed

@@ -1,59 +1,75 @@
-from dotenv import load_dotenv
-import io
-import boto3
-from paddleocr import PaddleOCR
-import os
-import pytesseract
-from PIL import ImageFilter
-import numpy as np
-def textract_ocr(image, box):
-    load_dotenv()
-    x1, y1, x2, y2 = box
-    cropped_image = image.crop((x1, y1, x2, y2))
-    cropped_image = cropped_image.convert("L")
-    img_bytes = io.BytesIO()
-    cropped_image.save(img_bytes, format='PNG')
-    img_bytes = img_bytes.getvalue()
-    client = boto3.client('textract',
-                          region_name='eu-west-3',
-                          aws_access_key_id=os.getenv("aws_access_key_id"),
-                          aws_secret_access_key=os.getenv('aws_secret_access_key'))
-    response = client.detect_document_text(Document={'Bytes': img_bytes})
-    blocks = response['Blocks']
-    texttract = ""
-    line_confidence = {}
-    for block in blocks:
-        if(block['BlockType'] == 'LINE'):
-            line_confidence[block['Text']] = block['Confidence']
-            texttract += block['Text'] + "\n"
-    return texttract
-def paddle_ocr(image, box):
-    x1, y1, x2, y2 = box
-    cropped_image = image.crop((x1, y1, x2, y2))
-    cropped_image = np.array(cropped_image)
-    ocr = PaddleOCR(use_angle_cls=False, lang='latin')
-    result = ocr.ocr(cropped_image, cls=False)
-    text = ""
-    if result[0] is not None:
-        result.sort(key=lambda x: (x[0][0][1], x[0][0][0]))
-        text = [x[1][0] for x in result[0]]
-    return "\n".join(text)
-def tesseract_ocr(image, box):
-    target_dpi = 300
-    x1, y1, x2, y2 = box
-    cropped_image = image.crop((x1, y1, x2, y2))
-    cropped_image = cropped_image.convert("L")
-    current_dpi = cropped_image.info['dpi'][0] if 'dpi' in image.info else None
-    if current_dpi:
-        scale_factor = target_dpi / current_dpi
     else:
-        scale_factor = 1.0
-    binarized_image = cropped_image.filter(ImageFilter.MedianFilter())
-    binarized_image = binarized_image.point(lambda p: p > 180 and 255)
-    text = pytesseract.image_to_string(binarized_image, config="--psm 6")
-    return text

+import json
+import re
+from hf_utils import get_ai_response
+from cv_prompt import get_personal_info_prompt
+from cv_quality import CV
+# Load the scoring data once, at import time.
+# calculate_score() indexes this mapping with the keys 'email', 'phone',
+# 'city' and 'country' and adds the stored values together.
+# NOTE: the path is relative, so it is resolved against the current working
+# directory of the process, not against this module's location.
+# The encoding is pinned so parsing does not depend on the host locale.
+with open('personal_info_scores.json', 'r', encoding='utf-8') as f:
+    score_data = json.load(f)
+def extract_email(text):
+    """Return the first email address found in ``text``, or ``None``.
+
+    Args:
+        text: Free text to scan. ``None`` or an empty string yields ``None``.
+
+    Returns:
+        The first substring that looks like ``local@domain.tld`` (TLD of at
+        least two ASCII letters), or ``None`` when nothing matches.
+    """
+    if not text:
+        return None
+    # The TLD class is [A-Za-z]. Writing it as [A-Z|a-z] would also accept a
+    # literal '|', because '|' is not an alternation operator inside [...].
+    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+    first_match = re.search(email_pattern, text)
+    return first_match.group(0) if first_match else None
+def extract_phone(text):
+    """Return the first phone number found in ``text``, or ``None``.
+
+    The pattern accepts ten-digit numbers written as 3-3-4 groups, with an
+    optional ``+1``/``1`` prefix, optional parentheses around the first group,
+    and ``-``, ``.`` or whitespace as separators.
+
+    Args:
+        text: Free text to scan. ``None`` or an empty string yields ``None``.
+
+    Returns:
+        The matched substring exactly as it appears in ``text`` (including a
+        leading ``+`` or ``(`` when present), or ``None`` when nothing matches.
+    """
+    if not text:
+        return None
+    # A leading \b cannot match in front of '(' or '+' when the previous
+    # character is whitespace or the start of the string, so numbers such as
+    # "(555) 123-4567" were never found. The lookbehind only requires that the
+    # match does not start in the middle of a word or a longer digit run.
+    # NOTE(review): other national formats are not covered by this pattern.
+    phone_pattern = r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
+    first_match = re.search(phone_pattern, text)
+    return first_match.group(0) if first_match else None
+def _location_field_present(location_data, key):
+    """Return True when ``location_data[key]`` carries any truthy value."""
+    if not isinstance(location_data, dict):
+        # Valid JSON that is not an object (list, string, number, null).
+        return False
+    field = location_data.get(key)
+    if isinstance(field, dict):
+        # An object-valued field counts as present if any of its entries is truthy.
+        return any(field.values())
+    # The response may carry a bare value (string, list, null) instead of an
+    # object; calling .values() on it would raise AttributeError.
+    return bool(field)
+def extract_location(text):
+    """Ask the AI helper whether ``text`` mentions a city and a country.
+
+    The prompt is built with ``get_personal_info_prompt`` and sent through
+    ``get_ai_response`` as a single user message. The reply is parsed as JSON
+    and its ``'city'`` and ``'country'`` entries are inspected.
+
+    Args:
+        text: CV text passed to the prompt builder.
+
+    Returns:
+        A ``(city_present, country_present)`` tuple of booleans. Both are
+        ``False`` when the reply is empty or is not valid JSON; a field is
+        ``False`` when it is missing, empty, or the JSON is not an object.
+    """
+    prompt = get_personal_info_prompt(text)
+    messages = [{"role": "user", "content": prompt}]
+    response = get_ai_response(messages)
+    if not response:
+        return False, False
+    try:
+        location_data = json.loads(response)
+    except json.JSONDecodeError:
+        print("Failed to parse JSON from response")
+        return False, False
+    city_present = _location_field_present(location_data, 'city')
+    country_present = _location_field_present(location_data, 'country')
+    return city_present, country_present
+def calculate_score(email_exists, phone_exists, city_exists, country_exists):
+    """Add up the configured score of every personal-info field that is present.
+
+    Each flag is tested for truthiness. The matching entry of the module-level
+    ``score_data`` mapping is only looked up when its flag is set.
+
+    Returns:
+        The sum of the selected ``score_data`` values, or ``0`` when no flag
+        is set.
+    """
+    field_flags = (
+        ('email', email_exists),
+        ('phone', phone_exists),
+        ('city', city_exists),
+        ('country', country_exists),
+    )
+    return sum(score_data[field] for field, present in field_flags if present)
+def analyze_personal_info(file_path):
+    """Check which personal details a CV contains and score them.
+
+    The CV text is obtained from ``CV(file_path).get_cv_text()`` and passed to
+    ``extract_email``, ``extract_phone`` and ``extract_location``; the four
+    resulting flags are then scored with ``calculate_score``.
+
+    Args:
+        file_path: Location of the CV, handed to the ``CV`` constructor.
+
+    Returns:
+        A dict with boolean entries ``"email"``, ``"phone"``, ``"city"`` and
+        ``"country"``, plus ``"personal_info_score"`` holding the score.
+    """
+    cv_text = CV(file_path).get_cv_text()
+    has_email = extract_email(cv_text) is not None
+    has_phone = extract_phone(cv_text) is not None
+    has_city, has_country = extract_location(cv_text)
+    return {
+        "email": has_email,
+        "phone": has_phone,
+        "city": has_city,
+        "country": has_country,
+        "personal_info_score": calculate_score(has_email, has_phone, has_city, has_country),
+    }