Nassiraaa committed on
Commit
436d369
·
verified ·
1 Parent(s): 4ad4d2b

Update ocr_functions.py

Browse files
Files changed (1) hide show
  1. ocr_functions.py +71 -55
ocr_functions.py CHANGED
@@ -1,59 +1,75 @@
1
- from dotenv import load_dotenv
2
- import io
3
- import boto3
4
- from paddleocr import PaddleOCR
5
- import os
6
- import pytesseract
7
- from PIL import ImageFilter
8
- import numpy as np
9
 
10
def textract_ocr(image, box):
    """Run AWS Textract on a cropped region of ``image`` and return its text.

    Args:
        image: Image object supporting ``crop``, ``convert`` and ``save``
            (presumably a ``PIL.Image.Image`` -- confirm against callers).
        box: ``(x1, y1, x2, y2)`` rectangle to crop before recognition.

    Returns:
        The text of every ``LINE`` block in the Textract response, each
        followed by a newline; an empty string when there are none.
    """
    # Credentials are read from the environment, optionally populated from .env.
    load_dotenv()
    x1, y1, x2, y2 = box
    cropped_image = image.crop((x1, y1, x2, y2)).convert("L")

    # Textract receives the document as raw bytes, so the crop is serialised
    # to PNG in memory instead of being written to disk.
    buffer = io.BytesIO()
    cropped_image.save(buffer, format='PNG')

    client = boto3.client(
        'textract',
        region_name='eu-west-3',
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )
    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})

    # Only LINE blocks contribute; every line keeps its trailing newline.
    return "".join(
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )
33
 
34
def paddle_ocr(image, box):
    """Run PaddleOCR on a cropped region of ``image`` and return its text.

    Args:
        image: Image object supporting ``crop`` whose result converts to a
            NumPy array (presumably a ``PIL.Image.Image`` -- confirm).
        box: ``(x1, y1, x2, y2)`` rectangle to crop before recognition.

    Returns:
        Recognised lines joined with newlines, ordered top-to-bottom and then
        left-to-right by the first corner of each detection box; an empty
        string when nothing was detected.
    """
    x1, y1, x2, y2 = box
    cropped_image = np.array(image.crop((x1, y1, x2, y2)))

    # NOTE(review): a new PaddleOCR engine is constructed on every call;
    # consider reusing one instance if this turns out to be a hot path.
    ocr = PaddleOCR(use_angle_cls=False, lang='latin')
    result = ocr.ocr(cropped_image, cls=False)

    # result holds one entry per input image; that entry is None when no text
    # was found in the image.
    if not result or result[0] is None:
        return ""

    # Sort the detected lines themselves (not the outer per-image list).
    # Each entry is assumed to be [box_points, (text, confidence)] with
    # box_points[0] == [x, y] -- verify against the installed PaddleOCR version.
    lines = sorted(result[0], key=lambda line: (line[0][0][1], line[0][0][0]))
    return "\n".join(line[1][0] for line in lines)
45
 
46
def tesseract_ocr(image, box):
    """Run Tesseract on a cropped, binarised region of ``image``.

    Args:
        image: Image object supporting ``crop``, ``convert``, ``filter`` and
            ``point`` (presumably a ``PIL.Image.Image`` -- confirm).
        box: ``(x1, y1, x2, y2)`` rectangle to crop before recognition.

    Returns:
        The string produced by ``pytesseract.image_to_string`` with
        ``--psm 6``.
    """
    x1, y1, x2, y2 = box
    grayscale = image.crop((x1, y1, x2, y2)).convert("L")

    # NOTE(review): the previous version derived a DPI-based scale factor
    # (target 300 DPI) but never applied it, so no resampling took place.
    # That dead computation is removed; behaviour is unchanged. Add an
    # explicit resize here if upscaling is actually wanted.

    # Median filter to suppress speckle, then a hard threshold at 180:
    # brighter pixels become white (255), everything else black (0).
    denoised = grayscale.filter(ImageFilter.MedianFilter())
    binarized_image = denoised.point(lambda p: 255 if p > 180 else 0)

    return pytesseract.image_to_string(binarized_image, config="--psm 6")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from hf_utils import get_ai_response
4
+ from cv_prompt import get_personal_info_prompt
5
+ from cv_quality import CV
 
 
 
6
 
7
# Load the scoring data once at import time.
# The path is relative, so it is resolved against the current working directory.
# JSON is UTF-8 by specification; naming the encoding avoids locale-dependent
# decoding on platforms whose default is not UTF-8.
with open('personal_info_scores.json', 'r', encoding='utf-8') as f:
    score_data = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: Text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        The first matching address as a string, or ``None`` when there is no
        match.
    """
    if not text:
        return None
    # The top-level domain is letters only. A '|' inside the character class
    # would be a literal pipe (not alternation) and let "a@b.com|foo" match
    # as a single address.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group(0) if match else None
 
 
 
 
 
 
 
15
 
16
def extract_phone(text):
    """Return the first North-American-style phone number found in ``text``.

    Recognised shape: an optional ``+1``/``1`` prefix, a three-digit area code
    (optionally parenthesised), then three and four digits, with optional
    ``-``, ``.`` or whitespace separators. Other national formats are not
    matched.

    Args:
        text: Text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        The first matching number as written in ``text``, or ``None`` when
        there is no match.
    """
    if not text:
        return None
    # A leading \b cannot sit in front of '(' or '+', which made
    # "(555) 123-4567" unmatchable and dropped the '+' of "+1 ...".
    # The lookbehind only requires that the match does not start mid-word.
    phone_pattern = (
        r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    )
    match = re.search(phone_pattern, text)
    return match.group(0) if match else None
20
+
21
def _location_field_present(value):
    """Return True when a parsed location field carries any truthy content."""
    if isinstance(value, dict):
        return any(value.values())
    return bool(value)


def extract_location(text):
    """Ask the AI helper whether ``text`` mentions a city and a country.

    The prompt is built with ``get_personal_info_prompt`` and sent through
    ``get_ai_response``; the reply is expected to be a JSON object with
    ``city`` and ``country`` entries.

    Args:
        text: CV text passed to the prompt builder.

    Returns:
        A ``(city_present, country_present)`` tuple of booleans. Both are
        ``False`` when there is no reply, the reply is not valid JSON, or the
        JSON is not an object.
    """
    prompt = get_personal_info_prompt(text)
    response = get_ai_response([{"role": "user", "content": prompt}])
    if not response:
        return False, False

    try:
        location_data = json.loads(response)
    except json.JSONDecodeError:
        print("Failed to parse JSON from response")
        return False, False

    # Model output is not guaranteed to follow the expected schema: tolerate
    # a non-object top level and scalar/null fields instead of raising
    # AttributeError on .get()/.values().
    if not isinstance(location_data, dict):
        return False, False

    city_present = _location_field_present(location_data.get('city'))
    country_present = _location_field_present(location_data.get('country'))
    return city_present, country_present
41
+
42
def calculate_score(email_exists, phone_exists, city_exists, country_exists):
    """Add up the ``score_data`` values of the fields that are present.

    Args:
        email_exists: Truthy when an e-mail address was found.
        phone_exists: Truthy when a phone number was found.
        city_exists: Truthy when a city was found.
        country_exists: Truthy when a country was found.

    Returns:
        The sum of ``score_data[key]`` over the present fields, or ``0`` when
        none is present.
    """
    presence_by_key = (
        ('email', email_exists),
        ('phone', phone_exists),
        ('city', city_exists),
        ('country', country_exists),
    )
    # Only present fields are looked up, in the same fixed order.
    return sum(score_data[key] for key, present in presence_by_key if present)
53
+
54
def analyze_personal_info(file_path):
    """Report which personal-information fields a CV contains, with a score.

    Args:
        file_path: Path handed to ``CV`` to load the document.

    Returns:
        A dict with the presence flags ``"email"``, ``"phone"``, ``"city"``
        and ``"country"``, plus ``"personal_info_score"`` as computed by
        ``calculate_score``.
    """
    cv_text = CV(file_path).get_cv_text()

    has_email = extract_email(cv_text) is not None
    has_phone = extract_phone(cv_text) is not None
    has_city, has_country = extract_location(cv_text)

    return {
        "email": has_email,
        "phone": has_phone,
        "city": has_city,
        "country": has_country,
        "personal_info_score": calculate_score(
            has_email, has_phone, has_city, has_country
        ),
    }