Nassiraaa commited on
Commit
dc649a8
·
verified ·
1 Parent(s): 6d4869c

Update ocr_functions.py

Browse files
Files changed (1) hide show
  1. ocr_functions.py +59 -69
ocr_functions.py CHANGED
@@ -1,75 +1,65 @@
1
- import json
2
- import re
3
- from hf_utils import get_ai_response
4
- from cv_prompt import get_personal_info_prompt
5
- from cv_quality import CV
 
 
 
6
 
7
# Load the per-field scoring weights once, at import time.
# The path is relative, so it is resolved against the current working
# directory. The encoding is pinned to UTF-8 so the JSON file is decoded
# the same way on every platform instead of using the locale default.
with open('personal_info_scores.json', 'r', encoding='utf-8') as f:
    score_data = json.load(f)
 
 
 
 
 
 
 
 
10
 
11
def extract_email(text):
    """Return the first e-mail address found in *text*, or ``None``.

    Args:
        text: The string to search.

    Returns:
        The first substring that looks like an e-mail address, or ``None``
        when there is no match.
    """
    # The top-level-domain class is ``[A-Za-z]``. A ``|`` inside a character
    # class is a literal pipe, not alternation, so the previous
    # ``[A-Z|a-z]`` wrongly accepted addresses such as ``a@example.co|uk``.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group(0) if match else None
 
 
 
 
 
 
15
 
16
def extract_phone(text):
    """Return the first North-American-style phone number in *text*, or ``None``.

    Accepted shapes are an optional ``+1``/``1`` prefix, a three-digit area
    code (optionally parenthesised), then three and four digits, with
    optional ``-``, ``.`` or whitespace separators between the groups.

    Args:
        text: The string to search.

    Returns:
        The first matching substring, or ``None`` when there is no match.
    """
    # A plain leading ``\b`` can never hold in front of ``(`` or ``+`` when
    # they start a token (both sides are non-word characters), so numbers
    # such as "(555) 123-4567" were missed entirely and "+1 ..." lost its
    # "+". ``(?:\b|(?<!\w))`` accepts every start position ``\b`` accepted
    # and additionally allows a start that is not preceded by a word char.
    phone_pattern = (
        r'(?:\b|(?<!\w))'
        r'(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    )
    match = re.search(phone_pattern, text)
    return match.group(0) if match else None
20
 
21
def extract_location(text):
    """Ask the AI helper whether *text* mentions a city and a country.

    The prompt is built by ``get_personal_info_prompt`` and sent through
    ``get_ai_response`` as a single user message. The reply is parsed as
    JSON; a field counts as present when it is an object with at least one
    truthy value.
    NOTE(review): the exact reply schema is defined by the prompt, which is
    not visible here -- confirm that ``city``/``country`` are objects.

    Args:
        text: The CV text to analyse.

    Returns:
        A ``(city_present, country_present)`` tuple of booleans. Both are
        ``False`` when there is no response or it is not valid JSON.
    """
    def _has_value(field):
        # A missing key, ``null`` or a scalar is treated as "absent" instead
        # of raising AttributeError on ``.values()``.
        return isinstance(field, dict) and any(field.values())

    prompt = get_personal_info_prompt(text)
    messages = [{"role": "user", "content": prompt}]
    response = get_ai_response(messages)

    if not response:
        return False, False

    try:
        location_data = json.loads(response)
    except json.JSONDecodeError:
        print("Failed to parse JSON from response")
        return False, False

    # Valid JSON that is not an object (list, string, number) carries no
    # usable fields; fall back the same way as a parse failure.
    if not isinstance(location_data, dict):
        return False, False

    city_present = _has_value(location_data.get('city'))
    country_present = _has_value(location_data.get('country'))
    return city_present, country_present
41
 
42
def calculate_score(email_exists, phone_exists, city_exists, country_exists):
    """Add up the configured weight of every personal-info field that is present.

    Weights are read from the module-level ``score_data`` mapping under the
    keys ``'email'``, ``'phone'``, ``'city'`` and ``'country'``. A weight is
    looked up only when its flag is truthy.

    Args:
        email_exists: Truthy when an e-mail address was found.
        phone_exists: Truthy when a phone number was found.
        city_exists: Truthy when a city was found.
        country_exists: Truthy when a country was found.

    Returns:
        The sum of the weights of the present fields (``0`` when none are).
    """
    flags_by_field = (
        ('email', email_exists),
        ('phone', phone_exists),
        ('city', city_exists),
        ('country', country_exists),
    )
    total = 0
    for field, present in flags_by_field:
        if present:
            total += score_data[field]
    return total
53
 
54
def analyze_personal_info(file_path):
    """Report which personal-info fields a CV contains, with their score.

    The CV text is obtained via ``CV(file_path).get_cv_text()``, then
    checked for an e-mail address, a phone number, a city and a country.

    Args:
        file_path: Path handed to the ``CV`` constructor.

    Returns:
        A dict with boolean entries ``"email"``, ``"phone"``, ``"city"`` and
        ``"country"``, plus ``"personal_info_score"`` as computed by
        ``calculate_score``.
    """
    text = CV(file_path).get_cv_text()

    has_email = extract_email(text) is not None
    has_phone = extract_phone(text) is not None
    has_city, has_country = extract_location(text)

    return {
        "email": has_email,
        "phone": has_phone,
        "city": has_city,
        "country": has_country,
        "personal_info_score": calculate_score(
            has_email, has_phone, has_city, has_country
        ),
    }
 
1
+ from dotenv import load_dotenv
2
+ import io
3
+ import boto3
4
+ from paddleocr import PaddleOCR
5
+ import os
6
+ import pytesseract
7
+ from PIL import ImageFilter
8
+ import numpy as np
9
 
10
def textract_ocr(image, box):
    """Run AWS Textract on a cropped region of *image* and return its text.

    The region is converted to greyscale, encoded as PNG in memory and sent
    to Textract's ``detect_document_text``. Credentials are read from the
    ``aws_access_key_id`` / ``aws_secret_access_key`` environment variables
    after ``load_dotenv()`` has run; the region is fixed to ``eu-west-3``.

    Args:
        image: An image object exposing ``crop``, ``convert`` and ``save``
            (presumably a ``PIL.Image.Image`` -- confirm against callers).
        box: A 4-item ``(x1, y1, x2, y2)`` crop rectangle.

    Returns:
        The text of every ``LINE`` block in response order, each followed by
        a newline. An empty string when no line was detected.
    """
    load_dotenv()
    x1, y1, x2, y2 = box
    cropped_image = image.crop((x1, y1, x2, y2)).convert("L")

    buffer = io.BytesIO()
    cropped_image.save(buffer, format='PNG')

    client = boto3.client(
        'textract',
        region_name='eu-west-3',
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )
    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})

    # Only LINE blocks are kept. The per-line confidence map that used to be
    # built here was never read, so it has been dropped.
    return "".join(
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )
32
 
 
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
# Lazily created PaddleOCR engine, shared across calls so the model is not
# reloaded for every cropped region.
_PADDLE_ENGINE = None


def _get_paddle_engine():
    """Return the shared PaddleOCR engine, creating it on first use."""
    global _PADDLE_ENGINE
    if _PADDLE_ENGINE is None:
        _PADDLE_ENGINE = PaddleOCR(use_angle_cls=False, lang='latin')
    return _PADDLE_ENGINE


def paddle_ocr(image, box):
    """Run PaddleOCR on a cropped region of *image* and return its text.

    Args:
        image: An image object exposing ``crop`` (presumably a
            ``PIL.Image.Image`` -- confirm against callers).
        box: A 4-item ``(x1, y1, x2, y2)`` crop rectangle.

    Returns:
        The recognised lines joined with newlines, ordered top-to-bottom
        and then left-to-right by the first corner of each detection box.
        An empty string when nothing was detected.
    """
    x1, y1, x2, y2 = box
    region = np.array(image.crop((x1, y1, x2, y2)))
    result = _get_paddle_engine().ocr(region, cls=False)

    # ``result`` holds one entry per input image; that entry is None when
    # no text was found.
    if not result or result[0] is None:
        return ""

    # Each detection is ``[box_points, (text, confidence)]``. The sort has to
    # run on the detections of the first (only) image. Sorting ``result``
    # itself, as before, only reordered the one-element outer list and left
    # the lines in detection order.
    detections = sorted(
        result[0],
        key=lambda line: (line[0][0][1], line[0][0][0]),
    )
    return "\n".join(line[1][0] for line in detections)
46
 
47
+
48
+
49
def tesseract_ocr(image, box):
    """Run Tesseract on a cropped, denoised and binarised region of *image*.

    The region is converted to greyscale, smoothed with a median filter,
    thresholded at 180 (brighter pixels become 255, the rest 0) and passed
    to ``pytesseract.image_to_string`` with ``--psm 6``.

    Args:
        image: An image object exposing ``crop`` (presumably a
            ``PIL.Image.Image`` -- confirm against callers).
        box: A 4-item ``(x1, y1, x2, y2)`` crop rectangle.

    Returns:
        The text recognised by Tesseract.
    """
    x1, y1, x2, y2 = box
    grayscale = image.crop((x1, y1, x2, y2)).convert("L")

    # NOTE(review): an earlier revision computed a 300-DPI scale factor from
    # the image's ``dpi`` metadata but never applied it. That dead
    # computation could raise (e.g. ZeroDivisionError on a DPI of 0), so it
    # was removed. If upscaling is wanted, it has to be added as a resize.
    denoised = grayscale.filter(ImageFilter.MedianFilter())
    binarized_image = denoised.point(lambda p: 255 if p > 180 else 0)
    return pytesseract.image_to_string(binarized_image, config="--psm 6")