Delete personal_info_extractor.py
personal_info_extractor.py  +0 -140
DELETED
@@ -1,140 +0,0 @@
-import re
-import json
-import logging
-from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
-from cv_prompt import get_location_prompt
-from ocr_utils import combine_ocr_results, extract_text_aws, extract_text_doctr, extract_text_easyocr, extract_text_paddleocr, load_models, detect_language
-from config import weights
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-def load_model():
-    try:
-        model_path = hf_hub_download("TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf")
-        return Llama(model_path=model_path, n_ctx=32768, n_gpu_layers=2)
-    except Exception as e:
-        logging.error(f"Error loading model: {str(e)}")
-        return None
-
-llm = load_model()
-
-with open('personal_info_scores.json', 'r') as f:
-    personal_info_scores = json.load(f)
-
-def is_valid_email(email):
-    parts = email.split('@')
-    return len(parts) == 2 and parts[1].lower() == 'gmail.com'
-
-def clean_phone_number(phone):
-    cleaned = ''.join(char for char in phone if char.isdigit() or char == '+')
-    if cleaned.startswith('00'):
-        cleaned = '+' + cleaned[2:]
-    if not cleaned.startswith('+'):
-        cleaned = '+' + cleaned
-    return cleaned if 7 <= len(cleaned) <= 15 else None
-
-def extract_email_and_phone(text):
-    gmail_regex = r'\b[A-Za-z0-9._%+-]+@gmail\.com\b'
-    phone_regex = r'''(?x)
-        (?:
-            (?:(?:\+|00)(?:\d{1,3})[\s.-]?)?
-            (?:
-                (?:\(?\d{1,4}\)?[\s.-]?){1,3}
-                \d{3,4}[\s.-]?\d{3,4}
-            )
-            |
-            (?:\d{3,4}[\s.-]?){2,3}\d{3,4}
-        )
-    '''
-    email_matches = re.findall(gmail_regex, text, re.IGNORECASE)
-    phone_matches = re.findall(phone_regex, text)
-
-    valid_emails = [e for e in email_matches if is_valid_email(e)]
-    valid_phones = [clean_phone_number(p) for p in phone_matches if clean_phone_number(p)]
-
-    email = valid_emails[0] if valid_emails else None
-    phone = valid_phones[0] if valid_phones else None
-
-    return email, phone
-
-def extract_location(text):
-    if llm is None:
-        logging.error("LLM model not loaded")
-        return None, None
-
-    location_prompt = get_location_prompt(text)
-    try:
-        output = llm(location_prompt, max_tokens=100)
-        generated_text = output['choices'][0]['text'].strip()
-
-        if "Not found" in generated_text:
-            return None, None
-
-        city_match = re.search(r'City:\s*(.+?),', generated_text)
-        country_match = re.search(r'Country:\s*(.+)', generated_text)
-
-        city = city_match.group(1) if city_match else None
-        country = country_match.group(1) if country_match else None
-
-        return city, country
-    except Exception as e:
-        logging.error(f"Error extracting location: {str(e)}")
-        return None, None
-
-def analyze_personal_info(file_path):
-    try:
-        # Extract text using OCR
-        with open(file_path, 'rb') as f:
-            file_content = f.read()
-
-        # Detect language
-        detected_language = detect_language(file_content)
-
-        # Load OCR models
-        doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)
-
-        # Extract text using different OCR methods
-        results = {
-            "aws": extract_text_aws(file_content),
-            "doctr": extract_text_doctr(file_path, doctr_model),
-            "easyocr": extract_text_easyocr(file_path, easyocr_reader),
-            "paddleocr": extract_text_paddleocr(file_path, paddleocr_reader),
-        }
-
-        # Combine OCR results
-        text = combine_ocr_results(results, weights)
-
-        # Extract personal information
-        city, country = extract_location(text)
-        email, phone = extract_email_and_phone(text)
-
-        # Calculate score
-        score = 0
-        if email:
-            score += personal_info_scores['email']
-        if phone:
-            score += personal_info_scores['phone']
-        if city:
-            score += personal_info_scores['city']
-        if country:
-            score += personal_info_scores['country']
-
-        return {
-            "email": email,
-            "phone": phone,
-            "city": city,
-            "country": country,
-            "score_personal_information": score
-        }
-    except Exception as e:
-        logging.error(f"Error in personal info analysis: {str(e)}")
-        return {
-            "email": None,
-            "phone": None,
-            "city": None,
-            "country": None,
-            "score_personal_information": 0,
-            "error": str(e)
-        }
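For context, the deleted module's entry point was analyze_personal_info(file_path), which OCRs the file, extracts email, phone, city and country, and sums per-field scores from personal_info_scores.json. A minimal usage sketch follows; it assumes the module and its dependencies (ocr_utils, cv_prompt, config, personal_info_scores.json) are still available locally, and the input file name is hypothetical.

# Hedged usage sketch, not part of the original commit.
# Assumes personal_info_extractor.py and its dependencies are importable;
# "sample_cv.png" is a hypothetical input file readable by the OCR backends.
import json

from personal_info_extractor import analyze_personal_info

result = analyze_personal_info("sample_cv.png")
# The returned dict contains email, phone, city, country and
# "score_personal_information", the sum of the per-field scores
# defined in personal_info_scores.json (keys: email, phone, city, country).
print(json.dumps(result, indent=2))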