Nassiraaa committed on
Commit
3e52ceb
·
verified ·
1 Parent(s): 6f9afdd

Delete personal_info_extractor.py

Browse files
Files changed (1) hide show
  1. personal_info_extractor.py +0 -140
personal_info_extractor.py DELETED
@@ -1,140 +0,0 @@
1
- import re
2
- import json
3
- import logging
4
- from huggingface_hub import hf_hub_download
5
- from llama_cpp import Llama
6
- from cv_prompt import get_location_prompt
7
- from ocr_utils import combine_ocr_results, extract_text_aws, extract_text_doctr, extract_text_easyocr, extract_text_paddleocr, load_models, detect_language
8
- from config import weights
9
-
10
- # Configure logging
11
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
-
13
def load_model(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=32768,
    n_gpu_layers=2,
):
    """Fetch a GGUF model file from the Hugging Face Hub and load it with Llama.

    The defaults reproduce the previously hard-coded configuration, so calling
    ``load_model()`` with no arguments behaves exactly as before.

    Args:
        repo_id: Hub repository that hosts the model file.
        filename: Name of the model file inside the repository.
        n_ctx: Value forwarded to ``Llama(n_ctx=...)``.
        n_gpu_layers: Value forwarded to ``Llama(n_gpu_layers=...)``.

    Returns:
        The ``Llama`` instance, or ``None`` if the download or the load raised.
    """
    try:
        model_path = hf_hub_download(repo_id, filename=filename)
        return Llama(model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers)
    except Exception as e:
        # Best-effort: callers check for None instead of handling an exception.
        logging.error(f"Error loading model: {str(e)}")
        return None
20
-
21
# Load the model once at import time; `llm` is None when loading failed.
llm = load_model()

# Score contribution of each extracted field, looked up by field name
# ('email', 'phone', 'city', 'country') when a result is scored.
# JSON is read as UTF-8 explicitly instead of relying on the platform default.
with open('personal_info_scores.json', 'r', encoding='utf-8') as f:
    personal_info_scores = json.load(f)
25
-
26
def is_valid_email(email):
    """Return True when *email* is a Gmail address with a non-empty local part.

    Only addresses containing exactly one ``@`` whose domain is ``gmail.com``
    (case-insensitive) are accepted. Anything that is not a string, or that
    has nothing before the ``@``, is rejected.

    Args:
        email: Candidate address.

    Returns:
        bool: True if the address is accepted, False otherwise.
    """
    if not isinstance(email, str):
        return False
    # Splitting at the first '@' leaves any further '@' in the domain, which
    # then cannot equal 'gmail.com' -- so multi-'@' input is still rejected.
    local_part, _, domain = email.partition('@')
    return bool(local_part) and domain.lower() == 'gmail.com'
29
-
30
def clean_phone_number(phone):
    """Normalise a raw phone string to ``+<digits>`` form.

    All characters other than digits are dropped. A leading ``00`` prefix is
    treated as the international prefix and replaced by ``+``. A ``+`` is
    always prepended to the result.

    Args:
        phone: Raw phone text, e.g. ``"+33 6 12 34 56 78"``.

    Returns:
        The normalised number, or ``None`` when the input is empty or the
        normalised string (including the leading ``+``) is not 7 to 15
        characters long.
    """
    if not phone:
        return None

    kept = ''.join(ch for ch in phone if ch.isdigit() or ch == '+')
    has_plus_prefix = kept.startswith('+')
    # A '+' is only meaningful as a prefix; drop any that appear elsewhere so
    # the result never looks like '+12+345' or '++33...'.
    digits = kept.replace('+', '')
    if not has_plus_prefix and digits.startswith('00'):
        digits = digits[2:]

    cleaned = '+' + digits
    # NOTE(review): the bounds count the leading '+', so at most 14 digits are
    # accepted -- confirm whether 15-digit numbers should be allowed.
    return cleaned if 7 <= len(cleaned) <= 15 else None
37
-
38
- def extract_email_and_phone(text):
39
- gmail_regex = r'\b[A-Za-z0-9._%+-]+@gmail\.com\b'
40
- phone_regex = r'''(?x)
41
- (?:
42
- (?:(?:\+|00)(?:\d{1,3})[\s.-]?)?
43
- (?:
44
- (?:\(?\d{1,4}\)?[\s.-]?){1,3}
45
- \d{3,4}[\s.-]?\d{3,4}
46
- )
47
- |
48
- (?:\d{3,4}[\s.-]?){2,3}\d{3,4}
49
- )
50
- '''
51
- email_matches = re.findall(gmail_regex, text, re.IGNORECASE)
52
- phone_matches = re.findall(phone_regex, text)
53
-
54
- valid_emails = [e for e in email_matches if is_valid_email(e)]
55
- valid_phones = [clean_phone_number(p) for p in phone_matches if clean_phone_number(p)]
56
-
57
- email = valid_emails[0] if valid_emails else None
58
- phone = valid_phones[0] if valid_phones else None
59
-
60
- return email, phone
61
-
62
def extract_location(text):
    """Ask the language model for the city and country mentioned in *text*.

    The prompt is built by ``get_location_prompt``. The model's reply is
    searched for ``City: ...`` and ``Country: ...`` fields. The city value ends
    at the first comma, end of line, or ``Country:`` label, so both
    ``City: X, Country: Y`` and one-field-per-line replies are understood.

    Args:
        text: Source text handed to ``get_location_prompt``.

    Returns:
        tuple: ``(city, country)``. Both are ``None`` when the model is not
        loaded, the reply contains ``"Not found"``, or the model call raises;
        each is ``None`` individually when its field is missing or empty.
    """
    if llm is None:
        logging.error("LLM model not loaded")
        return None, None

    location_prompt = get_location_prompt(text)
    try:
        output = llm(location_prompt, max_tokens=100)
        generated_text = output['choices'][0]['text'].strip()

        if "Not found" in generated_text:
            return None, None

        # The lookahead stops the city at a comma, a line end, or the next
        # label, rather than requiring a trailing comma.
        city_match = re.search(
            r'City:\s*(.+?)\s*(?=,|$|Country:)', generated_text, re.MULTILINE
        )
        country_match = re.search(r'Country:\s*(.+)', generated_text)

        city = city_match.group(1).strip(' \t,') if city_match else None
        country = country_match.group(1).strip() if country_match else None

        # Normalise empty captures to None so callers can rely on truthiness.
        return city or None, country or None
    except Exception as e:
        logging.error(f"Error extracting location: {str(e)}")
        return None, None
85
-
86
def analyze_personal_info(file_path):
    """Run OCR on a file and extract scored personal information from it.

    The file is read as bytes, its language is detected, the OCR models are
    loaded for that language, and the four OCR outputs are merged with
    ``combine_ocr_results``. Location, email and phone are then extracted from
    the merged text, and each field that was found adds its entry from
    ``personal_info_scores`` to the score.

    Args:
        file_path: Path of the file to analyse.

    Returns:
        dict: Keys ``email``, ``phone``, ``city``, ``country`` and
        ``score_personal_information``. If anything raises, all fields are
        ``None``, the score is 0, and an ``error`` key holds the message.
    """
    try:
        with open(file_path, 'rb') as source:
            raw_bytes = source.read()

        language = detect_language(raw_bytes)
        doctr_model, easyocr_reader, paddleocr_reader = load_models(language)

        # Engines are invoked in the same order as the result keys.
        ocr_outputs = {}
        ocr_outputs["aws"] = extract_text_aws(raw_bytes)
        ocr_outputs["doctr"] = extract_text_doctr(file_path, doctr_model)
        ocr_outputs["easyocr"] = extract_text_easyocr(file_path, easyocr_reader)
        ocr_outputs["paddleocr"] = extract_text_paddleocr(file_path, paddleocr_reader)

        merged_text = combine_ocr_results(ocr_outputs, weights)

        city, country = extract_location(merged_text)
        email, phone = extract_email_and_phone(merged_text)

        extracted = {
            "email": email,
            "phone": phone,
            "city": city,
            "country": country,
        }

        # Each field that was found contributes its configured score.
        total = 0
        for field, value in extracted.items():
            if value:
                total += personal_info_scores[field]

        return {**extracted, "score_personal_information": total}
    except Exception as e:
        logging.error(f"Error in personal info analysis: {str(e)}")
        failure = dict.fromkeys(("email", "phone", "city", "country"))
        failure["score_personal_information"] = 0
        failure["error"] = str(e)
        return failure