Delete personal_info_extractor.py
personal_info_extractor.py  +0 -140
DELETED
@@ -1,140 +0,0 @@
-import re
-import json
-import logging
-from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
-from cv_prompt import get_location_prompt
-from ocr_utils import combine_ocr_results, extract_text_aws, extract_text_doctr, extract_text_easyocr, extract_text_paddleocr, load_models, detect_language
-from config import weights
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-def load_model():
-    try:
-        model_path = hf_hub_download("TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf")
-        return Llama(model_path=model_path, n_ctx=32768, n_gpu_layers=2)
-    except Exception as e:
-        logging.error(f"Error loading model: {str(e)}")
-        return None
-
-llm = load_model()
-
-with open('personal_info_scores.json', 'r') as f:
-    personal_info_scores = json.load(f)
-
-def is_valid_email(email):
-    parts = email.split('@')
-    return len(parts) == 2 and parts[1].lower() == 'gmail.com'
-
-def clean_phone_number(phone):
-    cleaned = ''.join(char for char in phone if char.isdigit() or char == '+')
-    if cleaned.startswith('00'):
-        cleaned = '+' + cleaned[2:]
-    if not cleaned.startswith('+'):
-        cleaned = '+' + cleaned
-    return cleaned if 7 <= len(cleaned) <= 15 else None
-
-def extract_email_and_phone(text):
-    gmail_regex = r'\b[A-Za-z0-9._%+-]+@gmail\.com\b'
-    phone_regex = r'''(?x)
-        (?:
-            (?:(?:\+|00)(?:\d{1,3})[\s.-]?)?
-            (?:
-                (?:\(?\d{1,4}\)?[\s.-]?){1,3}
-                \d{3,4}[\s.-]?\d{3,4}
-            )
-            |
-            (?:\d{3,4}[\s.-]?){2,3}\d{3,4}
-        )
-    '''
-    email_matches = re.findall(gmail_regex, text, re.IGNORECASE)
-    phone_matches = re.findall(phone_regex, text)
-
-    valid_emails = [e for e in email_matches if is_valid_email(e)]
-    valid_phones = [clean_phone_number(p) for p in phone_matches if clean_phone_number(p)]
-
-    email = valid_emails[0] if valid_emails else None
-    phone = valid_phones[0] if valid_phones else None
-
-    return email, phone
-
-def extract_location(text):
-    if llm is None:
-        logging.error("LLM model not loaded")
-        return None, None
-
-    location_prompt = get_location_prompt(text)
-    try:
-        output = llm(location_prompt, max_tokens=100)
-        generated_text = output['choices'][0]['text'].strip()
-
-        if "Not found" in generated_text:
-            return None, None
-
-        city_match = re.search(r'City:\s*(.+?),', generated_text)
-        country_match = re.search(r'Country:\s*(.+)', generated_text)
-
-        city = city_match.group(1) if city_match else None
-        country = country_match.group(1) if country_match else None
-
-        return city, country
-    except Exception as e:
-        logging.error(f"Error extracting location: {str(e)}")
-        return None, None
-
-def analyze_personal_info(file_path):
-    try:
-        # Extract text using OCR
-        with open(file_path, 'rb') as f:
-            file_content = f.read()
-
-        # Detect language
-        detected_language = detect_language(file_content)
-
-        # Load OCR models
-        doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)
-
-        # Extract text using different OCR methods
-        results = {
-            "aws": extract_text_aws(file_content),
-            "doctr": extract_text_doctr(file_path, doctr_model),
-            "easyocr": extract_text_easyocr(file_path, easyocr_reader),
-            "paddleocr": extract_text_paddleocr(file_path, paddleocr_reader),
-        }
-
-        # Combine OCR results
-        text = combine_ocr_results(results, weights)
-
-        # Extract personal information
-        city, country = extract_location(text)
-        email, phone = extract_email_and_phone(text)
-
-        # Calculate score
-        score = 0
-        if email:
-            score += personal_info_scores['email']
-        if phone:
-            score += personal_info_scores['phone']
-        if city:
-            score += personal_info_scores['city']
-        if country:
-            score += personal_info_scores['country']
-
-        return {
-            "email": email,
-            "phone": phone,
-            "city": city,
-            "country": country,
-            "score_personal_information": score
-        }
-    except Exception as e:
-        logging.error(f"Error in personal info analysis: {str(e)}")
-        return {
-            "email": None,
-            "phone": None,
-            "city": None,
-            "country": None,
-            "score_personal_information": 0,
-            "error": str(e)
-        }
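For context, the deleted module's entry point was analyze_personal_info(file_path), which OCRs the file, extracts email, phone, city and country, and sums per-field scores from personal_info_scores.json. A minimal usage sketch follows; it assumes the module and its dependencies (ocr_utils, cv_prompt, config, personal_info_scores.json) are still available locally, and the input file name is hypothetical.

# Hedged usage sketch, not part of the original commit.
# Assumes personal_info_extractor.py and its dependencies are importable;
# "sample_cv.png" is a hypothetical input file readable by the OCR backends.
import json

from personal_info_extractor import analyze_personal_info

result = analyze_personal_info("sample_cv.png")
# The returned dict contains email, phone, city, country and
# "score_personal_information", the sum of the per-field scores
# defined in personal_info_scores.json (keys: email, phone, city, country).
print(json.dumps(result, indent=2))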