File size: 4,133 Bytes
d9f2dff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import fitz  # PyMuPDF for PDF text extraction
import re
import spacy
from transformers import pipeline
from docx import Document
import dateparser
from datetime import datetime
from nltk.corpus import words
from models.load_models import nlp_spacy, nlp_ner

# NLTK words
# Vocabulary of English words from the NLTK corpus.  Not referenced by any
# function visible in this file — presumably consumed elsewhere or left over
# from earlier filtering logic; verify before removing.  NOTE(review): this
# requires the NLTK "words" corpus to be downloaded, or it raises at import.
english_words = set(words.words())

# Function to refine ORG entities
def refine_org_entities(entities):
    """Filter raw NER ORG strings down to likely company names.

    Keeps an entity when it either ends with a known corporate suffix
    (Inc, LLC, ...) or matches a "Word Word" two-capitalized-words shape.

    Args:
        entities: iterable of candidate organization strings.

    Returns:
        Sorted list of unique refined names.  (Fix: the original returned
        ``list(set)``, so the output order — and the joined
        "Companies Worked For" string built from it — was nondeterministic.)
    """
    company_suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
    # Compiled once per call; matches e.g. "Acme Systems" at the string start.
    two_capitalized_words = re.compile(r'([A-Z][a-z]+)\s([A-Z][a-z]+)')

    refined = set()
    for entity in entities:
        # str.endswith accepts a tuple — one call instead of an any() scan.
        if entity.endswith(company_suffixes) or two_capitalized_words.match(entity):
            refined.add(entity)
    return sorted(refined)

# Function to extract ORG entities using NER
def extract_orgs(text):
    """Run the transformer NER pipeline over *text* and return refined ORG names.

    Feeds every entity the pipeline tags with group 'ORG' through
    refine_org_entities and returns its result.
    """
    detected = {
        result['word']
        for result in nlp_ner(text)
        if result['entity_group'] == 'ORG'
    }
    return refine_org_entities(detected)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract plain text from every page of an uploaded PDF.

    Args:
        pdf_file: file-like object positioned at the start of a PDF stream.

    Returns:
        The concatenated text of all pages.

    Fix: the original never closed the MuPDF document; the context manager
    releases its native resources even if text extraction raises.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # Iterating a Document yields its pages in order.
        return "".join(page.get_text() for page in doc)

# Function to extract text from DOCX
def extract_text_from_doc(doc_file):
    """Extract the text of a Word document as newline-joined paragraphs."""
    document = Document(doc_file)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return '\n'.join(paragraph_texts)

# Function to extract experience
def extract_experience(doc):
    """Estimate years of experience from DATE entities in a spaCy doc.

    Parses each DATE entity with dateparser and returns the largest
    calendar-year gap between a parsed date and now (0 if none parse).
    NOTE(review): this is a rough heuristic — any old date in the resume
    (e.g. a graduation year) inflates the figure; confirm acceptable.
    """
    years = 0
    for entity in doc.ents:
        if entity.label_ != "DATE":
            continue
        parsed = dateparser.parse(entity.text)
        if parsed is not None:
            years = max(years, datetime.now().year - parsed.year)
    return years

# Function to extract phone numbers
def extract_phone(text):
    """Return the first US-style phone number found in *text*.

    Handles an optional +1/1 country code, parenthesized or bare area
    codes, and '-', '.', or space separators.

    Returns:
        The matched number string, or "Not found".

    Fix: the original pattern began with ``\\b`` before ``\\(`` — a word
    boundary never exists between whitespace (or start-of-string) and
    '(', so numbers written like "(555) 123-4567" in normal prose were
    missed.  The boundary now sits only on the bare-digits branch.  The
    original second pattern was a strict subset of the first and has
    been folded in.
    """
    phone_pattern = re.compile(
        r'(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\b\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    )
    match = phone_pattern.search(text)
    return match.group() if match else "Not found"

# Function to extract email addresses
def extract_email(text):
    """Return the first email address found in *text*, or "Not found".

    Fix: the original TLD character class was ``[A-Z|a-z]`` — inside a
    class ``|`` is a literal pipe, so strings like "a@b.c|" could match.
    Corrected to ``[A-Za-z]``.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"

# Function to extract colleges
def extract_colleges(doc):
    """Collect ORG entities that look like educational institutions.

    An ORG entity qualifies when its lowercased text contains one of a
    small set of education-related keywords.  Returns a de-duplicated list.
    """
    edu_keywords = ("university", "college", "institute", "school")
    found = {
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG"
        and any(keyword in ent.text.lower() for keyword in edu_keywords)
    }
    return list(found)

# Function to extract LinkedIn profile
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in *text*, or "Not found".

    Fix: the original slug class ``[A-z0-9_-]`` spans the ASCII range
    between 'Z' and 'a' and therefore also matched '[', ']', '^', '`'
    and backslash; corrected to ``[A-Za-z0-9_-]``.  The scheme/'//'
    prefix is now optional so bare "linkedin.com/in/slug" references are
    also found (backward compatible: every URL the original matched
    still matches).
    """
    linkedin_pattern = (
        r'(?:(?:https?:)?\/\/)?(?:\w+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    )
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"

# Main function to extract resume data
def extract_resume_data(uploaded_file):
    """Parse an uploaded resume (PDF or Word) into a summary dict.

    Args:
        uploaded_file: file-like upload with a ``name`` attribute and a
            readable binary stream.

    Returns:
        Tuple of (summary dict keyed by display labels, raw resume text).

    Raises:
        ValueError: for an unsupported extension or an empty document.
    """
    extension = uploaded_file.name.rsplit('.', 1)[-1].lower()

    # Dispatch on extension to the matching text extractor.
    if extension == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif extension in ('docx', 'doc'):
        # NOTE(review): python-docx reads .docx only — a legacy binary .doc
        # will likely fail inside extract_text_from_doc; confirm callers.
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")

    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # One SpaCy pass feeds both the experience and college extractors.
    spacy_doc = nlp_spacy(resume_text)

    summary = {
        "Years of Experience": extract_experience(spacy_doc),
        "Companies Worked For": ", ".join(extract_orgs(resume_text)),
        "Phone Number": extract_phone(resume_text),
        "Email ID": extract_email(resume_text),
        "Colleges Attended": ", ".join(extract_colleges(spacy_doc)),
        "LinkedIn ID": extract_linkedin(resume_text),
    }
    return summary, resume_text