# resume_analyser/resume_parser.py
import fitz # PyMuPDF for PDF text extraction
import re
import spacy
from transformers import pipeline
from docx import Document
import dateparser
from datetime import datetime
from nltk.corpus import words
from models.load_models import nlp_spacy, nlp_ner
# NLTK words
english_words = set(words.words())
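# Note: nlp_spacy and nlp_ner are provided by models/load_models.py, which is
# not part of this file. Judging from how they are used below, a minimal
# sketch of that module would look roughly like the following (the model names
# are assumptions, not confirmed by this repository):
#
#     import spacy
#     from transformers import pipeline
#
#     nlp_spacy = spacy.load("en_core_web_sm")
#     nlp_ner = pipeline("ner", model="dslim/bert-base-NER",
#                        aggregation_strategy="simple")  # results expose 'entity_group'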
# Function to refine ORG entities
def refine_org_entities(entities):
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']
    for entity in entities:
        # Keep entities ending in a common company suffix
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        # Otherwise keep entities shaped like two capitalised words (e.g. "Acme Systems")
        elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
            refined_entities.add(entity)
    return list(refined_entities)
# Function to extract ORG entities using NER
def extract_orgs(text):
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(orgs)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text
# Function to extract text from DOCX (python-docx reads .docx; legacy binary .doc files are not supported)
def extract_text_from_doc(doc_file):
    doc = Document(doc_file)
    text = '\n'.join([para.text for para in doc.paragraphs])
    return text
# Function to extract experience (rough estimate: years elapsed since the earliest parseable DATE entity)
def extract_experience(doc):
    experience = 0
    for ent in doc.ents:
        if ent.label_ == "DATE":
            date = dateparser.parse(ent.text)
            if date:
                experience = max(experience, datetime.now().year - date.year)
    return experience
# Function to extract phone numbers
def extract_phone(text):
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"
# Function to extract email addresses
def extract_email(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
# Function to extract colleges
def extract_colleges(doc):
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)
# Function to extract LinkedIn profile
def extract_linkedin(text):
    linkedin_pattern = r'(?:https?:\/\/)?(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
# Main function to extract resume data
def extract_resume_data(uploaded_file):
    file_ext = uploaded_file.name.split('.')[-1].lower()
    # Extract text based on file type
    if file_ext == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ['docx', 'doc']:
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")
    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")
    # Process the resume text using SpaCy
    doc = nlp_spacy(resume_text)
    # Extract required information
    companies = extract_orgs(resume_text)
    experience = extract_experience(doc)
    phone = extract_phone(resume_text)
    email = extract_email(resume_text)
    colleges = extract_colleges(doc)
    linkedin = extract_linkedin(resume_text)
    return {
        "Years of Experience": experience,
        "Companies Worked For": ", ".join(companies),
        "Phone Number": phone,
        "Email ID": email,
        "Colleges Attended": ", ".join(colleges),
        "LinkedIn ID": linkedin
    }, resume_text
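# Usage sketch (not part of the original module): extract_resume_data expects a
# file-like object exposing .name and .read(), e.g. a Streamlit UploadedFile or
# a file handle opened in binary mode. The path below is a placeholder.
if __name__ == "__main__":
    with open("sample_resume.pdf", "rb") as resume_file:
        parsed, raw_text = extract_resume_data(resume_file)
        for field, value in parsed.items():
            print(f"{field}: {value}")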