# NOTE(review): the original first lines here ("Spaces:" / "Sleeping" / "Sleeping")
# were web-page scrape residue (Hugging Face Spaces status text), not program code.
# Standard library
import re
from datetime import datetime

# Third-party
import dateparser
import fitz  # PyMuPDF for PDF text extraction
import spacy
from docx import Document
from nltk.corpus import words
from transformers import pipeline

# Project-local pre-loaded models (spaCy pipeline + transformers NER pipeline)
from models.load_models import nlp_spacy, nlp_ner

# NLTK English vocabulary.
# NOTE(review): currently unused by the functions in this file — confirm before removing.
english_words = set(words.words())
# Keep only ORG entities that look like real company names
_COMPANY_SUFFIXES = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
_TWO_CAPITALISED = re.compile(r'([A-Z][a-z]+)\s([A-Z][a-z]+)')


def refine_org_entities(entities):
    """Filter raw ORG entity strings down to likely company names.

    An entity is kept when it ends with a common corporate suffix
    (Inc, LLC, Corp, ...) or when it begins with two capitalised words.
    Returns a de-duplicated list (order unspecified, as before).
    """
    kept = {
        name
        for name in entities
        if name.endswith(_COMPANY_SUFFIXES) or _TWO_CAPITALISED.match(name)
    }
    return list(kept)
def extract_orgs(text):
    """Run the transformer NER pipeline over *text* and return refined ORG names.

    Collects the ``word`` of every entity the pipeline groups as ``ORG``,
    then filters the set through :func:`refine_org_entities`.
    """
    org_names = {
        item['word']
        for item in nlp_ner(text)
        if item['entity_group'] == 'ORG'
    }
    return refine_org_entities(org_names)
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    *pdf_file* is a binary file-like object (e.g. a Streamlit upload);
    its full contents are read into memory and opened with PyMuPDF.

    Fix: the fitz Document is now closed deterministically via a context
    manager — the original leaked the open document on every call.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # Iterating a fitz Document yields its pages in order.
        return "".join(page.get_text() for page in doc)
def extract_text_from_doc(doc_file):
    """Extract the text of a DOCX file as newline-joined paragraphs.

    *doc_file* is a file-like object accepted by ``docx.Document``.
    """
    document = Document(doc_file)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
def extract_experience(doc):
    """Estimate years of experience from DATE entities in a spaCy doc.

    Each DATE entity is parsed with ``dateparser``; the result is the
    largest gap in calendar years between now and any parseable date
    (0 when nothing parses). Coarse by design: only the year is used.
    """
    current_year = datetime.now().year
    years = 0
    for entity in doc.ents:
        if entity.label_ != "DATE":
            continue
        parsed = dateparser.parse(entity.text)
        if parsed:
            years = max(years, current_year - parsed.year)
    return years
def extract_phone(text):
    """Return the first phone-number-looking substring of *text*, or "Not found".

    Tries a North-American pattern (optional +1 prefix, optional
    parenthesised area code) first, then a plain ddd-ddd-dddd fallback.
    """
    patterns = (
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            return found.group()
    return "Not found"
def extract_email(text):
    """Return the first e-mail address found in *text*, or "Not found".

    Fix: the TLD character class was ``[A-Z|a-z]`` — inside a class ``|``
    is a literal, so addresses like ``x@y.co|m`` matched in full. It is
    now ``[A-Za-z]``.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_colleges(doc):
    """Collect ORG entities from a spaCy doc that mention an education keyword.

    An ORG entity qualifies when its text (case-insensitively) contains
    "university", "college", "institute", or "school". Returns a
    de-duplicated list (order unspecified, as before).
    """
    keywords = ("university", "college", "institute", "school")
    found = {
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and any(kw in ent.text.lower() for kw in keywords)
    }
    return list(found)
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in *text*, or "Not found".

    Fix: the slug class was ``[A-z]``, which spans ASCII 65–122 and so
    wrongly admitted ``[ \\ ] ^ _ `` characters; it is now ``[A-Za-z]``.
    The scheme is optional but the ``//`` is required, as before.
    """
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
def extract_resume_data(uploaded_file):
    """Parse an uploaded resume (PDF or DOC/DOCX) and pull out key fields.

    Returns a ``(summary_dict, raw_text)`` tuple where the dict holds
    experience, companies, phone, email, colleges, and LinkedIn fields.

    Raises ValueError for an unsupported file extension or an empty
    document.
    """
    extension = uploaded_file.name.split('.')[-1].lower()

    # Pick the text extractor by file extension.
    if extension == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif extension in ('docx', 'doc'):
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")

    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # One spaCy pass shared by the entity-based extractors.
    doc = nlp_spacy(resume_text)

    summary = {
        "Years of Experience": extract_experience(doc),
        "Companies Worked For": ", ".join(extract_orgs(resume_text)),
        "Phone Number": extract_phone(resume_text),
        "Email ID": extract_email(resume_text),
        "Colleges Attended": ", ".join(extract_colleges(doc)),
        "LinkedIn ID": extract_linkedin(resume_text),
    }
    return summary, resume_text