import fitz  # PyMuPDF for PDF text extraction
import re
import spacy
from transformers import pipeline
from docx import Document
import dateparser
from datetime import datetime
from nltk.corpus import words
from models.load_models import nlp_spacy, nlp_ner

# NLTK words
english_words = set(words.words())


# Function to refine ORG entities
def refine_org_entities(entities):
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']
    for entity in entities:
        # Keep entities that end with a recognizable company suffix
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        # Otherwise keep multi-word capitalized names (e.g. "Acme Systems")
        elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
            refined_entities.add(entity)
    return list(refined_entities)


# Function to extract ORG entities using NER
def extract_orgs(text):
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(orgs)


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text


# Function to extract text from DOCX
def extract_text_from_doc(doc_file):
    doc = Document(doc_file)
    text = '\n'.join([para.text for para in doc.paragraphs])
    return text


# Function to estimate years of experience from DATE entities
def extract_experience(doc):
    experience = 0
    for ent in doc.ents:
        if ent.label_ == "DATE":
            date = dateparser.parse(ent.text)
            if date:
                experience = max(experience, datetime.now().year - date.year)
    return experience


# Function to extract phone numbers
def extract_phone(text):
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"


# Function to extract email addresses
def extract_email(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"


# Function to extract colleges
def extract_colleges(doc):
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)


# Function to extract LinkedIn profile (URL scheme is optional)
def extract_linkedin(text):
    linkedin_pattern = r'(?:https?:\/\/)?(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"


# Main function to extract resume data
def extract_resume_data(uploaded_file):
    file_ext = uploaded_file.name.split('.')[-1].lower()

    # Extract text based on file type
    if file_ext == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ['docx', 'doc']:
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")

    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # Process the resume text using spaCy
    doc = nlp_spacy(resume_text)

    # Extract required information
    companies = extract_orgs(resume_text)
    experience = extract_experience(doc)
    phone = extract_phone(resume_text)
    email = extract_email(resume_text)
    colleges = extract_colleges(doc)
    linkedin = extract_linkedin(resume_text)

    return {
        "Years of Experience": experience,
        "Companies Worked For": ", ".join(companies),
        "Phone Number": phone,
        "Email ID": email,
        "Colleges Attended": ", ".join(colleges),
        "LinkedIn ID": linkedin
    }, resume_text
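

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of driving extract_resume_data from a script. It assumes
# the uploaded-file object only needs a `.name` attribute and a `.read()`
# method (as with Streamlit's UploadedFile); the `sample_resume.pdf` path and
# the BytesIO wrapper below are hypothetical stand-ins for that object.
if __name__ == "__main__":
    import io

    with open("sample_resume.pdf", "rb") as f:
        upload = io.BytesIO(f.read())
    upload.name = "sample_resume.pdf"  # extract_resume_data reads the extension from .name

    parsed, raw_text = extract_resume_data(upload)
    for field, value in parsed.items():
        print(f"{field}: {value}")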