bangaboy committed on
Commit
d9f2dff
·
verified ·
1 Parent(s): d340145

Create resume_parser.py

Browse files
Files changed (1) hide show
  1. resume_parser.py +126 -0
resume_parser.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF for PDF text extraction
2
+ import re
3
+ import spacy
4
+ from transformers import pipeline
5
+ from docx import Document
6
+ import dateparser
7
+ from datetime import datetime
8
+ from nltk.corpus import words
9
+ from models.load_models import nlp_spacy, nlp_ner
10
+
11
# Set of word strings taken from the NLTK `words` corpus, built once at import time.
english_words = set(words.words())
13
+
14
+ # Function to refine ORG entities
15
def refine_org_entities(entities):
    """Filter candidate organisation names down to likely company names.

    An entity is kept when it ends with a known company suffix (e.g. ``Inc``,
    ``LLC``) or when it begins with two capitalised words separated by a
    single whitespace character.

    Args:
        entities: Iterable of candidate organisation name strings.

    Returns:
        A list of the retained names, de-duplicated, in the order in which
        they were first yielded by ``entities``.
    """
    company_suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
    two_capitalised_words = re.compile(r'([A-Z][a-z]+)\s([A-Z][a-z]+)')

    # A dict keeps insertion order, so the result is stable from run to run;
    # converting a set of strings to a list is not (hash randomisation).
    refined = {}
    for entity in entities:
        # str.endswith accepts a tuple, so one call covers every suffix.
        if entity.endswith(company_suffixes) or two_capitalised_words.match(entity):
            refined[entity] = None
    return list(refined)
25
+
26
+ # Function to extract ORG entities using NER
27
def extract_orgs(text):
    """Run the NER pipeline over ``text`` and return the refined ORG names.

    Every NER result whose ``entity_group`` is ``'ORG'`` contributes its
    ``word``; the distinct words are then filtered by
    ``refine_org_entities``.
    """
    org_words = {
        item['word']
        for item in nlp_ner(text)
        if item['entity_group'] == 'ORG'
    }
    return refine_org_entities(org_words)
34
+
35
+ # Function to extract text from PDF
36
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: File-like object; its full contents are obtained with
            ``pdf_file.read()`` and parsed from memory.

    Returns:
        The text of all pages joined in page order (an empty string when no
        page yields any text).
    """
    # The context manager closes the document even if extraction fails
    # part-way through, releasing the in-memory PDF promptly.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
43
+
44
+ # Function to extract text from DOCX
45
def extract_text_from_doc(doc_file):
    """Return the paragraph texts of a Word document, one paragraph per line."""
    paragraphs = Document(doc_file).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
49
+
50
+ # Function to extract experience
51
def extract_experience(doc):
    """Estimate years of experience from the DATE entities of ``doc``.

    Each entity labelled ``DATE`` is handed to ``dateparser.parse``; for every
    one that parses, the difference between the current year and the parsed
    year is recorded. The largest such difference is returned, never less
    than 0 (which is also the result when nothing parses).
    """
    year_gaps = [0]  # the 0 floor mirrors the initial value of the running maximum
    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        parsed = dateparser.parse(ent.text)
        if not parsed:
            continue
        year_gaps.append(datetime.now().year - parsed.year)
    return max(year_gaps)
59
+
60
+ # Function to extract phone numbers
61
def extract_phone(text):
    """Return the first North-American-style phone number found in ``text``.

    Recognised shapes are an optional ``+1``/``1`` country code, a three-digit
    area code (optionally wrapped in parentheses), then three and four digits,
    with ``-``, ``.`` or one whitespace character allowed between the groups.

    Args:
        text: The text to search.

    Returns:
        The matched number exactly as it appears in ``text``, or the string
        ``"Not found"`` when there is no match.
    """
    phone_pattern = (
        # A leading \b can never sit in front of "(" or "+", which made
        # "(555) 123-4567" unmatchable and dropped the "+" of "+1 ...".
        # A negative look-behind only forbids a preceding word character.
        r'(?<!\w)'
        r'(?:\+?1[-.\s]?)?'            # optional country code
        r'(?:\(\d{3}\)|\d{3})[-.\s]?'  # area code, with or without parentheses
        r'\d{3}[-.\s]?\d{4}\b'         # subscriber number, not followed by more digits
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
71
+
72
+ # Function to extract email addresses
73
def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: The text to search.

    Returns:
        The matched address, or the string ``"Not found"`` when there is no
        match.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so the
    # top-level-domain class must be [A-Za-z]; otherwise "a@b.com|" would be
    # returned with the trailing pipe.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
77
+
78
+ # Function to extract colleges
79
def extract_colleges(doc):
    """Return the educational institutions named in ``doc``.

    An entity counts as an institution when its label is ``ORG`` and its text
    contains, case-insensitively, one of the keywords "university",
    "college", "institute" or "school".

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each have
            ``label_`` and ``text`` attributes.

    Returns:
        A list of the distinct matching entity texts, in order of first
        appearance.
    """
    edu_keywords = ("university", "college", "institute", "school")

    # A dict de-duplicates while keeping first-seen order, so the result is
    # reproducible (list(set_of_str) order changes between interpreter runs).
    colleges = {}
    for ent in doc.ents:
        if ent.label_ != "ORG":
            continue
        lowered = ent.text.lower()  # lower-case once, not once per keyword
        if any(keyword in lowered for keyword in edu_keywords):
            colleges[ent.text] = None
    return list(colleges)
86
+
87
+ # Function to extract LinkedIn profile
88
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in ``text``.

    Matches ``linkedin.com/in/<slug>`` with an optional single sub-domain
    (e.g. ``www.``), an optional trailing slash, and an optional ``http://``,
    ``https://`` or ``//`` prefix.

    Args:
        text: The text to search.

    Returns:
        The matched URL exactly as it appears in ``text``, or the string
        ``"Not found"`` when there is no match.
    """
    linkedin_pattern = (
        # Either an explicit scheme / "//" prefix, or no prefix at all as long
        # as the host is not glued to a preceding word (so "notlinkedin.com"
        # is rejected). Resumes commonly print the URL without a scheme.
        r'(?:(?:https?:)?//|(?<![\w.-]))'
        r'(?:\w+\.)?linkedin\.com/in/'
        # [A-Za-z], not [A-z]: the latter also spans "[", "\", "]", "^", "_"
        # and "`", letting stray punctuation leak into the slug.
        r'[A-Za-z0-9_-]+/?'
    )
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
92
+
93
+ # Main function to extract resume data
94
def extract_resume_data(uploaded_file):
    """Parse an uploaded resume and return its key fields plus the raw text.

    The text after the last ``.`` of ``uploaded_file.name`` (lower-cased)
    selects the reader: ``pdf`` uses ``extract_text_from_pdf``; ``docx`` and
    ``doc`` use ``extract_text_from_doc``.

    Returns:
        A ``(fields, resume_text)`` tuple, where ``fields`` is a dict keyed by
        "Years of Experience", "Companies Worked For", "Phone Number",
        "Email ID", "Colleges Attended" and "LinkedIn ID".

    Raises:
        ValueError: If the extension is not supported, or the extracted text
            is empty or whitespace only.
    """
    extension = uploaded_file.name.rpartition('.')[2].lower()

    # Pick the text extractor from the file extension.
    if extension == 'pdf':
        resume_text = extract_text_from_pdf(uploaded_file)
    elif extension in ('docx', 'doc'):
        resume_text = extract_text_from_doc(uploaded_file)
    else:
        raise ValueError("Unsupported file format.")

    if not resume_text.strip():
        raise ValueError("The resume appears to be empty.")

    # The SpaCy document feeds the entity-based extractors below.
    doc = nlp_spacy(resume_text)

    # Organisation extraction runs first, ahead of the per-field extractors.
    org_names = extract_orgs(resume_text)
    fields = {
        "Years of Experience": extract_experience(doc),
        "Companies Worked For": ", ".join(org_names),
        "Phone Number": extract_phone(resume_text),
        "Email ID": extract_email(resume_text),
        "Colleges Attended": ", ".join(extract_colleges(doc)),
        "LinkedIn ID": extract_linkedin(resume_text),
    }
    return fields, resume_text