Prernas19 committed on
Commit
66dea93
·
verified ·
1 Parent(s): 2928099

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -0
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spacy
3
+ import re
4
+ import pdfplumber
5
+ import docx
6
+ import nltk
7
+ from nltk.corpus import words
8
+
9
# Load the spaCy model once at import time; the entity/sentence based
# extractors in this file share it through the module-level name ``nlp``.
nlp = spacy.load("en_core_web_sm")

# Fetch the NLTK "words" corpus (quiet=True suppresses the download log)
# before reading it, then keep the vocabulary as a set for membership tests.
nltk.download('words', quiet=True)
english_words = set(words.words())
15
+
16
def extract_text(file):
    """Return the text content of an uploaded resume.

    Args:
        file: Either a filesystem path (``str``) or an object exposing the
            path through a ``name`` attribute. The value is forwarded
            unchanged to the format-specific extractor.

    Returns:
        The extracted text, ``"Unsupported file format"`` for anything that
        is not ``.pdf``/``.docx``, or a message starting with
        ``"Error extracting text:"`` when extraction fails.
    """
    try:
        if file is None:
            raise ValueError("no file was provided")
        # Accept plain paths as well as upload wrappers that carry ``.name``.
        filename = file if isinstance(file, str) else file.name
        # Compare case-insensitively so "CV.PDF" is treated like "cv.pdf".
        lowered = str(filename).lower()
        if lowered.endswith('.pdf'):
            return extract_text_from_pdf(file)
        if lowered.endswith('.docx'):
            return extract_text_from_docx(file)
        return "Unsupported file format"
    except Exception as e:
        # Errors are reported as text because the caller inspects the string.
        return f"Error extracting text: {str(e)}"
26
+
27
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``pdfplumber.open`` accepts for the document.

    Returns:
        The page texts joined by newlines. A page whose ``extract_text()``
        result is falsy contributes an empty string.
    """
    with pdfplumber.open(file) as pdf:
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next page.
        return "\n".join(page.extract_text() or '' for page in pdf.pages)
33
+
34
def extract_text_from_docx(file):
    """Return the paragraph texts of a DOCX document, one per line."""
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
37
+
38
# Legal-form / business-type markers used to recognise company names.
# A dotted abbreviation ("Inc.", "Co.") ends at its period, so no word
# boundary is demanded after the dot; a trailing ``\b`` there would only match
# when a letter follows the period. Bare words must end on a word boundary.
COMPANY_SUFFIX_RE = re.compile(
    r'\b(?:'
    r'(?:Inc|Corp|Ltd|Co|Pvt|S\.A|S\.L)\.'
    r'|(?:Inc|Corp|Ltd|Pvt|LLC|LLP|PLC|AG|GmbH|Company|Group|Services'
    r'|Technologies|Solutions|Consulting|Associates|Enterprises|Partners'
    r'|Holdings|Systems|Networks|Ventures|International)\b'
    r')',
    re.IGNORECASE,
)


def extract_companies(text):
    """List organisations in *text* whose name carries a company marker.

    Args:
        text: Raw resume text.

    Returns:
        The matching ``ORG`` entity texts, one per line, in first-seen order
        with repeated mentions removed. Empty string when nothing matches.
    """
    doc = nlp(text)
    companies = [
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and COMPANY_SUFFIX_RE.search(ent.text)
    ]
    # dict.fromkeys drops duplicates while preserving insertion order.
    return "\n".join(dict.fromkeys(companies))
51
+
52
def extract_colleges(text):
    """List organisations in *text* that look like educational institutions.

    An ``ORG`` entity qualifies when its lower-cased text contains one of the
    education keywords as a substring.

    Args:
        text: Raw resume text.

    Returns:
        The qualifying entity texts, one per line, in first-seen order with
        repeated mentions removed. Empty string when nothing qualifies.
    """
    # "institute" already covers "institute of technology" under substring
    # matching, so the longer phrase is not listed separately.
    edu_keywords = (
        "university", "college", "institute", "school", "academy",
        "polytechnic", "faculty", "department", "center", "centre",
        "campus", "educational",
    )

    doc = nlp(text)
    colleges = []
    for sent in doc.sents:
        for ent in sent.ents:
            if ent.label_ != "ORG":
                continue
            lowered = ent.text.lower()
            if any(keyword in lowered for keyword in edu_keywords):
                colleges.append(ent.text)

    # dict.fromkeys drops duplicates while preserving insertion order.
    return "\n".join(dict.fromkeys(colleges))
65
+
66
def extract_years_of_experience(text):
    """Add up every "<n> year(s)" and "<n> month(s)" mention in *text*.

    Decimal quantities ("3.5 years") and a trailing plus ("5+ years") are
    understood. The grand total is normalised so months stay below 12.

    Args:
        text: Raw resume text.

    Returns:
        ``"<Y> years and <M> months"``, or ``"Not available"`` when the total
        is zero.
    """
    # (?<![\d.]) keeps a match from starting in the middle of a number, so
    # "3.5 years" is read as 3.5 rather than 5. The trailing \b rejects words
    # such as "yearly" or "monthly".
    quantity = r'(?<![\d.])(\d+(?:\.\d+)?)\s*\+?\s*'
    year_values = re.findall(quantity + r'years?\b', text, re.IGNORECASE)
    month_values = re.findall(quantity + r'months?\b', text, re.IGNORECASE)

    total_months = round(
        sum(map(float, year_values)) * 12 + sum(map(float, month_values))
    )
    if not total_months:
        return "Not available"

    years, months = divmod(total_months, 12)
    return f"{years} years and {months} months"
77
+
78
def extract_phone(text):
    """Return the first phone number found in *text*.

    Recognises 3-3-4 digit numbers with optional ``-``, ``.`` or whitespace
    separators, an optional parenthesised area code, and an optional country
    code (``+`` followed by 1-3 digits, or a bare ``1``).

    Args:
        text: Raw resume text.

    Returns:
        The matched number as written in the text, or ``"Not found"``.
    """
    phone_pattern = (
        # Lookarounds instead of \b: a word boundary cannot sit in front of
        # "(" or "+", which made "(555) 123-4567" unmatchable.
        r'(?<!\w)'
        r'(?:\+\d{1,3}[-.\s]?|1[-.\s]?)?'
        r'(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}'
        r'(?!\w)'
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
88
+
89
def extract_email(text):
    """Return the first e-mail address found in *text*.

    Args:
        text: Raw resume text.

    Returns:
        The matched address, or ``"Not found"``.
    """
    # The top-level domain class is [A-Za-z]; a "|" inside a character class
    # is a literal pipe, not an alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
93
+
94
def extract_summary(doc):
    """Build a short summary from the first few "plain English" sentences.

    A sentence qualifies when it has more than five words and more than 70%
    of them appear in ``english_words``. At most three sentences are kept.

    Args:
        doc: An object whose ``sents`` attribute yields sentences exposing a
            ``text`` attribute.

    Returns:
        The qualifying sentence texts joined by single spaces. Empty string
        when none qualify.
    """
    import string

    summary = []
    for sent in doc.sents:
        if len(summary) >= 3:  # Limit to 3 sentences
            break
        # Strip surrounding punctuation so "engineer," or "Python." can be
        # looked up in the word set; drop tokens that were punctuation only.
        stripped = (word.strip(string.punctuation) for word in sent.text.split())
        tokens = [token for token in stripped if token]
        if len(tokens) <= 5:
            continue
        known = sum(1 for token in tokens if token.lower() in english_words)
        if known / len(tokens) > 0.7:
            summary.append(sent.text)
    return " ".join(summary)
103
+
104
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in *text*.

    The scheme is optional, so ``linkedin.com/in/<id>`` and
    ``www.linkedin.com/in/<id>`` are recognised as well as full URLs. The
    host is matched case-insensitively.

    Args:
        text: Raw resume text.

    Returns:
        The matched URL as written in the text, or ``"Not found"``.
    """
    linkedin_pattern = (
        # Either an explicit "//" prefix (optionally with http/https), or a
        # left boundary so hosts like "notlinkedin.com" are not accepted.
        r'(?:(?:https?:)?//|(?<![\w.-]))'
        r'(?:\w+\.)?linkedin\.com/in/'
        # [A-Za-z], not [A-z]: the latter also spans "[", "\", "]", "^", "`".
        r'[A-Za-z0-9_-]+/?'
    )
    match = re.search(linkedin_pattern, text, re.IGNORECASE)
    return match.group() if match else "Not found"
108
+
109
def parse_resume(file):
    """Extract the resume fields shown by the Gradio interface.

    Args:
        file: The uploaded resume, forwarded to ``extract_text``.

    Returns:
        A 7-tuple ``(companies, colleges, years_of_experience, phone, email,
        summary, linkedin)``. On failure the tuple keeps the same length: the
        error message is placed in the first slot and the rest are empty
        strings, so the number of returned values always matches the
        interface's seven output components.
    """
    output_count = 7

    def failure(message):
        # Keep the arity stable: message first, remaining boxes blank.
        return (message,) + ("",) * (output_count - 1)

    if file is None:
        return failure("Error: no file was uploaded")

    try:
        text = extract_text(file)
        if text.startswith("Error") or text == "Unsupported file format":
            return failure(text)

        doc = nlp(text)

        return (
            extract_companies(text),
            extract_colleges(text),
            extract_years_of_experience(text),
            extract_phone(text),
            extract_email(text),
            extract_summary(doc),
            extract_linkedin(text),
        )

    except Exception as e:
        import traceback
        return failure(
            f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        )
130
+
131
# Create Gradio interface with separate output components.
# parse_resume returns one value per Textbox below, in this order.
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=[
        gr.Textbox(label="Companies Worked For", lines=10),
        gr.Textbox(label="Colleges Attended", lines=10),
        gr.Textbox(label="Years of Experience"),
        gr.Textbox(label="Phone Number"),
        gr.Textbox(label="Email ID"),
        gr.Textbox(label="Summary", lines=3),
        gr.Textbox(label="LinkedIn ID")
    ],
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information."
)

# Start the server only when executed as a script, so importing this module
# (for reuse or testing) does not launch the app as a side effect.
if __name__ == "__main__":
    iface.launch(share=True)