Update app.py
app.py CHANGED
@@ -1,14 +1,129 @@
+import google.generativeai as genai
+import fitz  # PyMuPDF for PDF text extraction
 import streamlit as st
-
-from
+import spacy
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+from docx import Document
+import re
+from nltk.corpus import words
+import dateparser
+from datetime import datetime
+import os
 
-#
-
+# Load SpaCy model for dependency parsing
+nlp_spacy = spacy.load('en_core_web_sm')
 
+# Load the NER model
+tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
+model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
+nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+
+english_words = set(words.words())
+
+# Your hardcoded API key
+api_key = "AIzaSyCG-qpFRqJc0QOJT-AcAaO5XIEdE-nk3Tc"
+
+# Function to authenticate with Gemini API
+def authenticate_gemini(api_key):
+    try:
+        genai.configure(api_key=api_key)
+        model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
+        return model
+    except Exception as e:
+        st.error(f"Error configuring Gemini API: {e}")
+        return None
+
+# Function to filter and refine extracted ORG entities
+def refine_org_entities(entities):
+    refined_entities = set()
+    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']
+
+    for entity in entities:
+        if any(entity.endswith(suffix) for suffix in company_suffixes):
+            refined_entities.add(entity)
+        elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
+            refined_entities.add(entity)
+    return list(refined_entities)
+
+# Function to extract ORG entities using NER
+def extract_orgs(text):
+    ner_results = nlp_ner(text)
+    orgs = set()
+    for entity in ner_results:
+        if entity['entity_group'] == 'ORG':
+            orgs.add(entity['word'])
+
+    return refine_org_entities(orgs)
+
+# Extract text from PDF
+def extract_text_from_pdf(pdf_file):
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    text = ""
+    for page_num in range(doc.page_count):
+        page = doc.load_page(page_num)
+        text += page.get_text()
+    return text
+
+# Extract text from DOCX
+def extract_text_from_doc(doc_file):
+    doc = Document(doc_file)
+    text = '\n'.join([para.text for para in doc.paragraphs])
+    return text
+
+# Summary generation function
+def generate_summary(text, model):
+    prompt = f"Can you summarize the following document in 100 words?\n\n{text}"
+    try:
+        response = model.generate_content(prompt)
+        return response.text
+    except Exception as e:
+        return f"Error generating summary: {str(e)}"
+
+# Additional resume parsing functions
+def extract_experience(doc):
+    experience = 0
+    for ent in doc.ents:
+        if ent.label_ == "DATE":
+            date = dateparser.parse(ent.text)
+            if date:
+                experience = max(experience, datetime.now().year - date.year)
+    return experience
+
+def extract_phone(text):
+    phone_patterns = [
+        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
+        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
+    ]
+    for pattern in phone_patterns:
+        match = re.search(pattern, text)
+        if match:
+            return match.group()
+    return "Not found"
+
+def extract_email(text):
+    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+    match = re.search(email_pattern, text)
+    return match.group() if match else "Not found"
+
+def extract_colleges(doc):
+    colleges = set()
+    edu_keywords = ["university", "college", "institute", "school"]
+    for ent in doc.ents:
+        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
+            colleges.add(ent.text)
+    return list(colleges)
+
+def extract_linkedin(text):
+    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
+    match = re.search(linkedin_pattern, text)
+    return match.group() if match else "Not found"
+
+# Main function to process the resume and return the analysis
 def main():
     st.title("Resume Analyzer")
     st.write("Upload a resume to extract information")
 
+    # File uploader for resume input
     uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
 
     if uploaded_file is not None:
@@ -18,18 +133,44 @@ def main():
             if model is None:
                 return
 
-            # Extract resume
-
+            # Extract text from the uploaded resume
+            file_ext = uploaded_file.name.split('.')[-1].lower()
+            if file_ext == 'pdf':
+                resume_text = extract_text_from_pdf(uploaded_file)
+            elif file_ext in ['docx', 'doc']:
+                resume_text = extract_text_from_doc(uploaded_file)
+            else:
+                st.error("Unsupported file format.")
+                return
+
+            if not resume_text.strip():
+                st.error("The resume appears to be empty.")
+                return
+
+            # Process the resume
+            doc = nlp_spacy(resume_text)
 
-            #
+            # Extract information
+            companies = extract_orgs(resume_text)
             summary = generate_summary(resume_text, model)
+            experience = extract_experience(doc)
+            phone = extract_phone(resume_text)
+            email = extract_email(resume_text)
+            colleges = extract_colleges(doc)
+            linkedin = extract_linkedin(resume_text)
 
             # Display results
             st.subheader("Extracted Information")
-
-
+            st.write(f"*Years of Experience:* {experience}")
+            st.write("*Companies Worked For:*")
+            st.write(", ".join(companies))
+            st.write(f"*Phone Number:* {phone}")
+            st.write(f"*Email ID:* {email}")
+            st.write("*Colleges Attended:*")
+            st.write(", ".join(colleges))
+            st.write(f"*LinkedIn ID:* {linkedin}")
 
-            st.
+            st.write("Generated Summary")
             st.write(summary)
 
         except Exception as e:
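
For a quick sanity check of the new regex-based contact extractors without spinning up the Streamlit app, the snippet below runs the same patterns against a made-up resume fragment. This is an illustrative sketch, not part of the commit: the sample text, constant names, and the first_match harness are invented here; the three patterns are copied from extract_phone, extract_email, and extract_linkedin above.

import re

# Made-up resume fragment (assumption, for demonstration only)
SAMPLE = (
    "Jane Doe\n"
    "Email: jane.doe@example.com | Phone: 555-123-4567\n"
    "https://www.linkedin.com/in/jane-doe\n"
)

# Patterns copied from the functions added in this commit
PHONE_PATTERNS = [
    r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
    r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',
]
EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
LINKEDIN_PATTERN = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'

def first_match(patterns, text):
    # Same first-hit-wins behavior as extract_phone
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"

print("Phone:", first_match(PHONE_PATTERNS, SAMPLE))         # 555-123-4567
print("Email:", first_match([EMAIL_PATTERN], SAMPLE))        # jane.doe@example.com
print("LinkedIn:", first_match([LINKEDIN_PATTERN], SAMPLE))  # https://www.linkedin.com/in/jane-doe

One design note: the commit imports os but still hardcodes api_key; reading the key with os.environ.get("GOOGLE_API_KEY") (variable name assumed) would keep the secret out of the Space's repository.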