bangaboy committed on
Commit
61c1a82
·
verified ·
1 Parent(s): bec9bd1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -10
app.py CHANGED
@@ -1,14 +1,129 @@
 
 
1
  import streamlit as st
2
- from resume_parser import extract_resume_data
3
- from gemini_api import authenticate_gemini, generate_summary
 
 
 
 
 
 
4
 
5
- # Your hardcoded API key (You can also set this in environment variables)
6
- api_key = "AIzaSyCG-qpFRqJc0QOJT-AcAaO5XIEdE-nk3Tc"
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def main():
9
  st.title("Resume Analyzer")
10
  st.write("Upload a resume to extract information")
11
 
 
12
  uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
13
 
14
  if uploaded_file is not None:
@@ -18,18 +133,44 @@ def main():
18
  if model is None:
19
  return
20
 
21
- # Extract resume data
22
- extracted_data, resume_text = extract_resume_data(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Generate summary using Gemini API
 
25
  summary = generate_summary(resume_text, model)
 
 
 
 
 
26
 
27
  # Display results
28
  st.subheader("Extracted Information")
29
- for key, value in extracted_data.items():
30
- st.write(f"*{key}:* {value}")
 
 
 
 
 
 
31
 
32
- st.subheader("Generated Summary")
33
  st.write(summary)
34
 
35
  except Exception as e:
 
1
+ import google.generativeai as genai
2
+ import fitz # PyMuPDF for PDF text extraction
3
  import streamlit as st
4
+ import spacy
5
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
6
+ from docx import Document
7
+ import re
8
+ from nltk.corpus import words
9
+ import dateparser
10
+ from datetime import datetime
11
+ import os
12
 
13
# Load SpaCy model for dependency parsing
nlp_spacy = spacy.load('en_core_web_sm')

# Load the NER model
# NOTE(review): from_pretrained downloads/caches weights at import time,
# so app startup blocks until the model is available — confirm acceptable.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
# aggregation_strategy="simple" merges word-piece tokens into whole-entity
# spans, so downstream code sees complete entity words.
nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# English vocabulary set from NLTK (requires the 'words' corpus to be downloaded).
english_words = set(words.words())
22
+
23
# API key: prefer the GEMINI_API_KEY environment variable. The inline
# fallback is a secret committed to source control — revoke/rotate it and
# delete the fallback once the environment variable is configured.
api_key = os.environ.get("GEMINI_API_KEY", "AIzaSyCG-qpFRqJc0QOJT-AcAaO5XIEdE-nk3Tc")
25
+
26
+ # Function to authenticate with Gemini API
27
# Function to authenticate with Gemini API
def authenticate_gemini(api_key):
    """Configure the Gemini SDK and return a GenerativeModel, or None on failure.

    On error this shows a Streamlit error message instead of raising, so the
    caller can bail out by checking for a None return value.
    """
    try:
        genai.configure(api_key=api_key)
        # Named gemini_model so it does not shadow the module-level NER `model`.
        gemini_model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
        return gemini_model
    except Exception as e:
        st.error(f"Error configuring Gemini API: {e}")
        return None
35
+
36
+ # Function to filter and refine extracted ORG entities
37
# Function to filter and refine extracted ORG entities
def refine_org_entities(entities):
    """Filter raw NER ORG strings down to likely company names.

    An entity is kept when it either ends with a common corporate suffix
    (Inc, LLC, ...) or looks like two capitalized words ("Acme Widgets").
    Returns a sorted list so the output order is deterministic (the original
    returned an arbitrary set order, which made UI output unstable).
    """
    # str.endswith accepts a tuple: one C-level check instead of a loop.
    company_suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
    refined_entities = set()
    for entity in entities:
        if entity.endswith(company_suffixes):
            refined_entities.add(entity)
        elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
            refined_entities.add(entity)
    return sorted(refined_entities)
47
+
48
+ # Function to extract ORG entities using NER
49
# Function to extract ORG entities using NER
def extract_orgs(text):
    """Run the transformer NER pipeline over *text* and return refined company names."""
    org_names = {
        hit['word']
        for hit in nlp_ner(text)
        if hit['entity_group'] == 'ORG'
    }
    return refine_org_entities(org_names)
57
+
58
+ # Extract text from PDF
59
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    *pdf_file* is a file-like object (e.g. Streamlit's UploadedFile); its
    entire contents are read into memory for PyMuPDF.
    """
    # Context manager closes the document deterministically instead of
    # leaking the handle (the original never called doc.close()).
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
66
+
67
+ # Extract text from DOCX
68
# Extract text from DOCX
def extract_text_from_doc(doc_file):
    """Return all paragraph text from a .docx file, newline-separated."""
    parsed = Document(doc_file)
    lines = []
    for paragraph in parsed.paragraphs:
        lines.append(paragraph.text)
    return '\n'.join(lines)
72
+
73
+ # Summary generation function
74
# Summary generation function
def generate_summary(text, model):
    """Ask the Gemini model for a ~100-word summary of *text*.

    Best-effort: returns the summary text on success, or a human-readable
    error string when the API call fails, so callers always get a
    displayable string.
    """
    prompt = f"Can you summarize the following document in 100 words?\n\n{text}"
    try:
        return model.generate_content(prompt).text
    except Exception as e:
        return f"Error generating summary: {str(e)}"
81
+
82
+ # Additional resume parsing functions
83
# Additional resume parsing functions
def extract_experience(doc):
    """Estimate years of experience from DATE entities in a SpaCy doc.

    Uses the oldest parseable DATE entity: returns the number of calendar
    years between it and now, or 0 when no entity parses.
    """
    current_year = datetime.now().year
    years = 0
    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        parsed = dateparser.parse(ent.text)
        if parsed is not None:
            years = max(years, current_year - parsed.year)
    return years
91
+
92
def extract_phone(text):
    """Return the first US-style phone number found in *text*, else "Not found"."""
    patterns = (
        # Optional +1 country code, area code with or without parentheses.
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        # Plain ###-###-#### fallback.
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',
    )
    for pattern in patterns:
        hit = re.search(pattern, text)
        if hit is not None:
            return hit.group()
    return "Not found"
102
+
103
def extract_email(text):
    """Return the first email address found in *text*, else "Not found"."""
    hit = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    if hit is None:
        return "Not found"
    return hit.group()
107
+
108
def extract_colleges(doc):
    """Collect ORG entities from a SpaCy doc that look like educational institutions."""
    edu_keywords = ("university", "college", "institute", "school")
    found = set()
    for ent in doc.ents:
        if ent.label_ != "ORG":
            continue
        lowered = ent.text.lower()
        if any(keyword in lowered for keyword in edu_keywords):
            found.add(ent.text)
    return list(found)
115
+
116
def extract_linkedin(text):
    """Return the first LinkedIn profile URL in *text*, or "Not found".

    Matches //linkedin.com/in/<slug> with optional http(s) scheme and an
    optional subdomain (e.g. www.).
    """
    # Fixed character class: the original [A-z0-9_-] ASCII range also
    # matched the punctuation [ \ ] ^ ` between 'Z' and 'a'.
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
120
+
121
+ # Main function to process the resume and return the analysis
122
  def main():
123
  st.title("Resume Analyzer")
124
  st.write("Upload a resume to extract information")
125
 
126
+ # File uploader for resume input
127
  uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
128
 
129
  if uploaded_file is not None:
 
133
  if model is None:
134
  return
135
 
136
+ # Extract text from the uploaded resume
137
+ file_ext = uploaded_file.name.split('.')[-1].lower()
138
+ if file_ext == 'pdf':
139
+ resume_text = extract_text_from_pdf(uploaded_file)
140
+ elif file_ext in ['docx', 'doc']:
141
+ resume_text = extract_text_from_doc(uploaded_file)
142
+ else:
143
+ st.error("Unsupported file format.")
144
+ return
145
+
146
+ if not resume_text.strip():
147
+ st.error("The resume appears to be empty.")
148
+ return
149
+
150
+ # Process the resume
151
+ doc = nlp_spacy(resume_text)
152
 
153
+ # Extract information
154
+ companies = extract_orgs(resume_text)
155
  summary = generate_summary(resume_text, model)
156
+ experience = extract_experience(doc)
157
+ phone = extract_phone(resume_text)
158
+ email = extract_email(resume_text)
159
+ colleges = extract_colleges(doc)
160
+ linkedin = extract_linkedin(resume_text)
161
 
162
  # Display results
163
  st.subheader("Extracted Information")
164
+ st.write(f"*Years of Experience:* {experience}")
165
+ st.write("*Companies Worked For:*")
166
+ st.write(", ".join(companies))
167
+ st.write(f"*Phone Number:* {phone}")
168
+ st.write(f"*Email ID:* {email}")
169
+ st.write("*Colleges Attended:*")
170
+ st.write(", ".join(colleges))
171
+ st.write(f"*LinkedIn ID:* {linkedin}")
172
 
173
+ st.write("Generated Summary")
174
  st.write(summary)
175
 
176
  except Exception as e: