Prernas19 committed on
Commit
9e79309
·
verified ·
1 Parent(s): adcf648

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -0
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import fitz # Importing PyMuPDF for PDF text extraction
4
+ import nltk
5
+ from transformers import BertTokenizer, BertModel
6
+ import torch
7
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import pandas as pd
11
+ import gradio as gr
12
+
13
+ # Download NLTK data files
14
# Fetch the NLTK resources at start-up, in the same order as before.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)
16
+
17
+ # Load BERT model and tokenizer
18
# Both the tokenizer and the encoder come from the same pretrained checkpoint;
# they are loaded once at import time and shared by get_bert_embeddings().
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)
20
+
21
+ # Function to preprocess text
22
def preprocess_text(text):
    """Lower-case *text* and replace every run of non-word characters with one space.

    Word characters are the regex word class (letters, digits, underscore), so
    punctuation and whitespace runs each collapse to a single space. Leading
    and trailing separators are not stripped.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)
25
+
26
+ # Function to extract keywords using TF-IDF
27
def extract_keywords_tfidf(text, max_features=50):
    """Return the keywords of *text*, ordered by descending TF-IDF score.

    A TfidfVectorizer with English stop words is fitted on *text* as a
    single-document corpus and its vocabulary is limited to *max_features*
    terms. Ties in score are broken by reverse alphabetical order because the
    (score, term) pairs are sorted in reverse.

    Args:
        text: The document to analyse.
        max_features: Maximum number of vocabulary terms to keep.

    Returns:
        A list of terms, best-scoring first. The list is empty when *text* is
        blank or contains nothing but stop words / unusable tokens (for
        example a scanned PDF without a text layer).
    """
    # Blank input can never yield a vocabulary; skip the vectorizer entirely.
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError as err:
        # scikit-learn signals "no usable tokens" with an "empty vocabulary"
        # ValueError; any other ValueError is a genuine error and is re-raised.
        if "empty vocabulary" not in str(err):
            raise
        return []

    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray().flatten()
    ranked = sorted(zip(tfidf_scores, feature_names), reverse=True)
    return [keyword for _score, keyword in ranked]
34
+
35
+ # Function to extract text from a PDF
36
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    The document is opened with PyMuPDF inside a ``with`` block so the
    underlying file handle is released even if text extraction fails.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order (an empty string when no
        page yields text).
    """
    with fitz.open(pdf_path) as document:
        # Iterating the document yields its pages in order.
        return "".join(page.get_text() for page in document)
43
+
44
+ # Function to give feedback on resume
45
def give_feedback(resume_text, job_description):
    """Return a list of improvement suggestions for a resume.

    The checks are simple text heuristics: bullet-style consistency, presence
    of standard section headings, keyword overlap with the job description,
    action verbs, a summary/objective statement and at least one number.

    Args:
        resume_text: Raw text of the resume.
        job_description: Text of the job description to compare against.

    Returns:
        A list of feedback strings. When no check fires, the list contains a
        single positive message.
    """
    min_common_keywords = 8   # fewer shared keywords than this triggers a hint
    max_suggested_keywords = 5

    feedback = []
    resume_lower = resume_text.lower()

    # Check formatting: mixed bullet styles. A dash only counts as a bullet
    # when it starts a line, so hyphenated words and date ranges do not
    # trigger a false warning.
    has_dot_bullets = '•' in resume_text
    has_dash_bullets = re.search(r'^\s*-\s+', resume_text, re.MULTILINE) is not None
    if has_dot_bullets and has_dash_bullets:
        feedback.append("Consider using a consistent bullet point style throughout your resume.")

    # Check that at least one of the standard section headings is present.
    if not re.search(r'\bexperience\b|\beducation\b|\bskills\b', resume_lower):
        feedback.append("Make sure your resume includes sections like Experience, Education, and Skills.")

    # Extract keywords and check relevance.
    jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
    resume_keywords = set(extract_keywords_tfidf(preprocess_text(resume_text)))
    common_keywords = resume_keywords.intersection(jd_keywords)
    if len(common_keywords) < min_common_keywords:
        # Only suggest keywords the resume is actually missing, keeping the
        # job description's own ranking.
        missing = [kw for kw in jd_keywords if kw not in resume_keywords]
        if missing:
            feedback.append(f"Your resume could better match the job description. Consider adding keywords such as: {', '.join(missing[:max_suggested_keywords])}.")

    # Check for action verbs as whole words ("led" must not match "called").
    action_verbs = ["managed", "led", "developed", "designed", "implemented", "created"]
    action_verb_pattern = r'\b(?:' + '|'.join(action_verbs) + r')\b'
    if not re.search(action_verb_pattern, resume_lower):
        feedback.append("Consider using strong action verbs to describe your achievements and responsibilities.")

    # Check for a summary or objective statement.
    if not re.search(r'\bsummary\b|\bobjective\b', resume_text, re.IGNORECASE):
        feedback.append("Consider adding a professional summary or objective statement to provide a quick overview of your qualifications.")

    # Check for quantifiable achievements (any digit at all passes).
    if not re.search(r'\d', resume_text):
        feedback.append("Include quantifiable achievements in your experience section (e.g., increased sales by 20%).")

    # Provide positive feedback if none of the above conditions are met.
    if not feedback:
        feedback.append("Your resume is well-aligned with the job description. Ensure to keep it updated with relevant keywords and achievements.")

    return feedback
81
+
82
+ # Function to compute BERT embeddings
83
def get_bert_embeddings(text):
    """Encode *text* with the module-level BERT model and return one vector.

    The text is tokenized with truncation enabled, run through the model
    without gradient tracking, and the last hidden state is averaged over
    dimension 1 before being squeezed and converted to a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()
88
+
89
+ # Function to calculate BERT similarity score
90
def bert_similarity(resume, job_description):
    """Return the cosine similarity between the BERT embeddings of two texts.

    The resume is embedded first, then the job description; the result is the
    single entry of the 1x1 similarity matrix.
    """
    embeddings = [get_bert_embeddings(doc) for doc in (resume, job_description)]
    similarity_matrix = cosine_similarity(embeddings[:1], embeddings[1:])
    return similarity_matrix[0][0]
95
+
96
+ # Function to calculate TF-IDF cosine similarity score
97
def tfidf_cosine_similarity(resume, jd):
    """Return the TF-IDF cosine similarity between a resume and a job description.

    Both texts are vectorized together with a default TfidfVectorizer and the
    similarity of the two resulting rows is returned.

    Args:
        resume: Resume text.
        jd: Job description text.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or ``0.0`` when
        neither text contains any usable token (instead of letting
        scikit-learn's "empty vocabulary" error escape).
    """
    # Two blank documents share no vocabulary at all; nothing to compare.
    if not resume.strip() and not jd.strip():
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([resume, jd])
    except ValueError as err:
        # Non-blank texts can still be token-free (e.g. punctuation only).
        if "empty vocabulary" not in str(err):
            raise
        return 0.0

    # Row 0 is the resume, row 1 the job description.
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]
103
+
104
+ # Function to calculate Doc2Vec cosine similarity score
105
def doc2vec_cosine_similarity(resume, jd, model):
    """Return the cosine similarity of two texts under a Doc2Vec *model*.

    Each text is split on whitespace and passed to ``model.infer_vector``
    (resume first, then job description); the two inferred vectors are then
    compared with cosine similarity.
    """
    resume_vector, jd_vector = (
        model.infer_vector(doc.split()) for doc in (resume, jd)
    )
    similarity_matrix = cosine_similarity([resume_vector], [jd_vector])
    return similarity_matrix[0][0]
110
+
111
+ # Function to extract years of experience from resume
112
def extract_years_of_experience(text):
    """Return the sum of all "<number> year(s)" mentions found in *text*.

    Recognised forms include "5 years", "1 year", "5+ years" and "3 yrs"
    (case-insensitive). The unit must be a whole word, so "3 yearbooks" is
    not counted. Every mention is added up; overlapping periods in the resume
    are therefore counted more than once.

    Args:
        text: Resume text to scan.

    Returns:
        The total as an int, or 0 when no mention is found.
    """
    mentions = re.findall(
        r'\b(\d+)\s*\+?\s*(?:years?|yrs?)\b', text, re.IGNORECASE
    )
    return sum(int(value) for value in mentions)
117
+
118
+ # Function to extract information from the uploaded resume files
119
def extract_info_from_resumes(resume_files, job_description):
    """Score every uploaded resume against *job_description*.

    Each PDF is read once. A Doc2Vec model is trained on the preprocessed
    resumes plus the job description, then every resume gets a keyword-overlap
    count, TF-IDF, Doc2Vec and BERT similarity scores, a years-of-experience
    total and textual feedback.

    Args:
        resume_files: Iterable of uploaded files. Each item may be a path
            string or an object whose ``name`` attribute holds the path.
        job_description: Text of the job description.

    Returns:
        A list with one dict per resume, keyed by ``Name``,
        ``Keyword_Match_Score``, ``TFIDF_Score``, ``Doc2Vec_Score``,
        ``BERT_Score``, ``Years_of_Experience`` and ``Feedback``. The list is
        empty when no resumes are supplied.
    """
    # Nothing to score: avoid training a model on the job description alone.
    if not resume_files:
        return []

    # Read and preprocess each PDF exactly once.
    resumes = []
    for file in resume_files:
        path = getattr(file, 'name', file)  # accept file-like objects or plain paths
        raw_text = extract_text_from_pdf(path)
        resumes.append((path, raw_text, preprocess_text(raw_text)))

    # Job-description derived values are the same for every resume.
    jd_preprocessed = preprocess_text(job_description)
    jd_keywords = set(extract_keywords_tfidf(jd_preprocessed))

    # Train Doc2Vec model on resumes and job description.
    documents = [clean_text for _path, _raw, clean_text in resumes]
    documents.append(jd_preprocessed)
    tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
    model_doc2vec = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)

    data = []
    for path, raw_text, clean_text in resumes:
        resume_keywords = extract_keywords_tfidf(clean_text)
        years_of_experience = extract_years_of_experience(raw_text)
        name = os.path.splitext(os.path.basename(path))[0]
        feedback = give_feedback(raw_text, job_description)

        # Calculate scores.
        keyword_match_score = len(jd_keywords.intersection(resume_keywords))  # whole number
        tfidf_score = tfidf_cosine_similarity(raw_text, job_description)
        doc2vec_score = doc2vec_cosine_similarity(clean_text, jd_preprocessed, model_doc2vec)
        bert_score = bert_similarity(clean_text, jd_preprocessed)

        data.append({
            'Name': name,
            'Keyword_Match_Score': keyword_match_score,
            'TFIDF_Score': tfidf_score,
            'Doc2Vec_Score': doc2vec_score,
            'BERT_Score': bert_score,
            'Years_of_Experience': years_of_experience,
            'Feedback': '; '.join(feedback),  # combine feedback into a single string
        })

    return data
166
+
167
+ # Function to save data to an Excel file
168
def save_to_excel(data, output_file):
    """Write *data* to an Excel workbook at *output_file*.

    *data* is turned into a DataFrame and written without the index column.
    Returns *output_file* on success; if writing raises, returns a string of
    the form "Error saving file: <exception>" instead of propagating.
    """
    frame = pd.DataFrame(data)
    try:
        frame.to_excel(output_file, index=False)
    except Exception as e:
        return f"Error saving file: {e}"
    else:
        return output_file
175
+
176
+ # Gradio interface function
177
def gradio_interface(resume_files, job_description):
    """Gradio callback: analyse the uploaded resumes and return an Excel file path.

    Args:
        resume_files: Uploaded resume PDFs as delivered by the ``gr.Files`` input.
        job_description: Job description text from the textbox.

    Returns:
        Path of the generated ``Resume_Analysis.xlsx`` workbook.

    Raises:
        gr.Error: When no resumes were uploaded, the job description is blank,
            or the workbook could not be written. The output component is a
            file download, so problems are raised as UI errors rather than
            returned as strings that would be mistaken for file paths.
    """
    import tempfile

    if not resume_files:
        raise gr.Error("No resumes to process.")
    if not job_description or not job_description.strip():
        raise gr.Error("Please enter a job description.")

    # Write into a fresh temporary directory: it exists on every platform
    # (unlike Colab's /content) and keeps concurrent requests from
    # overwriting each other's results.
    output_file = os.path.join(tempfile.mkdtemp(), 'Resume_Analysis.xlsx')

    resumes = extract_info_from_resumes(resume_files, job_description)
    result = save_to_excel(resumes, output_file)

    # save_to_excel reports failure by returning a message instead of the path.
    if result != output_file:
        raise gr.Error(result)

    return result
186
+
187
+ # Gradio UI setup
188
# Input and output components for the resume-analysis UI.
_resume_upload = gr.Files(label="Upload multiple Resumes", type="filepath")  # Accept multiple file uploads
_job_description_box = gr.Textbox(
    label="Job Description",
    lines=5,
    placeholder="Enter the job description here...",
)
_results_download = gr.File(label="Download Results")  # Provide the output file

# Gradio UI setup
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[_resume_upload, _job_description_box],
    outputs=_results_download,
    description="Upload multiple resume PDFs and provide a job description to analyze the resumes and get an Excel file with the results.",
)

# Launch the Gradio interface
iface.launch(inline=False)