kylezhao101 committed on
Commit
bbed939
·
1 Parent(s): 2293223

Implement term ranking and plot comparison

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +168 -63
  3. requirements.txt +4 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🏢
4
  colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.32.0
8
  app_file: app.py
9
  pinned: false
10
  short_description: final project for IAT360
 
4
  colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.8.0
8
  app_file: app.py
9
  pinned: false
10
  short_description: final project for IAT360
app.py CHANGED
@@ -1,77 +1,182 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  import PyPDF2
 
 
 
 
 
 
4
 
5
  # Load the token classification pipeline
6
  model_name = "jjzha/jobbert_knowledge_extraction"
7
- pipe = pipeline("token-classification", model=model_name)
8
-
9
- # Function to extract and highlight key skills/words from the job posting
10
- def extract_keywords_with_highlights(job_posting_text):
11
- results = pipe(job_posting_text)
12
-
13
- # Fix the `##` issue by reconstructing full words
14
- reconstructed_text = ""
15
- highlighted_words = set()
16
- previous_end = 0
17
- for result in results:
18
- start, end, word = result['start'], result['end'], result['word']
19
- # Remove `##` for subwords
20
- clean_word = word.replace("##", "")
21
- highlighted_words.add(clean_word.lower())
22
- # Add text before the current word
23
- reconstructed_text += job_posting_text[previous_end:start]
24
- # Highlight the cleaned word
25
- reconstructed_text += (
26
- f'<span style="background-color:yellow; font-weight:bold;" '
27
- f'title="Entity: {result["entity"]} (Score: {result["score"]:.2f})">'
28
- f"{clean_word}</span>"
29
- )
30
- previous_end = end
31
- # Add the remaining text
32
- reconstructed_text += job_posting_text[previous_end:]
33
- # Replace newline characters with <br> to preserve line breaks
34
- reconstructed_text = reconstructed_text.replace("\n", "<br>")
35
-
36
- return (
37
- f'<div style="font-family:Arial, sans-serif; line-height:1.5;">{reconstructed_text}</div>',
38
- highlighted_words,
39
- )
40
-
41
- # Function to check if highlighted words are in the resume
42
- def check_keywords_in_resume(resume_file_path, job_posting_text):
43
- # Extract text from the uploaded PDF resume
44
- with open(resume_file_path, "rb") as file:
45
- pdf_reader = PyPDF2.PdfReader(file)
46
- resume_text = " ".join(page.extract_text() for page in pdf_reader.pages)
47
-
48
- # Extract highlighted keywords from the job posting
49
- highlighted_html, highlighted_words = extract_keywords_with_highlights(job_posting_text)
50
-
51
- # Check if each highlighted word is in the resume
52
- resume_words = set(resume_text.lower().split())
53
- matched_words = highlighted_words.intersection(resume_words)
54
- missing_words = highlighted_words - matched_words
55
-
56
- # Prepare a summary
57
- matched_summary = f"Matched Keywords: {', '.join(matched_words)}"
58
- missing_summary = f"Missing Keywords: {', '.join(missing_words)}"
59
- return highlighted_html, matched_summary, missing_summary
60
-
61
- # Set up Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  interface = gr.Interface(
63
- fn=check_keywords_in_resume,
64
  inputs=[
65
- gr.File(label="Upload Resume PDF", type="filepath"),
66
- gr.Textbox(label="Enter Job Posting Text", lines=30, placeholder="Paste job posting text here..."),
67
  ],
68
  outputs=[
69
- gr.HTML(label="Highlighted Key Skills/Words in Job Posting"),
70
- gr.Textbox(label="Matched Keywords"),
71
- gr.Textbox(label="Missing Keywords"),
 
72
  ],
73
- title="Resume vs Job Posting Skill Match with Highlights",
74
- description="Upload your resume and enter a job posting. The app will highlight key skills from the job posting and check if they are present in your resume.",
75
  )
76
 
77
  # Launch the Gradio app
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import PyPDF2
4
+ from PIL import Image
5
+ import matplotlib.pyplot as plt
6
+ from io import BytesIO
7
+ import pandas as pd # For displaying rankings in a table
8
+ import re
9
+ import math
10
 
11
  # Load the token classification pipeline
12
  model_name = "jjzha/jobbert_knowledge_extraction"
13
+ pipe = pipeline("token-classification", model=model_name, aggregation_strategy="first")
14
+
15
+ # Aggregate overlapping or adjacent spans into 1 entity
16
+ def aggregate_span(results):
17
+ new_results = []
18
+ current_result = results[0]
19
+ for result in results[1:]:
20
+ if result["start"] == current_result["end"] + 1:
21
+ current_result["word"] += " " + result["word"]
22
+ current_result["end"] = result["end"]
23
+ else:
24
+ new_results.append(current_result)
25
+ current_result = result
26
+ new_results.append(current_result)
27
+ return new_results
28
+
29
+ # Extract knowledge entities from job posting
30
+ def ner(text):
31
+ output_knowledge = pipe(text)
32
+ for result in output_knowledge:
33
+ if result.get("entity_group"):
34
+ result["entity"] = "Knowledge"
35
+ del result["entity_group"]
36
+ if len(output_knowledge) > 0:
37
+ output_knowledge = aggregate_span(output_knowledge)
38
+
39
+ return {"text": text, "entities": output_knowledge}
40
+
41
+ # Extract text from input PDF
42
+ def extract_pdf(pdf_file):
43
+ reader = PyPDF2.PdfReader(pdf_file)
44
+ text = ''
45
+ for page in reader.pages:
46
+ text += page.extract_text()
47
+ return text
48
+
49
+ def rank_knowledge(entities, job_posting_text, resume_text):
50
+ scores = {}
51
+ priority_keywords = ["must-have", "required", "preferred", "key", "important"]
52
+
53
+ for entity in entities:
54
+ term = entity["word"].lower()
55
+ term_score = 0
56
+
57
+ # Count exact matches of the term in the job posting
58
+ term_score += len(re.findall(rf'\b{re.escape(term)}\b', job_posting_text.lower()))
59
+
60
+ # Proximity to priority keywords
61
+ term_positions = [m.start() for m in re.finditer(rf'\b{re.escape(term)}\b', job_posting_text.lower())]
62
+ for keyword in priority_keywords:
63
+ keyword_positions = [m.start() for m in re.finditer(rf'\b{re.escape(keyword)}\b', job_posting_text.lower())]
64
+ for t_pos in term_positions:
65
+ for k_pos in keyword_positions:
66
+ if abs(t_pos - k_pos) < 20: # Within 20 characters
67
+ term_score += 1
68
+
69
+ scores[term] = term_score
70
+
71
+ # Normalize
72
+ max_score = max(scores.values(), default=1)
73
+ ranked_entities = [
74
+ {
75
+ "Term": k,
76
+ "Score": (math.log1p(v) / math.log1p(max_score)) * 100, # Log scaling
77
+ "In Resume": "Yes" if k in resume_text.lower() else "No"
78
+ }
79
+ for k, v in scores.items()
80
+ ]
81
+
82
+ ranked_entities.sort(key=lambda x: x["Score"], reverse=True)
83
+ return ranked_entities
84
+
85
+ # Compare extracted knowledge entities with the resume
86
+ def compare_with_resume(output_knowledge, resume_file):
87
+ resume_text = extract_pdf(resume_file) if resume_file else ''
88
+ matched_knowledge = []
89
+ unmatched_knowledge = []
90
+
91
+ for entity in output_knowledge:
92
+ if entity["word"].lower() in resume_text.lower():
93
+ matched_knowledge.append(entity["word"])
94
+ else:
95
+ unmatched_knowledge.append(entity["word"])
96
+ return matched_knowledge, unmatched_knowledge
97
+
98
+ def plot_comparison(matched_knowledge, unmatched_knowledge):
99
+ labels = ['Matched', 'Unmatched']
100
+ values = [len(matched_knowledge), len(unmatched_knowledge)]
101
+ total = sum(values)
102
+ percentages = [f"{(value / total * 100):.1f}%" for value in values]
103
+
104
+ plt.figure(figsize=(6, 4))
105
+ bars = plt.bar(labels, values, color=['green', 'red'])
106
+ plt.xlabel('Knowledge Match Status')
107
+ plt.ylabel('Count')
108
+ plt.title('Knowledge Match Comparison')
109
+ plt.tight_layout()
110
+
111
+ # Add percentage labels above bars
112
+ for bar, percentage in zip(bars, percentages):
113
+ plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1, percentage, ha='center', fontsize=10)
114
+
115
+ buf = BytesIO()
116
+ plt.savefig(buf, format='png')
117
+ buf.seek(0)
118
+ plt.close()
119
+
120
+ return Image.open(buf)
121
+
122
+ def plot_pie_chart(ranked_knowledge, threshold=50):
123
+ # Filter terms above the threshold
124
+ filtered_terms = [term for term in ranked_knowledge if term["Score"] >= threshold]
125
+ matched_terms = sum(1 for term in filtered_terms if term["In Resume"] == "Yes")
126
+ unmatched_terms = len(filtered_terms) - matched_terms
127
+
128
+ # Data for pie chart
129
+ labels = ['Matched', 'Unmatched']
130
+ values = [matched_terms, unmatched_terms]
131
+
132
+ # Create pie chart
133
+ plt.figure(figsize=(6, 4))
134
+ plt.pie(values, labels=labels, autopct='%1.1f%%', colors=['green', 'red'], startangle=90)
135
+ plt.title(f"Terms Above Threshold (Score >= {threshold})")
136
+ buf = BytesIO()
137
+ plt.savefig(buf, format='png')
138
+ buf.seek(0)
139
+ plt.close()
140
+ return Image.open(buf)
141
+
142
+ def ner_and_compare_with_plot_and_rank(job_posting_text, resume_file):
143
+ """Combined function to process NER, comparison, ranking, and visualization."""
144
+ ner_result = ner(job_posting_text)
145
+ resume_text = extract_pdf(resume_file) if resume_file else ''
146
+ matched_knowledge, unmatched_knowledge = compare_with_resume(ner_result["entities"], resume_file)
147
+ comparison_result = {
148
+ "Matched Knowledge": matched_knowledge,
149
+ "Unmatched Knowledge": unmatched_knowledge,
150
+ }
151
+ bar_plot = plot_comparison(matched_knowledge, unmatched_knowledge)
152
+
153
+ # Ranking knowledge entities with "In Resume" column
154
+ ranked_knowledge = rank_knowledge(ner_result["entities"], job_posting_text, resume_text)
155
+
156
+ # Generate pie chart for a fixed threshold
157
+ pie_chart = plot_pie_chart(ranked_knowledge, threshold=50)
158
+
159
+ # Convert ranked knowledge to a DataFrame for better display
160
+ ranked_df = pd.DataFrame(ranked_knowledge)
161
+
162
+ return ner_result, ranked_df, bar_plot, pie_chart
163
+
164
+
165
+ # Gradio interface setup
166
  interface = gr.Interface(
167
+ fn=ner_and_compare_with_plot_and_rank,
168
  inputs=[
169
+ gr.Textbox(label="Enter Job Posting Text", lines=20, placeholder="Paste job posting text here..."),
170
+ gr.File(label="Upload a PDF of your resume")
171
  ],
172
  outputs=[
173
+ "highlight", # Highlighted job posting text with extracted entities
174
+ gr.DataFrame(label="Ranked Knowledge"), # Ranked knowledge table
175
+ gr.Image(label="Pie Chart for Terms Above Threshold")
176
+ gr.Image(label="Comparison Chart"), # Bar chart visualization
177
  ],
178
+ title="Resume vs Job Posting Knowledge Match with Highlights and Rankings",
179
+ description="Upload your resume and enter a job posting. The app will highlight key knowledge from the job posting, check if they are present in your resume, visualize the comparison, and rank knowledge terms based on importance.",
180
  )
181
 
182
  # Launch the Gradio app
requirements.txt CHANGED
@@ -2,4 +2,7 @@ torch
2
  transformers
3
  gradio # if using Gradio
4
  streamlit # if using Streamlit
5
- pyPDF2
 
 
 
 
2
  transformers
3
  gradio # if using Gradio
4
  streamlit # if using Streamlit
5
+ pyPDF2
6
+ matplotlib
7
+ Pillow
8
+ pandas