Spaces:
Sleeping
Sleeping
File size: 6,884 Bytes
09b06d4 0f51a7f 09b06d4 bbed939 6137008 0f51a7f bbed939 ff241cc bbed939 ff241cc bbed939 09b06d4 bbed939 09b06d4 bbed939 09b06d4 bbed939 2ccd029 bbed939 09b06d4 bbed939 09b06d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import gradio as gr
from transformers import pipeline
import PyPDF2
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO
import pandas as pd # For displaying rankings in a table
import re
import math
# Load the token classification pipeline once, at import time, so every
# request handled by this module reuses the same model instance.
model_name = "jjzha/jobbert_knowledge_extraction"
# NOTE(review): aggregation_strategy="first" presumably makes the pipeline
# return word-level grouped entities; the rest of this file relies on each
# result carrying "entity_group", "word", "start" and "end" keys - confirm
# against the transformers documentation if the strategy is ever changed.
pipe = pipeline("token-classification", model=model_name, aggregation_strategy="first")
# Aggregate spans separated by a single character into 1 entity
def aggregate_span(results):
    """Merge neighbouring entity spans into single entities.

    Two consecutive spans are merged when the second one starts exactly one
    character after the first one ends (i.e. they are separated by a single
    character such as a space). The merged entity keeps every key of the
    first span, with ``word`` joined by a space and ``end`` extended.

    Args:
        results: Sequence of span dicts, each with at least ``"word"``,
            ``"start"`` and ``"end"`` keys, in document order.

    Returns:
        A new list of span dicts. The input dicts are not modified.
        An empty input yields an empty list.
    """
    if not results:
        return []

    merged = []
    # Work on shallow copies so the caller's dicts are left untouched.
    current = dict(results[0])
    for result in results[1:]:
        if result["start"] == current["end"] + 1:
            current["word"] += " " + result["word"]
            current["end"] = result["end"]
        else:
            merged.append(current)
            current = dict(result)
    merged.append(current)
    return merged
# Extract knowledge entities from job posting
def ner(text):
    """Run the knowledge-extraction pipeline over ``text``.

    Every span that carries a truthy ``"entity_group"`` is relabelled with
    ``"entity": "Knowledge"`` (the ``"entity_group"`` key is removed), and a
    non-empty span list is then passed through ``aggregate_span``.

    Returns:
        A dict with the original ``"text"`` and the list of ``"entities"``.
    """
    spans = pipe(text)
    for span in spans:
        if not span.get("entity_group"):
            continue
        span["entity"] = "Knowledge"
        span.pop("entity_group")
    entities = aggregate_span(spans) if len(spans) > 0 else spans
    return {"text": text, "entities": entities}
# Extract text from input PDF
def extract_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        A single string; pages are joined without any separator.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` is a defensive guard: should a page yield no text object at
    # all, it contributes nothing instead of breaking the join.
    return "".join(page.extract_text() or "" for page in reader.pages)
def rank_knowledge(entities, job_posting_text, resume_text, *, proximity_window=20):
    """Score extracted terms by prominence in the job posting.

    A term's raw score is the number of whole-term occurrences in the
    posting, plus one for every (occurrence, priority keyword occurrence)
    pair whose start offsets are fewer than ``proximity_window`` characters
    apart. Raw scores are log-scaled against the best raw score so the top
    term gets 100. Matching is case-insensitive.

    Args:
        entities: Iterable of dicts with a ``"word"`` key.
        job_posting_text: Text the terms were extracted from.
        resume_text: Resume text used for the "In Resume" column
            (plain substring check).
        proximity_window: Maximum distance, in characters, between a term
            and a priority keyword for the pair to count.

    Returns:
        A list of ``{"Term", "Score", "In Resume"}`` dicts sorted by
        descending score. Duplicate terms collapse into one row. When no
        term occurs in the posting at all, every score is 0.0.
    """
    priority_keywords = ["must-have", "required", "preferred", "key", "important"]
    posting = job_posting_text.lower()
    resume = resume_text.lower()

    def positions(phrase):
        # Lookarounds behave like \b for phrases that start/end with a word
        # character, but still match phrases such as "c++" or ".net", where
        # \b can never succeed next to surrounding whitespace.
        pattern = rf'(?<!\w){re.escape(phrase)}(?!\w)'
        return [match.start() for match in re.finditer(pattern, posting)]

    # Keyword offsets do not depend on the term, so compute them once.
    keyword_positions = [
        pos for keyword in priority_keywords for pos in positions(keyword)
    ]

    scores = {}
    for entity in entities:
        term = entity["word"].lower()
        term_positions = positions(term)
        proximity_hits = sum(
            1
            for t_pos in term_positions
            for k_pos in keyword_positions
            if abs(t_pos - k_pos) < proximity_window
        )
        scores[term] = len(term_positions) + proximity_hits

    # Normalize (log scaling). log1p(0) == 0, so guard the division for the
    # case where no term scored anything.
    denominator = math.log1p(max(scores.values(), default=0))
    ranked_entities = [
        {
            "Term": term,
            "Score": (math.log1p(score) / denominator) * 100 if denominator else 0.0,
            "In Resume": "Yes" if term in resume else "No",
        }
        for term, score in scores.items()
    ]
    ranked_entities.sort(key=lambda row: row["Score"], reverse=True)
    return ranked_entities
# Compare extracted knowledge entities with the resume
def compare_with_resume(output_knowledge, resume_file):
    """Split entity words by whether they appear in the resume text.

    The resume is read with ``extract_pdf`` when ``resume_file`` is truthy;
    otherwise the resume text is the empty string. Membership is a
    case-insensitive substring check.

    Returns:
        ``(matched, unmatched)``: two lists of the entities' original
        ``"word"`` values, each in input order.
    """
    if resume_file:
        resume_text = extract_pdf(resume_file)
    else:
        resume_text = ''
    haystack = resume_text.lower()

    found, missing = [], []
    for entity in output_knowledge:
        word = entity["word"]
        bucket = found if word.lower() in haystack else missing
        bucket.append(word)
    return found, missing
def plot_comparison(matched_knowledge, unmatched_knowledge):
    """Render a bar chart of matched vs. unmatched knowledge counts.

    Each bar is annotated with its share of the total. When both lists are
    empty the shares are reported as ``0.0%`` instead of dividing by zero.

    Args:
        matched_knowledge: Terms found in the resume.
        unmatched_knowledge: Terms not found in the resume.

    Returns:
        A PIL image of the chart, decoded from an in-memory PNG.
    """
    labels = ['Matched', 'Unmatched']
    values = [len(matched_knowledge), len(unmatched_knowledge)]
    total = sum(values)
    percentages = [
        f"{(value / total * 100):.1f}%" if total else "0.0%"
        for value in values
    ]

    buf = BytesIO()
    plt.figure(figsize=(6, 4))
    try:
        bars = plt.bar(labels, values, color=['green', 'red'])
        plt.xlabel('Knowledge Match Status')
        plt.ylabel('Count')
        plt.title('Knowledge Match Comparison')
        plt.tight_layout()
        # Add percentage labels above bars
        for bar, percentage in zip(bars, percentages):
            plt.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.1,
                percentage,
                ha='center',
                fontsize=10,
            )
        plt.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close()
    buf.seek(0)
    return Image.open(buf)
def plot_pie_chart(ranked_knowledge, threshold=50):
    """Render a pie chart of resume coverage for high-scoring terms.

    Only rows whose ``"Score"`` is strictly greater than ``threshold`` are
    counted; they are split by their ``"In Resume"`` value.

    Args:
        ranked_knowledge: Rows with ``"Score"`` and ``"In Resume"`` keys.
        threshold: Exclusive lower bound on the score.

    Returns:
        A PIL image of the chart, decoded from an in-memory PNG. When no
        row exceeds the threshold, the image carries a short message
        instead of a pie.
    """
    # Filter terms above the threshold
    filtered_terms = [term for term in ranked_knowledge if term["Score"] > threshold]
    matched_terms = sum(1 for term in filtered_terms if term["In Resume"] == "Yes")
    unmatched_terms = len(filtered_terms) - matched_terms
    # Data for pie chart
    labels = ['Matched', 'Unmatched']
    values = [matched_terms, unmatched_terms]

    buf = BytesIO()
    plt.figure(figsize=(6, 4))
    try:
        if filtered_terms:
            plt.pie(values, labels=labels, autopct='%1.1f%%', colors=['green', 'red'], startangle=90)
        else:
            # A pie's wedges are x / sum(x); with nothing above the
            # threshold that sum is zero, so show a placeholder instead.
            plt.text(0.5, 0.5, "No terms above threshold", ha='center', va='center', fontsize=10)
            plt.axis('off')
        plt.title(f"Terms Above Threshold (Score > {threshold})")
        plt.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close()
    buf.seek(0)
    return Image.open(buf)
def ner_and_compare_with_plot_and_rank(job_posting_text, resume_file):
    """Combined function to process NER, comparison, ranking, and visualization.

    Args:
        job_posting_text: Raw job posting text to extract knowledge from.
        resume_file: Resume PDF, or a falsy value when none was uploaded
            (the resume text is then treated as empty).

    Returns:
        A 4-tuple, in this order:
        the ``ner`` result dict, a DataFrame of ranked terms, the
        matched/unmatched bar chart image, and the above-threshold pie
        chart image.
    """
    ner_result = ner(job_posting_text)
    resume_text = extract_pdf(resume_file) if resume_file else ''
    # NOTE: compare_with_resume extracts the PDF text again on its own,
    # since it takes the file rather than the already-extracted text.
    matched_knowledge, unmatched_knowledge = compare_with_resume(ner_result["entities"], resume_file)
    bar_plot = plot_comparison(matched_knowledge, unmatched_knowledge)
    # Ranking knowledge entities with "In Resume" column
    ranked_knowledge = rank_knowledge(ner_result["entities"], job_posting_text, resume_text)
    # Generate pie chart for a fixed threshold
    pie_chart = plot_pie_chart(ranked_knowledge, threshold=50)
    # Convert ranked knowledge to a DataFrame for better display
    ranked_df = pd.DataFrame(ranked_knowledge)
    return ner_result, ranked_df, bar_plot, pie_chart
# Gradio interface setup
# Output components are matched to the callback's return values by position,
# so their order must follow (highlight, table, bar chart, pie chart).
interface = gr.Interface(
    fn=ner_and_compare_with_plot_and_rank,
    inputs=[
        gr.Textbox(label="Enter Job Posting Text", lines=20, placeholder="Paste job posting text here..."),
        gr.File(label="Upload a PDF of your resume"),
    ],
    outputs=[
        "highlight",  # Highlighted job posting text with extracted entities
        gr.DataFrame(label="Ranked Knowledge"),  # Ranked knowledge table
        gr.Image(label="Comparison Chart"),  # Bar chart visualization
        gr.Image(label="Pie Chart for Terms Above Threshold"),
    ],
    title="Resume vs Job Posting Knowledge Match with Highlights and Rankings",
    description="Upload your resume and enter a job posting. The app will highlight key knowledge from the job posting, check if they are present in your resume, visualize the comparison, and rank knowledge terms based on importance.",
)
# Launch the Gradio app
interface.launch()
|