# final-project / app.py
# kylezhao101's picture
# Update app.py
# ff241cc verified
import gradio as gr
from transformers import pipeline
import PyPDF2
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO
import pandas as pd # For displaying rankings in a table
import re
import math
# Build the token-classification pipeline once at import time; ner() reuses it
# for every request.
model_name = "jjzha/jobbert_knowledge_extraction"
pipe = pipeline(
    "token-classification",
    model=model_name,
    aggregation_strategy="first",
)
# Merge consecutive spans separated by exactly one character (e.g. a single space) into 1 entity
def aggregate_span(results):
    """Merge entity spans that are separated by exactly one character.

    Two consecutive spans are merged when the later one starts at
    ``previous["end"] + 1`` (one character, typically a space, lies between
    them).  The merged span keeps the first span's other fields, joins the
    ``"word"`` values with a single space and takes the later span's ``"end"``.

    Args:
        results: List of span dicts with at least ``"start"``, ``"end"`` and
            ``"word"`` keys, in the order they should be scanned.

    Returns:
        A new list of span dicts.  The input dicts are left unmodified, and an
        empty input yields an empty list.
    """
    if not results:
        return []

    merged = []
    # Work on copies so extending "word"/"end" never leaks into the caller's data.
    current = dict(results[0])
    for span in results[1:]:
        if span["start"] == current["end"] + 1:
            current["word"] += " " + span["word"]
            current["end"] = span["end"]
        else:
            merged.append(current)
            current = dict(span)
    merged.append(current)
    return merged
# Extract knowledge entities from job posting
def ner(text):
    """Run the knowledge-extraction pipeline on ``text``.

    Every span that carries a truthy ``"entity_group"`` is relabelled: it gets
    ``"entity": "Knowledge"`` and its ``"entity_group"`` key is removed.  A
    non-empty span list is then passed through ``aggregate_span``.

    Returns:
        ``{"text": text, "entities": spans}``.
    """
    spans = pipe(text)
    for span in spans:
        if not span.get("entity_group"):
            continue
        span["entity"] = "Knowledge"
        del span["entity_group"]

    entities = aggregate_span(spans) if len(spans) > 0 else spans
    return {"text": text, "entities": entities}
# Extract text from input PDF
def extract_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, concatenated in page order.

    ``pdf_file`` is handed directly to ``PyPDF2.PdfReader``.  Page texts are
    joined with no separator between pages.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    page_texts = [page.extract_text() for page in pdf.pages]
    return ''.join(page_texts)
def rank_knowledge(entities, job_posting_text, resume_text, proximity_window=20):
    """Score extracted knowledge terms by how prominent they are in the posting.

    A term's raw score is the number of whole-term occurrences in the job
    posting plus one point for every (occurrence, priority keyword occurrence)
    pair whose start offsets are fewer than ``proximity_window`` characters
    apart.  Raw scores are log-scaled against the highest raw score so the
    top term gets 100.  All matching is case-insensitive.

    Args:
        entities: Iterable of dicts with a ``"word"`` key.
        job_posting_text: Job posting text the terms are scored against.
        resume_text: Resume text; a term counts as present when its lowercase
            form is a substring of the lowercase resume text.
        proximity_window: Maximum distance (exclusive), in characters, between
            a term and a priority keyword for the proximity bonus.

    Returns:
        List of ``{"Term", "Score", "In Resume"}`` dicts sorted by descending
        score.  Terms that repeat in ``entities`` appear once.  When no term
        occurs in the posting, every score is ``0.0``.
    """
    priority_keywords = ["must-have", "required", "preferred", "key", "important"]

    def find_positions(phrase, haystack):
        # Lookarounds behave like \b for phrases with word-character edges, but
        # also let phrases such as "c++" match, which \b...\b never can.
        pattern = rf'(?<!\w){re.escape(phrase)}(?!\w)'
        return [match.start() for match in re.finditer(pattern, haystack)]

    # Loop-invariant work: lowercase once and scan for the keywords once.
    posting = job_posting_text.lower()
    resume = resume_text.lower()
    keyword_positions = [
        pos
        for keyword in priority_keywords
        for pos in find_positions(keyword, posting)
    ]

    scores = {}
    for entity in entities:
        term = entity["word"].lower()
        term_positions = find_positions(term, posting)
        proximity_bonus = sum(
            1
            for t_pos in term_positions
            for k_pos in keyword_positions
            if abs(t_pos - k_pos) < proximity_window
        )
        scores[term] = len(term_positions) + proximity_bonus

    # log1p(0) == 0, so guard the normalisation when nothing matched at all.
    scale = math.log1p(max(scores.values(), default=0))
    ranked_entities = [
        {
            "Term": term,
            "Score": (math.log1p(score) / scale) * 100 if scale else 0.0,
            "In Resume": "Yes" if term in resume else "No",
        }
        for term, score in scores.items()
    ]
    ranked_entities.sort(key=lambda item: item["Score"], reverse=True)
    return ranked_entities
# Compare extracted knowledge entities with the resume
def compare_with_resume(output_knowledge, resume_file):
    """Split entity words by whether they occur in the resume.

    The resume text comes from ``extract_pdf(resume_file)``; a falsy
    ``resume_file`` is treated as an empty resume.  A word counts as matched
    when its lowercase form is a substring of the lowercase resume text.

    Returns:
        ``(matched_words, unmatched_words)``, each in input order.
    """
    if resume_file:
        resume_lower = extract_pdf(resume_file).lower()
    else:
        resume_lower = ''

    words = [entity["word"] for entity in output_knowledge]
    found = [word for word in words if word.lower() in resume_lower]
    missing = [word for word in words if word.lower() not in resume_lower]
    return found, missing
def plot_comparison(matched_knowledge, unmatched_knowledge):
    """Render a bar chart of matched vs. unmatched knowledge counts.

    Each bar is annotated with its share of the total as a percentage.  When
    both inputs are empty, both bars are labelled ``0.0%`` instead of raising
    ``ZeroDivisionError``.

    Args:
        matched_knowledge: Sized collection of terms found in the resume.
        unmatched_knowledge: Sized collection of terms not found in the resume.

    Returns:
        The chart as an image opened with ``PIL.Image.open`` from an in-memory
        PNG.
    """
    labels = ['Matched', 'Unmatched']
    values = [len(matched_knowledge), len(unmatched_knowledge)]
    total = sum(values)
    # No entities at all means there is nothing to divide by.
    shares = [(value / total * 100) if total else 0.0 for value in values]
    percentages = [f"{share:.1f}%" for share in shares]

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        bars = ax.bar(labels, values, color=['green', 'red'])
        ax.set_xlabel('Knowledge Match Status')
        ax.set_ylabel('Count')
        ax.set_title('Knowledge Match Comparison')
        fig.tight_layout()
        # Add percentage labels above bars
        for bar, percentage in zip(bars, percentages):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.1,
                percentage,
                ha='center',
                fontsize=10,
            )
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def plot_pie_chart(ranked_knowledge, threshold=50):
    """Render a pie chart of resume coverage for terms scoring above ``threshold``.

    Args:
        ranked_knowledge: Iterable of dicts with ``"Score"`` and
            ``"In Resume"`` (``"Yes"``/other) keys.
        threshold: Only terms with ``Score > threshold`` are counted.

    Returns:
        The chart as an image opened with ``PIL.Image.open`` from an in-memory
        PNG.  When no term exceeds the threshold, the image shows a
        placeholder message instead of a pie.
    """
    # Filter terms above the threshold
    filtered_terms = [term for term in ranked_knowledge if term["Score"] > threshold]
    matched_terms = sum(1 for term in filtered_terms if term["In Resume"] == "Yes")
    unmatched_terms = len(filtered_terms) - matched_terms

    labels = ['Matched', 'Unmatched']
    values = [matched_terms, unmatched_terms]

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        if filtered_terms:
            ax.pie(values, labels=labels, autopct='%1.1f%%', colors=['green', 'red'], startangle=90)
        else:
            # All-zero data has no fractions to draw, so show a message
            # rather than handing pie() values that sum to zero.
            ax.text(0.5, 0.5, "No terms above threshold",
                    ha='center', va='center', transform=ax.transAxes)
            ax.axis('off')
        ax.set_title(f"Terms Above Threshold (Score > {threshold})")
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def ner_and_compare_with_plot_and_rank(job_posting_text, resume_file):
    """Combined function to process NER, comparison, ranking, and visualization.

    Args:
        job_posting_text: Raw job posting text.
        resume_file: Resume PDF as accepted by ``extract_pdf``; a falsy value
            is treated as an empty resume.

    Returns:
        Tuple ``(ner_result, ranked_df, bar_plot, pie_chart)``: the NER output
        dict, a DataFrame of ranked terms, the matched/unmatched bar chart and
        the above-threshold pie chart, in that order.
    """
    ner_result = ner(job_posting_text)
    entities = ner_result["entities"]

    # NOTE: compare_with_resume() extracts the PDF text again on its own, so
    # the resume is currently parsed twice per request.
    resume_text = extract_pdf(resume_file) if resume_file else ''
    matched_knowledge, unmatched_knowledge = compare_with_resume(entities, resume_file)
    bar_plot = plot_comparison(matched_knowledge, unmatched_knowledge)

    # Ranking knowledge entities with "In Resume" column
    ranked_knowledge = rank_knowledge(entities, job_posting_text, resume_text)

    # Generate pie chart for a fixed threshold
    pie_chart = plot_pie_chart(ranked_knowledge, threshold=50)

    # Fixed columns keep the table headers even when no terms were extracted.
    ranked_df = pd.DataFrame(ranked_knowledge, columns=["Term", "Score", "In Resume"])
    return ner_result, ranked_df, bar_plot, pie_chart
# Gradio interface setup
interface = gr.Interface(
    fn=ner_and_compare_with_plot_and_rank,
    inputs=[
        gr.Textbox(label="Enter Job Posting Text", lines=20, placeholder="Paste job posting text here..."),
        gr.File(label="Upload a PDF of your resume"),
    ],
    # Components are filled positionally from the function's return tuple
    # (ner_result, ranked_df, bar_plot, pie_chart), so the bar chart component
    # must come before the pie chart component.
    outputs=[
        "highlight",  # Highlighted job posting text with extracted entities
        gr.DataFrame(label="Ranked Knowledge"),  # Ranked knowledge table
        gr.Image(label="Comparison Chart"),  # Bar chart visualization
        gr.Image(label="Pie Chart for Terms Above Threshold"),
    ],
    title="Resume vs Job Posting Knowledge Match with Highlights and Rankings",
    description="Upload your resume and enter a job posting. The app will highlight key knowledge from the job posting, check if they are present in your resume, visualize the comparison, and rank knowledge terms based on importance.",
)
# Launch the Gradio app
interface.launch()