File size: 6,884 Bytes
09b06d4
0f51a7f
09b06d4
bbed939
 
 
 
 
 
6137008
0f51a7f
 
bbed939
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff241cc
bbed939
 
 
 
 
 
 
 
 
 
ff241cc
bbed939
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09b06d4
bbed939
09b06d4
bbed939
 
09b06d4
 
bbed939
 
2ccd029
bbed939
09b06d4
bbed939
 
09b06d4
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import gradio as gr
from transformers import pipeline
import PyPDF2
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO
import pandas as pd  # For displaying rankings in a table
import re
import math

# Load the token classification pipeline
# aggregation_strategy="first" makes the pipeline return word-level spans
# (sub-word tokens merged, label taken from each word's first sub-token),
# which is what aggregate_span() below expects.
model_name = "jjzha/jobbert_knowledge_extraction"
pipe = pipeline("token-classification", model=model_name, aggregation_strategy="first")

# Aggregate overlapping or adjacent spans into 1 entity
def aggregate_span(results):
    """Merge consecutive NER spans separated by exactly one character
    (i.e. a single space) into one multi-word entity.

    Args:
        results: list of pipeline span dicts with "word", "start" and "end"
            keys, assumed sorted by "start".

    Returns:
        A new list of span dicts. The input list and its dicts are left
        unmodified (the original mutated the caller's dicts in place).
    """
    # BUG FIX: the original indexed results[0] unconditionally and raised
    # IndexError on an empty list.
    if not results:
        return []

    merged = []
    current = dict(results[0])  # copy so the caller's dict is never mutated
    for span in results[1:]:
        if span["start"] == current["end"] + 1:
            # One character apart -> same entity split on a space; join words.
            current["word"] += " " + span["word"]
            current["end"] = span["end"]
        else:
            merged.append(current)
            current = dict(span)
    merged.append(current)
    return merged

# Extract knowledge entities from job posting
def ner(text):
    """Run knowledge extraction over *text* and return a dict in the shape
    Gradio's "highlight" output expects: {"text": ..., "entities": [...]}."""
    spans = pipe(text)
    # The pipeline emits "entity_group"; the highlight component wants an
    # "entity" key, so relabel every span as "Knowledge".
    for span in spans:
        if span.get("entity_group"):
            span["entity"] = "Knowledge"
            del span["entity_group"]
    if spans:
        spans = aggregate_span(spans)
    return {"text": text, "entities": spans}

# Extract text from input PDF
def extract_pdf(pdf_file):
    """Concatenate the extracted text of every page in *pdf_file*.

    Args:
        pdf_file: a path or file-like object accepted by PyPDF2.PdfReader.

    Returns:
        All page text joined into one string ('' if no text could be
        extracted).
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # BUG FIX: extract_text() returns None for pages without a text layer
    # (e.g. scanned images); substitute '' so the join cannot raise TypeError.
    # str.join also avoids the quadratic `text +=` loop.
    return ''.join(page.extract_text() or '' for page in reader.pages)

def rank_knowledge(entities, job_posting_text, resume_text):
    """Score each extracted knowledge term and rank terms by importance.

    A term's raw score is its whole-word frequency in the job posting plus
    one point for every (term occurrence, priority keyword occurrence) pair
    that lies within 20 characters of each other. Scores are then log-scaled
    to 0-100 relative to the best term.

    Args:
        entities: span dicts with a "word" key (output of ner()).
        job_posting_text: raw job posting text.
        resume_text: raw resume text ('' when no resume was uploaded).

    Returns:
        List of {"Term", "Score", "In Resume"} dicts, sorted by Score
        descending.
    """
    posting_lower = job_posting_text.lower()  # hoisted: reused for every term
    resume_lower = resume_text.lower()
    priority_keywords = ["must-have", "required", "preferred", "key", "important"]

    # Keyword positions do not depend on the term, so compute them once
    # instead of once per entity as the original did.
    keyword_positions = [
        m.start()
        for keyword in priority_keywords
        for m in re.finditer(rf'\b{re.escape(keyword)}\b', posting_lower)
    ]

    scores = {}
    for entity in entities:
        term = entity["word"].lower()
        term_positions = [
            m.start() for m in re.finditer(rf'\b{re.escape(term)}\b', posting_lower)
        ]
        # Frequency component: one point per exact whole-word occurrence.
        term_score = len(term_positions)
        # Proximity component: within 20 characters of a priority keyword.
        term_score += sum(
            1
            for t_pos in term_positions
            for k_pos in keyword_positions
            if abs(t_pos - k_pos) < 20
        )
        scores[term] = term_score

    # Normalize with log scaling. BUG FIX: `or 1` guards max_score == 0 —
    # possible when a term's \b-wrapped pattern never matches (e.g. "c++") —
    # which previously raised ZeroDivisionError.
    max_score = max(scores.values(), default=1) or 1
    ranked_entities = [
        {
            "Term": term,
            "Score": (math.log1p(score) / math.log1p(max_score)) * 100,
            "In Resume": "Yes" if term in resume_lower else "No",
        }
        for term, score in scores.items()
    ]

    ranked_entities.sort(key=lambda item: item["Score"], reverse=True)
    return ranked_entities
    
# Compare extracted knowledge entities with the resume
def compare_with_resume(output_knowledge, resume_file):
    """Split extracted knowledge terms into those present in the resume and
    those missing, using a case-insensitive substring match.

    Returns:
        (matched_words, unmatched_words) — two lists of entity "word" values.
    """
    resume_lower = (extract_pdf(resume_file) if resume_file else '').lower()
    matched, unmatched = [], []
    for entity in output_knowledge:
        bucket = matched if entity["word"].lower() in resume_lower else unmatched
        bucket.append(entity["word"])
    return matched, unmatched

def plot_comparison(matched_knowledge, unmatched_knowledge):
    """Render a bar chart of matched vs. unmatched knowledge counts.

    Args:
        matched_knowledge: list of terms found in the resume.
        unmatched_knowledge: list of terms missing from the resume.

    Returns:
        A PIL.Image of the rendered chart.
    """
    labels = ['Matched', 'Unmatched']
    values = [len(matched_knowledge), len(unmatched_knowledge)]
    total = sum(values)
    # BUG FIX: guard total == 0 (no entities extracted at all), which
    # previously raised ZeroDivisionError.
    percentages = [
        f"{(value / total * 100):.1f}%" if total else "0.0%" for value in values
    ]

    plt.figure(figsize=(6, 4))
    bars = plt.bar(labels, values, color=['green', 'red'])
    plt.xlabel('Knowledge Match Status')
    plt.ylabel('Count')
    plt.title('Knowledge Match Comparison')
    plt.tight_layout()

    # Add percentage labels above bars
    for bar, percentage in zip(bars, percentages):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1, percentage, ha='center', fontsize=10)

    # Render to an in-memory PNG and hand it back as a PIL image; close the
    # figure so repeated Gradio calls don't leak matplotlib figures.
    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()

    return Image.open(buf)

def plot_pie_chart(ranked_knowledge, threshold=50):
    """Pie chart of matched vs. unmatched terms whose Score exceeds *threshold*.

    Args:
        ranked_knowledge: output of rank_knowledge().
        threshold: minimum Score (exclusive) for a term to be counted.

    Returns:
        A PIL.Image of the rendered chart.
    """
    # Filter terms above the threshold
    filtered_terms = [term for term in ranked_knowledge if term["Score"] > threshold]
    matched_terms = sum(1 for term in filtered_terms if term["In Resume"] == "Yes")
    unmatched_terms = len(filtered_terms) - matched_terms

    plt.figure(figsize=(6, 4))
    if matched_terms + unmatched_terms:
        plt.pie([matched_terms, unmatched_terms], labels=['Matched', 'Unmatched'],
                autopct='%1.1f%%', colors=['green', 'red'], startangle=90)
    else:
        # BUG FIX: an all-zero pie renders NaN wedges/labels; show an
        # explanatory message instead when nothing clears the threshold.
        plt.text(0.5, 0.5, 'No terms above threshold', ha='center', va='center')
        plt.axis('off')
    plt.title(f"Terms Above Threshold (Score > {threshold})")
    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return Image.open(buf)

def ner_and_compare_with_plot_and_rank(job_posting_text, resume_file):
    """Combined function to process NER, comparison, ranking, and visualization.

    Args:
        job_posting_text: raw job posting text from the textbox.
        resume_file: uploaded resume PDF file (or None).

    Returns:
        (highlight dict, ranked DataFrame, pie chart image, bar chart image)
        — matching the order of the Gradio `outputs` list.
    """
    ner_result = ner(job_posting_text)
    # NOTE(review): compare_with_resume() re-extracts the PDF internally;
    # resume_text here feeds only the "In Resume" column of the ranking.
    resume_text = extract_pdf(resume_file) if resume_file else ''
    matched_knowledge, unmatched_knowledge = compare_with_resume(ner_result["entities"], resume_file)
    bar_plot = plot_comparison(matched_knowledge, unmatched_knowledge)

    # Ranking knowledge entities with "In Resume" column
    ranked_knowledge = rank_knowledge(ner_result["entities"], job_posting_text, resume_text)

    # Generate pie chart for a fixed threshold
    pie_chart = plot_pie_chart(ranked_knowledge, threshold=50)

    # Convert ranked knowledge to a DataFrame for better display
    ranked_df = pd.DataFrame(ranked_knowledge)

    # BUG FIX: the Gradio interface labels output slot 3 "Pie Chart for Terms
    # Above Threshold" and slot 4 "Comparison Chart", but the original
    # returned (bar_plot, pie_chart) — the two images appeared under each
    # other's labels. Return pie first to match the labels. (Also dropped the
    # unused `comparison_result` dict the original built and never returned.)
    return ner_result, ranked_df, pie_chart, bar_plot


# Gradio interface setup
# NOTE(review): the order of `outputs` must match, slot for slot, the tuple
# returned by ner_and_compare_with_plot_and_rank — verify the pie/bar image
# ordering against that function's return statement.
interface = gr.Interface(
    fn=ner_and_compare_with_plot_and_rank,
    inputs=[
        gr.Textbox(label="Enter Job Posting Text", lines=20, placeholder="Paste job posting text here..."),
        gr.File(label="Upload a PDF of your resume")
    ],
    outputs=[
        "highlight",  # Highlighted job posting text with extracted entities
        gr.DataFrame(label="Ranked Knowledge"),  # Ranked knowledge table
        gr.Image(label="Pie Chart for Terms Above Threshold"),
        gr.Image(label="Comparison Chart"),  # Bar chart visualization       
    ],
    title="Resume vs Job Posting Knowledge Match with Highlights and Rankings",
    description="Upload your resume and enter a job posting. The app will highlight key knowledge from the job posting, check if they are present in your resume, visualize the comparison, and rank knowledge terms based on importance.",
)

# Launch the Gradio app (blocks and serves the UI)
interface.launch()