Spaces:

IAT360Group29
/

final-project

Sleeping

App Files Files Community

kylezhao101 committed on Dec 8, 2024

Commit

bbed939

1 Parent(s): 2293223

Implement term ranking and plot comparison

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +168 -63
requirements.txt +4 -1

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🏢
 colorFrom: gray
 colorTo: indigo
 sdk: gradio
-sdk_version: 3.32.0
 app_file: app.py
 pinned: false
 short_description: final project for IAT360

 colorFrom: gray
 colorTo: indigo
 sdk: gradio
+sdk_version: 5.8.0
 app_file: app.py
 pinned: false
 short_description: final project for IAT360

app.py CHANGED Viewed

@@ -1,77 +1,182 @@
 import gradio as gr
 from transformers import pipeline
 import PyPDF2
 # Load the token classification pipeline
 model_name = "jjzha/jobbert_knowledge_extraction"
-pipe = pipeline("token-classification", model=model_name)
-# Function to extract and highlight key skills/words from the job posting
-def extract_keywords_with_highlights(job_posting_text):
-    results = pipe(job_posting_text)
-    # Fix the `##` issue by reconstructing full words
-    reconstructed_text = ""
-    highlighted_words = set()
-    previous_end = 0
-    for result in results:
-        start, end, word = result['start'], result['end'], result['word']
-        # Remove `##` for subwords
-        clean_word = word.replace("##", "")
-        highlighted_words.add(clean_word.lower())
-        # Add text before the current word
-        reconstructed_text += job_posting_text[previous_end:start]
-        # Highlight the cleaned word
-        reconstructed_text += (
-            f'<span style="background-color:yellow; font-weight:bold;" '
-            f'title="Entity: {result["entity"]} (Score: {result["score"]:.2f})">'
-            f"{clean_word}</span>"
-        )
-        previous_end = end
-    # Add the remaining text
-    reconstructed_text += job_posting_text[previous_end:]
-    # Replace newline characters with <br> to preserve line breaks
-    reconstructed_text = reconstructed_text.replace("\n", "<br>")
-    return (
-        f'<div style="font-family:Arial, sans-serif; line-height:1.5;">{reconstructed_text}</div>',
-        highlighted_words,
-    )
-# Function to check if highlighted words are in the resume
-def check_keywords_in_resume(resume_file_path, job_posting_text):
-    # Extract text from the uploaded PDF resume
-    with open(resume_file_path, "rb") as file:
-        pdf_reader = PyPDF2.PdfReader(file)
-        resume_text = " ".join(page.extract_text() for page in pdf_reader.pages)
-    # Extract highlighted keywords from the job posting
-    highlighted_html, highlighted_words = extract_keywords_with_highlights(job_posting_text)
-    # Check if each highlighted word is in the resume
-    resume_words = set(resume_text.lower().split())
-    matched_words = highlighted_words.intersection(resume_words)
-    missing_words = highlighted_words - matched_words
-    # Prepare a summary
-    matched_summary = f"Matched Keywords: {', '.join(matched_words)}"
-    missing_summary = f"Missing Keywords: {', '.join(missing_words)}"
-    return highlighted_html, matched_summary, missing_summary
-# Set up Gradio interface
 interface = gr.Interface(
-    fn=check_keywords_in_resume,
     inputs=[
-        gr.File(label="Upload Resume PDF", type="filepath"),
-        gr.Textbox(label="Enter Job Posting Text", lines=30, placeholder="Paste job posting text here..."),
     ],
     outputs=[
-        gr.HTML(label="Highlighted Key Skills/Words in Job Posting"),
-        gr.Textbox(label="Matched Keywords"),
-        gr.Textbox(label="Missing Keywords"),
     ],
-    title="Resume vs Job Posting Skill Match with Highlights",
-    description="Upload your resume and enter a job posting. The app will highlight key skills from the job posting and check if they are present in your resume.",
 )
 # Launch the Gradio app

 import gradio as gr
 from transformers import pipeline
 import PyPDF2
+from PIL import Image
+import matplotlib.pyplot as plt
+from io import BytesIO
+import pandas as pd  # For displaying rankings in a table
+import re
+import math
 # Load the token classification pipeline
 model_name = "jjzha/jobbert_knowledge_extraction"
+pipe = pipeline("token-classification", model=model_name, aggregation_strategy="first")
+# Aggregate overlapping or adjacent spans into 1 entity
def aggregate_span(results):
    """Merge entities that are separated by exactly one character.

    Two consecutive entities are treated as one multi-word entity when the
    second one starts one character after the first one ends (i.e. they are
    separated by a single space in the source text). The merged entity keeps
    every field of the first span, with ``word`` extended by a space plus the
    next word and ``end`` moved to the end of the last merged span.

    Args:
        results: List of entity dicts, each with at least ``"word"``,
            ``"start"`` and ``"end"`` keys, ordered by position.

    Returns:
        A new list of merged entity dicts. The input list and its dicts are
        not modified. An empty input yields an empty list.
    """
    merged = []
    for result in results:
        if merged and result["start"] == merged[-1]["end"] + 1:
            # Directly follows the previous span: extend it.
            merged[-1]["word"] += " " + result["word"]
            merged[-1]["end"] = result["end"]
        else:
            # Copy so that extending a span never mutates the caller's data.
            merged.append(dict(result))
    return merged
+# Extract knowledge entities from job posting
def ner(text):
    """Run the knowledge-extraction pipeline over ``text``.

    Every entity that carries a truthy ``"entity_group"`` is relabelled with
    ``"entity": "Knowledge"`` (the ``"entity_group"`` key is dropped), then
    neighbouring spans are merged with ``aggregate_span``.

    Returns:
        ``{"text": text, "entities": [...]}``.
    """
    entities = pipe(text)
    for item in entities:
        if not item.get("entity_group"):
            continue
        item["entity"] = "Knowledge"
        del item["entity_group"]
    if entities:
        entities = aggregate_span(entities)
    return {"text": text, "entities": entities}
+# Extract text from input PDF
def extract_pdf(pdf_file):
    """Return the text of every page of ``pdf_file`` concatenated in order.

    ``pdf_file`` is passed straight to ``PyPDF2.PdfReader``. Page texts are
    joined with no separator between pages.
    """
    pages = PyPDF2.PdfReader(pdf_file).pages
    return ''.join(page.extract_text() for page in pages)
def rank_knowledge(entities, job_posting_text, resume_text):
    """Rank extracted terms by how prominent they are in the job posting.

    A term's raw score is the number of whole-term occurrences in the posting
    plus one point for every (occurrence, priority keyword) pair whose start
    offsets are fewer than 20 characters apart. Raw scores are log-scaled and
    normalised so that the highest-scoring term gets 100.

    Args:
        entities: Iterable of entity dicts with a ``"word"`` key.
        job_posting_text: Full job posting text.
        resume_text: Resume text; may be an empty string.

    Returns:
        A list of ``{"Term", "Score", "In Resume"}`` dicts sorted by
        descending score. ``"In Resume"`` is ``"Yes"`` when the lower-cased
        term is a substring of the lower-cased resume, otherwise ``"No"``.
        If no term scores above zero, every score is ``0.0``.
    """
    priority_keywords = ["must-have", "required", "preferred", "key", "important"]
    proximity_window = 20  # maximum distance, in characters

    # Lower-case once instead of once per regex call.
    posting_lower = job_posting_text.lower()
    resume_lower = resume_text.lower()

    def find_positions(phrase):
        # Lookarounds rather than \b: \b needs a word character at the edge,
        # so terms such as "c++" or "c#" would never match.
        pattern = rf'(?<!\w){re.escape(phrase)}(?!\w)'
        return [m.start() for m in re.finditer(pattern, posting_lower)]

    # Keyword offsets do not depend on the term, so collect them once.
    keyword_positions = [
        pos
        for keyword in priority_keywords
        for pos in find_positions(keyword)
    ]

    scores = {}
    for entity in entities:
        term = entity["word"].lower()
        if not term or term in scores:
            # Blank terms carry no information; repeated terms would score
            # exactly the same again.
            continue
        term_positions = find_positions(term)
        nearby = sum(
            1
            for t_pos in term_positions
            for k_pos in keyword_positions
            if abs(t_pos - k_pos) < proximity_window
        )
        scores[term] = len(term_positions) + nearby

    # log1p(0) == 0, so guard the normalisation against an all-zero ranking.
    denominator = math.log1p(max(scores.values(), default=0))
    ranked_entities = [
        {
            "Term": term,
            "Score": (math.log1p(score) / denominator) * 100 if denominator else 0.0,
            "In Resume": "Yes" if term in resume_lower else "No",
        }
        for term, score in scores.items()
    ]
    ranked_entities.sort(key=lambda x: x["Score"], reverse=True)
    return ranked_entities
+# Compare extracted knowledge entities with the resume
def compare_with_resume(output_knowledge, resume_file):
    """Split extracted entities by whether their word appears in the resume.

    The resume text is read with ``extract_pdf`` when ``resume_file`` is
    truthy; otherwise it is treated as empty. Matching is a case-insensitive
    substring test.

    Returns:
        ``(matched_words, unmatched_words)``, each in entity order and with
        the original casing of each word.
    """
    haystack = (extract_pdf(resume_file) if resume_file else '').lower()
    found, missing = [], []
    for item in output_knowledge:
        word = item["word"]
        bucket = found if word.lower() in haystack else missing
        bucket.append(word)
    return found, missing
def plot_comparison(matched_knowledge, unmatched_knowledge):
    """Render a bar chart of matched versus unmatched knowledge counts.

    Each bar is annotated with its share of the total. When both lists are
    empty the shares are shown as ``0.0%`` instead of dividing by zero.

    Args:
        matched_knowledge: Terms found in the resume.
        unmatched_knowledge: Terms not found in the resume.

    Returns:
        A ``PIL.Image`` opened from the PNG rendering of the chart.
    """
    labels = ['Matched', 'Unmatched']
    values = [len(matched_knowledge), len(unmatched_knowledge)]
    total = sum(values)
    percentages = [
        f"{(value / total * 100):.1f}%" if total else "0.0%"
        for value in values
    ]

    # Draw on an explicit figure and always close it, so a failure while
    # rendering cannot leave an open figure behind.
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        bars = ax.bar(labels, values, color=['green', 'red'])
        ax.set_xlabel('Knowledge Match Status')
        ax.set_ylabel('Count')
        ax.set_title('Knowledge Match Comparison')
        fig.tight_layout()
        # Add percentage labels above bars
        for bar, percentage in zip(bars, percentages):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.1,
                percentage,
                ha='center',
                fontsize=10,
            )
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def plot_pie_chart(ranked_knowledge, threshold=50):
    """Render a pie chart of resume coverage for high-scoring terms.

    Only terms whose ``"Score"`` is at least ``threshold`` are counted. They
    are split by their ``"In Resume"`` value (``"Yes"`` counts as matched).
    When no term reaches the threshold a placeholder message is drawn,
    because a pie whose values are all zero cannot be normalised.

    Args:
        ranked_knowledge: List of dicts with ``"Score"`` and ``"In Resume"``
            keys, as produced by ``rank_knowledge``.
        threshold: Minimum score for a term to be included.

    Returns:
        A ``PIL.Image`` opened from the PNG rendering of the chart.
    """
    # Filter terms above the threshold
    filtered_terms = [term for term in ranked_knowledge if term["Score"] >= threshold]
    matched_terms = sum(1 for term in filtered_terms if term["In Resume"] == "Yes")
    unmatched_terms = len(filtered_terms) - matched_terms

    labels = ['Matched', 'Unmatched']
    values = [matched_terms, unmatched_terms]

    # Draw on an explicit figure and always close it, so a failure while
    # rendering cannot leave an open figure behind.
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        if filtered_terms:
            ax.pie(values, labels=labels, autopct='%1.1f%%', colors=['green', 'red'], startangle=90)
        else:
            ax.text(
                0.5,
                0.5,
                "No terms at or above the threshold",
                ha='center',
                va='center',
                transform=ax.transAxes,
            )
            ax.axis('off')
        ax.set_title(f"Terms Above Threshold (Score >= {threshold})")
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def ner_and_compare_with_plot_and_rank(job_posting_text, resume_file):
    """Run NER, resume comparison, ranking and chart generation in one call.

    Args:
        job_posting_text: Job posting text to extract knowledge terms from.
        resume_file: Resume PDF passed on to ``extract_pdf``; a falsy value
            means no resume was supplied and the resume text is empty.

    Returns:
        A 4-tuple, in this order:
        ``(ner_result, ranked_df, bar_plot, pie_chart)`` where ``ner_result``
        is the ``{"text", "entities"}`` dict from ``ner``, ``ranked_df`` is a
        ``pandas.DataFrame`` of the ranked terms, ``bar_plot`` is the
        matched/unmatched bar chart and ``pie_chart`` covers terms scoring at
        least 50.
    """
    ner_result = ner(job_posting_text)
    entities = ner_result["entities"]
    resume_text = extract_pdf(resume_file) if resume_file else ''

    # NOTE: compare_with_resume reads the PDF again from resume_file.
    matched_knowledge, unmatched_knowledge = compare_with_resume(entities, resume_file)
    bar_plot = plot_comparison(matched_knowledge, unmatched_knowledge)

    # Ranking knowledge entities with "In Resume" column
    ranked_knowledge = rank_knowledge(entities, job_posting_text, resume_text)

    # Generate pie chart for a fixed threshold
    pie_chart = plot_pie_chart(ranked_knowledge, threshold=50)

    # Convert ranked knowledge to a DataFrame for better display
    ranked_df = pd.DataFrame(ranked_knowledge)
    return ner_result, ranked_df, bar_plot, pie_chart
+# Gradio interface setup
 interface = gr.Interface(
+    fn=ner_and_compare_with_plot_and_rank,
     inputs=[
+        gr.Textbox(label="Enter Job Posting Text", lines=20, placeholder="Paste job posting text here..."),
+        gr.File(label="Upload a PDF of your resume")
     ],
     outputs=[
+        "highlight",  # Highlighted job posting text with extracted entities
+        gr.DataFrame(label="Ranked Knowledge"),  # Ranked knowledge table
+        gr.Image(label="Pie Chart for Terms Above Threshold")
+        gr.Image(label="Comparison Chart"),  # Bar chart visualization
     ],
+    title="Resume vs Job Posting Knowledge Match with Highlights and Rankings",
+    description="Upload your resume and enter a job posting. The app will highlight key knowledge from the job posting, check if they are present in your resume, visualize the comparison, and rank knowledge terms based on importance.",
 )
 # Launch the Gradio app

requirements.txt CHANGED Viewed

@@ -2,4 +2,7 @@ torch
 transformers
 gradio  # if using Gradio
 streamlit  # if using Streamlit
-pyPDF2

 transformers
 gradio  # if using Gradio
 streamlit  # if using Streamlit
+pyPDF2
+matplotlib
+Pillow
+pandas