import streamlit as st
import pdfplumber
import pandas as pd
import numpy as np
import torch
import nltk
import faiss
import os
import gc
import tempfile
import base64
from rank_bm25 import BM25Okapi
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import re
import io
import PyPDF2
from docx import Document
import csv
from explanation_generator import ExplanationGenerator


# Make sure the NLTK "punkt" tokenizer data used by word_tokenize/sent_tokenize is available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


# Load the embedding model and tokenizer at module level
EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")

try:
    # 4-bit NF4 quantization so the large embedding model fits in limited GPU memory
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
    global_embedding_model = AutoModel.from_pretrained(
        EMBEDDING_MODEL_NAME,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.float16
    )
    print(f"Successfully loaded {EMBEDDING_MODEL_NAME} with 4-bit quantization")
except Exception as e:
    print(f"Error loading embedding model: {str(e)}")
    global_embedding_tokenizer = None
    global_embedding_model = None
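
# Rough memory arithmetic behind the quantization choice above (illustrative; the exact
# parameter count of NV-Embed-v2 is an assumption here): a ~7B-parameter model needs
# roughly 14 GB in fp16 (2 bytes/param) but only ~3.5 GB with 4-bit weights
# (0.5 bytes/param), which is what makes loading it alongside an explanation model feasible.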


st.set_page_config(
    page_title="Resume Screener & Skill Extractor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)


with st.sidebar:
    st.title("Configuration")

    embedding_model_name = st.selectbox(
        "Embedding Model",
        ["nvidia/NV-Embed-v2"],
        index=0
    )

    explanation_model_name = st.selectbox(
        "Explanation Model",
        ["Qwen/QwQ-32B"],
        index=0
    )

    st.subheader("Ranking Weights")
    semantic_weight = st.slider("Semantic Similarity Weight", 0.0, 1.0, 0.7, 0.1)
    keyword_weight = 1.0 - semantic_weight
    st.write(f"Keyword Weight: {keyword_weight:.1f}")

    st.subheader("Advanced Options")
    top_k = st.number_input("Number of results to display", min_value=1, max_value=20, value=10, step=1)
    use_explanation = st.checkbox("Generate Explanations", value=True)
    use_faiss = st.checkbox("Use FAISS for fast search", value=True)

    st.subheader("Memory Optimization")
    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
    clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
    gc_collect_interval = st.number_input(
        "Garbage collection interval (files)",
        min_value=10,
        max_value=1000,
        value=100,
        step=10,
        help="Run garbage collection after processing this many files"
    )

    st.markdown("---")
    st.markdown("### About")
    st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")


if 'resumes_uploaded' not in st.session_state:
    st.session_state.resumes_uploaded = False
if 'job_description' not in st.session_state:
    st.session_state.job_description = ""
if 'results' not in st.session_state:
    st.session_state.results = []
if 'embedding_model' not in st.session_state:
    st.session_state.embedding_model = global_embedding_model
if 'tokenizer' not in st.session_state:
    st.session_state.tokenizer = global_embedding_tokenizer
if 'faiss_index' not in st.session_state:
    st.session_state.faiss_index = None
if 'explanation_generator' not in st.session_state:
    st.session_state.explanation_generator = None
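
# Streamlit re-executes this script from the top on every user interaction. Keeping the
# embedding model, tokenizer, explanation generator, and FAISS index in st.session_state
# lets ResumeScreener below reuse the same objects across those reruns.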


class ResumeScreener:
    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
        """Initialize the ResumeScreener with the specified embedding model"""
        self.embedding_model_name = embedding_model_name
        self.explanation_model_name = explanation_model_name

        self.model = st.session_state.embedding_model
        self.tokenizer = st.session_state.tokenizer
        self.faiss_index = None
        self.embedding_size = None
        self.explanation_generator = None

        # `use_explanation` is the module-level sidebar checkbox defined above
        if use_explanation and st.session_state.explanation_generator is None:
            with st.spinner("Initializing explanation generator..."):
                st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
            self.explanation_generator = st.session_state.explanation_generator
        elif use_explanation:
            self.explanation_generator = st.session_state.explanation_generator

    def extract_text_from_file(self, file, file_type):
        """Extract text from various file types. `file` may be a filesystem path or a file-like object."""
        try:
            if file_type == "pdf":
                with pdfplumber.open(file) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() or ""

                # Fall back to PyPDF2 if pdfplumber could not extract any text
                if not text.strip():
                    reader = PyPDF2.PdfReader(file)
                    text = ""
                    for page_num in range(len(reader.pages)):
                        page = reader.pages[page_num]
                        text += page.extract_text() or ""

                return text

            elif file_type == "docx":
                doc = Document(file)
                return " ".join([paragraph.text for paragraph in doc.paragraphs])

            elif file_type == "txt":
                # Callers pass a path (e.g. a temp file), so open it rather than calling .read() on a string
                if isinstance(file, str):
                    with open(file, "r", encoding="utf-8", errors="ignore") as f:
                        return f.read()
                return file.read().decode("utf-8")

            elif file_type == "csv":
                if isinstance(file, str):
                    with open(file, "r", encoding="utf-8", errors="ignore", newline="") as f:
                        raw = f.read()
                else:
                    raw = file.read().decode("utf-8")
                csv_text = ""
                csv_reader = csv.reader(io.StringIO(raw))
                for row in csv_reader:
                    csv_text += " ".join(row) + " "
                return csv_text

            else:
                st.error(f"Unsupported file type: {file_type}")
                return ""

        except Exception as e:
            st.error(f"Error extracting text from file: {str(e)}")
            return ""

    def get_embedding(self, text):
        """Generate an embedding vector for the given text"""
        if self.model is None:
            st.error("Embedding model not available. Please check your environment.")
            # Fall back to a zero vector; 768 is only a placeholder used before a real
            # embedding dimension has been observed
            return np.zeros(self.embedding_size or 768)

        try:
            # Truncate aggressively to keep memory usage low; long resumes are cut off here
            max_length = 256

            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
                padding=True
            )

            # Move inputs to the same device as the (possibly sharded) model
            device = next(self.model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            torch.cuda.empty_cache()

            with torch.no_grad():
                outputs = self.model(**inputs)

            if hasattr(outputs, "last_hidden_state"):
                # Mean-pool the token embeddings into a single vector
                embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
                embedding_np = embeddings.detach().cpu().numpy()

                if self.embedding_size is None:
                    self.embedding_size = embedding_np.shape[0]

                del outputs, embeddings
                torch.cuda.empty_cache()

                return embedding_np
            else:
                # Some models return the pooled embedding directly
                embedding_np = outputs.detach().cpu().numpy()

                if self.embedding_size is None:
                    self.embedding_size = embedding_np.shape[0]

                del outputs
                torch.cuda.empty_cache()

                return embedding_np
        except Exception as e:
            st.error(f"Error generating embedding: {str(e)}")
            torch.cuda.empty_cache()
            return np.zeros(self.embedding_size or 768)
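
    # Illustrative use of the pooled embeddings above (not executed anywhere; the texts are
    # hypothetical): cosine similarity between two short inputs.
    #
    #     e1 = screener.get_embedding("Python developer with NLP experience")
    #     e2 = screener.get_embedding("Looking for an NLP engineer who knows Python")
    #     cosine = np.dot(e1, e2) / (np.linalg.norm(e1) * np.linalg.norm(e2))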

    def create_faiss_index(self, embeddings):
        """Create a FAISS index for fast similarity search"""
        dimension = embeddings[0].shape[0]

        # Inner-product index; with L2-normalized vectors this is cosine similarity
        index = faiss.IndexFlatIP(dimension)

        # FAISS expects float32 input
        embeddings_normalized = np.vstack(
            [emb / np.linalg.norm(emb) for emb in embeddings]
        ).astype(np.float32)
        index.add(embeddings_normalized)

        return index

    def query_faiss_index(self, index, query_embedding, k=10):
        """Query the FAISS index with a query embedding"""
        # Normalize so the inner-product search returns cosine similarities
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        if len(query_embedding.shape) == 1:
            query_embedding = query_embedding.reshape(1, -1)

        scores, indices = index.search(query_embedding.astype(np.float32), k)

        return scores[0], indices[0]
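
    # Minimal sketch of how these two helpers fit together (illustrative; random vectors
    # stand in for resume embeddings and the dimension is arbitrary):
    #
    #     embs = [np.random.rand(512).astype(np.float32) for _ in range(100)]
    #     idx = screener.create_faiss_index(embs)
    #     scores, order = screener.query_faiss_index(idx, embs[0], k=5)
    #     # scores are cosine similarities in descending order; order holds resume indices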

    def calculate_bm25_scores(self, resume_texts, job_description):
        """Calculate BM25 scores for keyword matching"""
        job_tokens = word_tokenize(job_description.lower())

        corpus = [word_tokenize(resume.lower()) for resume in resume_texts]

        bm25 = BM25Okapi(corpus)

        scores = bm25.get_scores(job_tokens)

        return scores

    def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, job_description,
                                semantic_weight=0.7, use_faiss=True):
        """Calculate hybrid scores combining semantic similarity and BM25 keyword matching"""
        if use_faiss and len(resume_embeddings) > 10:
            # Build (or rebuild) the FAISS index; an index cached from a previous run may be stale
            if st.session_state.faiss_index is None or st.session_state.faiss_index.ntotal != len(resume_embeddings):
                index = self.create_faiss_index(resume_embeddings)
                st.session_state.faiss_index = index
            else:
                index = st.session_state.faiss_index

            faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=len(resume_embeddings))

            # Map FAISS results back to the original resume order
            semantic_scores = np.zeros(len(resume_embeddings))
            for i, idx in enumerate(faiss_indices):
                if idx < len(resume_embeddings):
                    semantic_scores[idx] = faiss_scores[i]
        else:
            # Brute-force cosine similarity for small collections
            semantic_scores = []
            for emb in resume_embeddings:
                emb_norm = emb / np.linalg.norm(emb)
                job_emb_norm = job_embedding / np.linalg.norm(job_embedding)

                similarity = np.dot(emb_norm, job_emb_norm)
                semantic_scores.append(similarity)

        bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)

        # Normalize BM25 scores to [0, 1] so they are comparable with cosine similarities
        if max(bm25_scores) > 0:
            bm25_scores = [score / max(bm25_scores) for score in bm25_scores]

        keyword_weight = 1.0 - semantic_weight
        hybrid_scores = [
            (semantic_weight * sem_score) + (keyword_weight * bm25_score)
            for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
        ]

        return hybrid_scores, semantic_scores, bm25_scores
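
    # Worked example of the weighting above: with semantic_weight = 0.7, a resume whose
    # normalized cosine similarity is 0.82 and whose normalized BM25 score is 0.45 gets
    #     0.7 * 0.82 + 0.3 * 0.45 = 0.574 + 0.135 = 0.709
    # so semantic relevance dominates unless the keyword overlap is very strong.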

    def extract_skills(self, text, job_description):
        """Extract skills from text based on the job description"""
        potential_skills = set()

        # Phrases that typically introduce a skill requirement in a job description
        skill_indicators = ["experience with", "knowledge of", "familiar with", "proficient in",
                            "skills in", "expertise in", "background in", "capabilities in",
                            "years of experience in", "understanding of", "trained in"]

        # Collect candidate skill tokens from sentences containing a skill indicator
        sentences = sent_tokenize(job_description)
        for sentence in sentences:
            sentence_lower = sentence.lower()
            for indicator in skill_indicators:
                if indicator in sentence_lower:
                    skills_part = sentence_lower.split(indicator, 1)[1]

                    words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part)
                    for word in words:
                        if len(word) >= 3:
                            potential_skills.add(word.lower())

        # Also look for explicit "skills:"/"requirements:"/"qualifications:" lists
        skill_lists = re.findall(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)', job_description.lower())
        for skill_list in skill_lists:
            words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list)
            for word in words:
                if len(word) >= 3:
                    potential_skills.add(word.lower())

        # Seed with common technology keywords mentioned in the job description
        common_tech_skills = ["python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript",
                              "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
                              "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"]

        for skill in common_tech_skills:
            if skill in job_description.lower():
                potential_skills.add(skill)

        # Keep only the candidate skills that actually appear in the resume text.
        # Lookarounds are used instead of \b so skills ending in non-word characters
        # (e.g. "c++", "c#") still match as whole tokens.
        matched_skills = []
        for skill in potential_skills:
            pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
            if re.search(pattern, text.lower()):
                matched_skills.append(skill)

        return list(set(matched_skills))
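
    # Illustrative behaviour of the lookaround pattern above:
    #     re.search(r'(?<!\w)c\+\+(?!\w)', "expert in c++ and java")   -> match
    #     re.search(r'(?<!\w)python(?!\w)', "pythonic code style")     -> no match (substring only)
    # i.e. skills are matched as whole tokens, including ones that end in "+" or "#".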

    def extract_key_phrases(self, text, job_description):
        """Extract key phrases from the resume that match job description keywords"""
        # Skills are extracted from the job description itself
        skills = self.extract_skills(job_description, job_description)

        sentences = sent_tokenize(text)
        skill_sentences = []

        for sentence in sentences:
            sentence_lower = sentence.lower()
            for skill in skills:
                if skill in sentence_lower:
                    # Bold the skill in the original sentence (case-insensitive, whole tokens only)
                    pattern = re.compile(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', re.IGNORECASE)
                    highlighted = pattern.sub(lambda m: f"**{m.group(0)}**", sentence)
                    skill_sentences.append(highlighted)
                    break

        # If few skill sentences were found, fall back to short context windows
        # around any job-description keyword that appears in the resume
        if len(skill_sentences) < 5:
            job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3)
            text_tokens = word_tokenize(text)

            matches = []
            for i, token in enumerate(text_tokens):
                if token.lower() in job_tokens:
                    # Take a window of five tokens on either side of the match
                    start = max(0, i - 5)
                    end = min(len(text_tokens), i + 6)
                    phrase = " ".join(text_tokens[start:end])
                    matches.append(phrase)

            unique_matches = list(set(matches))
            skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])

        return skill_sentences[:5]
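
    # Example of the fallback context window above (illustrative phrase): if "kubernetes"
    # is a job-description token and appears at position i in the resume, the phrase kept
    # is tokens [i-5, i+5], e.g.
    #     "deployed microservices to a managed kubernetes cluster on AWS using Helm"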

    def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
        """Generate an explanation for why a resume was ranked highly, using the QwQ-32B model when enabled"""
        if use_explanation and self.explanation_generator:
            return self.explanation_generator.generate_explanation(
                resume_text,
                job_description,
                score,
                semantic_score,
                bm25_score,
                skills
            )
        else:
            # Fallback: build a template-based explanation without the LLM
            matching_phrases = self.extract_key_phrases(resume_text, job_description)

            explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. "

            if skills:
                explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. "

            if matching_phrases:
                explanation += f"Key matching elements include: {matching_phrases[0]}"

            return explanation


def get_csv_download_link(df, filename="results.csv"):
    """Return an HTML link that downloads the results DataFrame as a CSV file"""
    # Use a distinct name so the local variable does not shadow the imported csv module
    csv_data = df.to_csv(index=False)
    b64 = base64.b64encode(csv_data.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
    return href
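
# Note: this embeds the whole CSV as a base64 data URI in the page. For large result sets,
# Streamlit's st.download_button(label, data, file_name, mime) is a lighter-weight
# alternative, assuming a reasonably recent Streamlit version.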


def get_huggingface_spaces_datasets():
    """Check for datasets available in the Hugging Face Spaces environment"""
    datasets = []

    # Common locations for mounted data in Hugging Face Spaces
    potential_paths = [
        "/data",
        "data",
        os.path.expanduser("~/data"),
    ]

    for path in potential_paths:
        if os.path.exists(path) and os.path.isdir(path):
            # Top-level CSV files
            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
            for csv_file in csv_files:
                datasets.append(os.path.join(path, csv_file))

            # Subdirectories containing PDF resumes
            for subdir in os.listdir(path):
                subdir_path = os.path.join(path, subdir)
                if os.path.isdir(subdir_path):
                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
                    if pdf_count > 0:
                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))

    return datasets


st.title("Resume Screener & Skill Extractor")
st.markdown("---")

screener = ResumeScreener(embedding_model_name, explanation_model_name)

st.header("1. Enter Job Description")
job_description = st.text_area(
    "Paste the job description or requirements here:",
    height=200,
    help="Enter the complete job description or a list of required skills and qualifications."
)

st.header("2. Upload Resumes")
upload_option = st.radio(
    "Choose upload method:",
    ["Upload Files", "Upload from Dataset", "Process Directory"]
)

uploaded_files = []
resume_texts = []
file_names = []

if upload_option == "Upload Files":
    uploaded_files = st.file_uploader(
        "Upload resume files",
        type=["pdf", "docx", "txt", "csv"],
        accept_multiple_files=True,
        help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format."
    )

    if uploaded_files:
        with st.spinner("Processing resumes..."):
            for file in uploaded_files:
                file_type = file.name.split('.')[-1].lower()

                with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
                    tmp_file.write(file.getvalue())
                    tmp_path = tmp_file.name

                text = screener.extract_text_from_file(tmp_path, file_type)
                if text:
                    resume_texts.append(text)
                    file_names.append(file.name)

                os.unlink(tmp_path)

            st.session_state.resumes_uploaded = True
            st.success(f"Successfully processed {len(resume_texts)} resumes.")

elif upload_option == "Process Directory":
    st.write("Process resume files from a directory on the server.")

    resume_dir = st.text_input(
        "Enter the path to the directory containing resume files:",
        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
    )

    batch_size = st.number_input(
        "Number of files to process per batch (lower for less memory usage):",
        min_value=10,
        max_value=1000,
        value=100,
        step=10
    )

    file_types = st.multiselect(
        "Select file types to process:",
        ["pdf", "docx", "txt", "csv"],
        default=["pdf"]
    )

    if resume_dir and st.button("Process Directory"):
        if os.path.isdir(resume_dir):
            all_files = []
            for file_type in file_types:
                all_files.extend([
                    os.path.join(resume_dir, f)
                    for f in os.listdir(resume_dir)
                    if f.lower().endswith(f'.{file_type}')
                ])

            if all_files:
                total_files = len(all_files)
                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")

                processed_count = 0
                progress_bar = st.progress(0)
                status_text = st.empty()

                for i in range(0, total_files, batch_size):
                    batch_files = all_files[i:i+batch_size]

                    for file_path in batch_files:
                        try:
                            file_type = file_path.split('.')[-1].lower()
                            text = screener.extract_text_from_file(file_path, file_type)
                            if text:
                                resume_texts.append(text)
                                file_names.append(os.path.basename(file_path))
                                processed_count += 1

                            # Use the overall count (not the within-batch index) so the
                            # configured GC interval is honored across batches
                            if memory_optimization and processed_count > 0 and processed_count % gc_collect_interval == 0:
                                gc.collect()
                                status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
                        except Exception as e:
                            st.warning(f"Error processing {file_path}: {str(e)}")

                    progress = min(1.0, (i + len(batch_files)) / total_files)
                    progress_bar.progress(progress)
                    status_text.text(f"Processed {processed_count}/{total_files} files...")

                    if memory_optimization:
                        gc.collect()

                if memory_optimization:
                    gc.collect()

                st.session_state.resumes_uploaded = True
                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
            else:
                st.error(f"No matching files found in {resume_dir}")
        else:
            st.error(f"Directory {resume_dir} does not exist or is not accessible.")

elif upload_option == "Upload from Dataset":
    st.write("Upload a CSV file containing resume data or load from available datasets.")

    hf_datasets = get_huggingface_spaces_datasets()

    if hf_datasets:
        st.subheader("Available Datasets in Hugging Face Spaces")
        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
        selected_dataset = st.selectbox("Select a dataset:", dataset_options)

        if selected_dataset != "None":
            selected_index = dataset_options.index(selected_dataset) - 1
            dataset_path = hf_datasets[selected_index]

            if isinstance(dataset_path, tuple):
                # Directory of PDF resumes
                pdf_dir = dataset_path[0]
                st.write(f"Selected PDF directory: {pdf_dir}")

                batch_size = st.number_input(
                    "Number of files to process per batch:",
                    min_value=10,
                    max_value=1000,
                    value=100,
                    step=10
                )

                if st.button("Process PDF Directory"):
                    if os.path.isdir(pdf_dir):
                        all_files = [
                            os.path.join(pdf_dir, f)
                            for f in os.listdir(pdf_dir)
                            if f.lower().endswith('.pdf')
                        ]

                        if all_files:
                            total_files = len(all_files)
                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")

                            processed_count = 0
                            progress_bar = st.progress(0)
                            status_text = st.empty()

                            for i in range(0, total_files, batch_size):
                                batch_files = all_files[i:i+batch_size]

                                for file_path in batch_files:
                                    try:
                                        text = screener.extract_text_from_file(file_path, "pdf")
                                        if text:
                                            resume_texts.append(text)
                                            file_names.append(os.path.basename(file_path))
                                            processed_count += 1

                                        # GC on the overall count so the interval applies across batches
                                        if memory_optimization and processed_count > 0 and processed_count % gc_collect_interval == 0:
                                            gc.collect()
                                    except Exception as e:
                                        st.warning(f"Error processing {file_path}: {str(e)}")

                                progress = min(1.0, (i + len(batch_files)) / total_files)
                                progress_bar.progress(progress)
                                status_text.text(f"Processed {processed_count}/{total_files} files...")

                                if memory_optimization:
                                    gc.collect()

                            st.session_state.resumes_uploaded = True
                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
            else:
                # CSV dataset found in the Spaces data directory
                st.write(f"Selected CSV dataset: {dataset_path}")

                try:
                    df = pd.read_csv(dataset_path)

                    text_column = st.selectbox(
                        "Select column containing resume text:",
                        df.columns.tolist()
                    )

                    if st.button("Process Selected CSV"):
                        for i, row in df.iterrows():
                            text = str(row[text_column])
                            if text and not pd.isna(text):
                                resume_texts.append(text)

                                file_name = f"resume_{i}.txt"
                                if 'filename' in df.columns:
                                    file_name = row['filename']
                                file_names.append(file_name)

                        st.session_state.resumes_uploaded = True
                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
                except Exception as e:
                    st.error(f"Error processing CSV: {str(e)}")

    dataset_option = st.radio(
        "Dataset source:",
        ["Upload CSV", "Use Hugging Face Dataset"]
    )

    if dataset_option == "Upload CSV":
        csv_file = st.file_uploader(
            "Upload CSV file containing resume data",
            type=["csv"],
            help="CSV should contain at least a column with resume text."
        )

        if csv_file:
            with st.spinner("Processing CSV data..."):
                df = pd.read_csv(csv_file)

                text_column = st.selectbox(
                    "Select column containing resume text:",
                    df.columns.tolist()
                )

                if st.button("Process Dataset"):
                    for i, row in df.iterrows():
                        text = str(row[text_column])
                        if text and not pd.isna(text):
                            resume_texts.append(text)

                            file_name = f"resume_{i}.txt"
                            if 'filename' in df.columns:
                                file_name = row['filename']
                            file_names.append(file_name)

                    st.session_state.resumes_uploaded = True
                    st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
    else:
        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")

        if dataset_name and st.button("Load Dataset"):
            with st.spinner(f"Loading dataset {dataset_name}..."):
                try:
                    from datasets import load_dataset

                    dataset = load_dataset(dataset_name, split=split)

                    st.write(f"Dataset loaded with {len(dataset)} entries.")

                    if len(dataset.column_names) > 0:
                        text_column = st.selectbox(
                            "Select column containing resume text:",
                            dataset.column_names
                        )

                        if st.button("Process Hugging Face Dataset"):
                            for i, item in enumerate(dataset):
                                if text_column in item:
                                    text = str(item[text_column])
                                    if text:
                                        resume_texts.append(text)

                                        file_name = f"resume_{i}.txt"
                                        if 'id' in item:
                                            file_name = f"resume_{item['id']}.txt"
                                        file_names.append(file_name)

                            st.session_state.resumes_uploaded = True
                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
                except Exception as e:
                    st.error(f"Error loading dataset: {str(e)}")
                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")


if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
    with st.spinner("Processing job description and resumes..."):
        # Embed the job description once
        job_embedding = screener.get_embedding(job_description)

        # Embed the resumes in small batches to keep GPU memory usage bounded
        resume_embeddings = []
        batch_size = 10
        progress_bar = st.progress(0)
        status_text = st.empty()

        for i in range(0, len(resume_texts), batch_size):
            batch = resume_texts[i:i+batch_size]
            status_text.text(f"Processing resumes {i+1}-{min(i+batch_size, len(resume_texts))} of {len(resume_texts)}...")

            batch_embeddings = []
            for j, text in enumerate(batch):
                embedding = screener.get_embedding(text)
                batch_embeddings.append(embedding)

                progress = (i + j + 1) / len(resume_texts)
                progress_bar.progress(progress)

            resume_embeddings.extend(batch_embeddings)

            # Free memory between batches
            gc.collect()
            torch.cuda.empty_cache()

        status_text.text("Calculating similarity scores...")

        hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
            resume_texts,
            resume_embeddings,
            job_embedding,
            job_description,
            semantic_weight,
            use_faiss
        )

        # Sort resumes by hybrid score and keep the top K
        combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
        sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
        top_candidates = sorted_data[:int(top_k)]

        results = []
        status_text.text("Generating explanations...")

        for idx, (name, text, score, semantic_score, bm25_score) in enumerate(top_candidates):
            skills = screener.extract_skills(text, job_description)

            result = {
                "filename": name,
                "score": score,
                "semantic_score": semantic_score,
                "keyword_score": bm25_score,
                "text_preview": text[:500] + "...",
                "matched_phrases": screener.extract_key_phrases(text, job_description),
                "skills": skills
            }

            if use_explanation:
                progress_bar.progress((idx + 1) / len(top_candidates))
                status_text.text(f"Generating explanation for candidate {idx+1}/{len(top_candidates)}...")

                explanation = screener.generate_explanation(
                    text,
                    job_description,
                    score,
                    semantic_score,
                    bm25_score,
                    skills
                )
                result["explanation"] = explanation

                torch.cuda.empty_cache()
            else:
                result["explanation"] = ""

            results.append(result)

        st.session_state.results = results
        st.success(f"Found top {len(results)} candidates!")


if st.session_state.results:
    st.header("3. Results")

    df_data = []
    for result in st.session_state.results:
        df_data.append({
            "Filename": result["filename"],
            "Score": result["score"],
            "Semantic Score": result["semantic_score"],
            "Keyword Score": result["keyword_score"],
            "Skills": ", ".join(result["skills"]),
            "Explanation": result["explanation"]
        })

    results_df = pd.DataFrame(df_data)

    st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True)

    for i, result in enumerate(st.session_state.results):
        with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"):
            col1, col2 = st.columns([1, 1])

            with col1:
                st.subheader("Scores")
                st.write(f"Total Score: {result['score']:.4f}")
                st.write(f"Semantic Score: {result['semantic_score']:.4f}")
                st.write(f"Keyword Score: {result['keyword_score']:.4f}")

                st.subheader("Matched Skills")
                if result["skills"]:
                    for skill in result["skills"]:
                        st.write(f"• {skill}")
                else:
                    st.write("No specific skills matched.")

            with col2:
                st.subheader("Explanation")
                st.write(result["explanation"])

                st.subheader("Key Matches")
                for phrase in result["matched_phrases"]:
                    st.markdown(f"• {phrase}")

            st.subheader("Resume Preview")
            # Explicit key avoids duplicate-widget IDs if two previews happen to be identical
            st.text_area("", result["text_preview"], height=150, disabled=True, key=f"resume_preview_{i}")

    st.subheader("Score Comparison")

    chart_data = pd.DataFrame({
        "Resume": [result["filename"] for result in st.session_state.results],
        "Semantic Score": [result["semantic_score"] for result in st.session_state.results],
        "Keyword Score": [result["keyword_score"] for result in st.session_state.results],
        "Total Score": [result["score"] for result in st.session_state.results]
    })

    st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]])


st.markdown("---")
st.markdown("Built with Streamlit and Hugging Face models (NV-Embed-v2 and QwQ-32B)")