root
ss
d57693d
raw
history blame
42.6 kB
import streamlit as st
import pdfplumber
import pandas as pd
import numpy as np
import torch
import nltk
import faiss
import os
import tempfile
import base64
from rank_bm25 import BM25Okapi
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import re
import io
import PyPDF2
from docx import Document
import csv
from explanation_generator import ExplanationGenerator
# Download NLTK resources
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
# Initialize embedding model at startup
EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
try:
# Configure 4-bit quantization for better memory efficiency
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True
)
# Load embedding model and tokenizer with 4-bit quantization
global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
global_embedding_model = AutoModel.from_pretrained(
EMBEDDING_MODEL_NAME,
trust_remote_code=True,
device_map="auto",
quantization_config=quantization_config,
torch_dtype=torch.float16
)
print(f"Successfully loaded {EMBEDDING_MODEL_NAME} with 4-bit quantization")
except Exception as e:
print(f"Error loading embedding model: {str(e)}")
global_embedding_tokenizer = None
global_embedding_model = None
# Set page configuration
st.set_page_config(
page_title="Resume Screener & Skill Extractor",
page_icon="📄",
layout="wide",
initial_sidebar_state="expanded"
)
# Sidebar for model selection and weights
with st.sidebar:
st.title("Configuration")
# Model selection
embedding_model_name = st.selectbox(
"Embedding Model",
["nvidia/NV-Embed-v2"],
index=0
)
explanation_model_name = st.selectbox(
"Explanation Model",
["Qwen/QwQ-32B"],
index=0
)
# Ranking weights
st.subheader("Ranking Weights")
semantic_weight = st.slider("Semantic Similarity Weight", 0.0, 1.0, 0.7, 0.1)
keyword_weight = 1.0 - semantic_weight
st.write(f"Keyword Weight: {keyword_weight:.1f}")
# Advanced options
st.subheader("Advanced Options")
top_k = st.number_input("Number of results to display", min_value=1, max_value=20, value=10, step=1)
use_explanation = st.checkbox("Generate Explanations", value=True)
use_faiss = st.checkbox("Use FAISS for fast search", value=True)
# Memory optimization options
st.subheader("Memory Optimization")
memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
gc_collect_interval = st.number_input(
"Garbage collection interval (files)",
min_value=10,
max_value=1000,
value=100,
step=10,
help="Run garbage collection after processing this many files"
)
st.markdown("---")
st.markdown("### About")
st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
# Initialize session state variables
if 'resumes_uploaded' not in st.session_state:
st.session_state.resumes_uploaded = False
if 'job_description' not in st.session_state:
st.session_state.job_description = ""
if 'results' not in st.session_state:
st.session_state.results = []
if 'embedding_model' not in st.session_state:
st.session_state.embedding_model = global_embedding_model
if 'tokenizer' not in st.session_state:
st.session_state.tokenizer = global_embedding_tokenizer
if 'faiss_index' not in st.session_state:
st.session_state.faiss_index = None
if 'explanation_generator' not in st.session_state:
st.session_state.explanation_generator = None
class ResumeScreener:
def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
"""Initialize the ResumeScreener with the specified embedding model"""
self.embedding_model_name = embedding_model_name
self.explanation_model_name = explanation_model_name
# Initialize with preloaded models
self.model = st.session_state.embedding_model
self.tokenizer = st.session_state.tokenizer
self.faiss_index = None
self.embedding_size = None
self.explanation_generator = None
# Initialize explanation generator
if use_explanation and st.session_state.explanation_generator is None:
with st.spinner("Initializing explanation generator..."):
st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
self.explanation_generator = st.session_state.explanation_generator
elif use_explanation:
self.explanation_generator = st.session_state.explanation_generator
def extract_text_from_file(self, file, file_type):
"""Extract text from various file types"""
try:
if file_type == "pdf":
# Use pdfplumber for better text extraction
with pdfplumber.open(file) as pdf:
text = ""
for page in pdf.pages:
text += page.extract_text() or ""
# If pdfplumber fails, try PyPDF2 as fallback
if not text.strip():
reader = PyPDF2.PdfReader(file)
text = ""
for page_num in range(len(reader.pages)):
page = reader.pages[page_num]
text += page.extract_text() or ""
return text
elif file_type == "docx":
doc = Document(file)
return " ".join([paragraph.text for paragraph in doc.paragraphs])
elif file_type == "txt":
return file.read().decode("utf-8")
elif file_type == "csv":
csv_text = ""
csv_reader = csv.reader(io.StringIO(file.read().decode("utf-8")))
for row in csv_reader:
csv_text += " ".join(row) + " "
return csv_text
else:
st.error(f"Unsupported file type: {file_type}")
return ""
except Exception as e:
st.error(f"Error extracting text from file: {str(e)}")
return ""
def get_embedding(self, text):
"""Generate text embedding for a given text"""
if self.model is None:
st.error("Embedding model not available. Please check your environment.")
return np.zeros(768) # Default embedding size as fallback
try:
# For long texts, split into smaller chunks to avoid OOM
max_length = 256 # Reduced from default 512 to save memory
# Truncate text and tokenize
inputs = self.tokenizer(
text,
return_tensors="pt",
truncation=True,
max_length=max_length,
padding=True
)
# Move inputs to same device as model
device = next(self.model.parameters()).device
inputs = {k: v.to(device) for k, v in inputs.items()}
# Free up memory before inference
torch.cuda.empty_cache()
with torch.no_grad():
outputs = self.model(**inputs)
# Use [CLS] token embedding or mean pooling based on model architecture
if hasattr(outputs, "last_hidden_state"):
# Mean pooling across token dimension
embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
embedding_np = embeddings.detach().cpu().numpy()
# Set embedding size if not set
if self.embedding_size is None:
self.embedding_size = embedding_np.shape[0]
# Clear cache after getting embedding
del outputs, embeddings
torch.cuda.empty_cache()
return embedding_np
else:
# For models that return a specific embedding
embedding_np = outputs.detach().cpu().numpy()
# Set embedding size if not set
if self.embedding_size is None:
self.embedding_size = embedding_np.shape[0]
# Clear cache after getting embedding
del outputs
torch.cuda.empty_cache()
return embedding_np
except Exception as e:
st.error(f"Error generating embedding: {str(e)}")
torch.cuda.empty_cache() # Try to recover memory
return np.zeros(768) # Default embedding size as fallback
def create_faiss_index(self, embeddings):
"""Create a FAISS index for fast similarity search"""
# Get the dimension of the embeddings
dimension = embeddings[0].shape[0]
# Create a FAISS index
index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity with normalized vectors
# Add normalized vectors to the index
embeddings_normalized = np.vstack([emb / np.linalg.norm(emb) for emb in embeddings])
index.add(embeddings_normalized)
return index
def query_faiss_index(self, index, query_embedding, k=10):
"""Query the FAISS index with a query embedding"""
# Normalize query embedding
query_embedding = query_embedding / np.linalg.norm(query_embedding)
# Reshape to a row vector if needed
if len(query_embedding.shape) == 1:
query_embedding = query_embedding.reshape(1, -1)
# Query the index
scores, indices = index.search(query_embedding, k)
return scores[0], indices[0] # Return the scores and indices as flat arrays
def calculate_bm25_scores(self, resume_texts, job_description):
"""Calculate BM25 scores for keyword matching"""
# Tokenize job description
job_tokens = word_tokenize(job_description.lower())
# Prepare corpus from resumes
corpus = [word_tokenize(resume.lower()) for resume in resume_texts]
# Initialize BM25
bm25 = BM25Okapi(corpus)
# Calculate scores
scores = bm25.get_scores(job_tokens)
return scores
def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True):
"""Calculate hybrid scores combining semantic similarity and BM25"""
# Calculate semantic similarity scores (cosine similarity)
if use_faiss and len(resume_embeddings) > 10:
# Create FAISS index if not already created
if st.session_state.faiss_index is None:
index = self.create_faiss_index(resume_embeddings)
st.session_state.faiss_index = index
else:
index = st.session_state.faiss_index
# Query index with job embedding
faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=len(resume_embeddings))
# Create full semantic scores array
semantic_scores = np.zeros(len(resume_embeddings))
for i, idx in enumerate(faiss_indices):
if idx < len(resume_embeddings):
semantic_scores[idx] = faiss_scores[i]
else:
# Direct cosine similarity calculation for smaller datasets
semantic_scores = []
for emb in resume_embeddings:
# Normalize the embeddings for cosine similarity
emb_norm = emb / np.linalg.norm(emb)
job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
# Calculate cosine similarity
similarity = np.dot(emb_norm, job_emb_norm)
semantic_scores.append(similarity)
# Calculate BM25 scores
bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
# Normalize BM25 scores
if max(bm25_scores) > 0:
bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
# Calculate hybrid scores
keyword_weight = 1.0 - semantic_weight
hybrid_scores = [
(semantic_weight * sem_score) + (keyword_weight * bm25_score)
for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
]
return hybrid_scores, semantic_scores, bm25_scores
def extract_skills(self, text, job_description):
"""Extract skills from text based on job description"""
# Simple skill extraction using regex and job description keywords
# In a real implementation, this could be enhanced with ML-based skill extraction
# Extract potential skills from job description (words 3 letters or longer)
potential_skills = set()
# Common skill-related phrases that might appear in job descriptions
skill_indicators = ["experience with", "knowledge of", "familiar with", "proficient in",
"skills in", "expertise in", "background in", "capabilities in",
"years of experience in", "understanding of", "trained in"]
# Extract skills from sentences containing skill indicators
sentences = sent_tokenize(job_description)
for sentence in sentences:
sentence_lower = sentence.lower()
for indicator in skill_indicators:
if indicator in sentence_lower:
# Extract words after the indicator, possibly until end of sentence or punctuation
skills_part = sentence_lower.split(indicator, 1)[1]
# Extract words, cleaning up symbols
words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part)
for word in words:
if len(word) >= 3: # Only consider words 3 letters or longer
potential_skills.add(word.lower())
# Add explicit skills - look for comma-separated lists or bullet points
skill_lists = re.findall(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)', job_description.lower())
for skill_list in skill_lists:
words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list)
for word in words:
if len(word) >= 3:
potential_skills.add(word.lower())
# Add common tech skills if they appear in the job description
common_tech_skills = ["python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript",
"html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
"git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"]
for skill in common_tech_skills:
if skill in job_description.lower():
potential_skills.add(skill)
# Find skills in the resume
matched_skills = []
for skill in potential_skills:
# Make it a word boundary search with regex
pattern = r'\b' + re.escape(skill) + r'\b'
matches = re.findall(pattern, text.lower())
if matches:
matched_skills.append(skill)
return list(set(matched_skills))
def extract_key_phrases(self, text, job_description):
"""Extract key phrases from text that match job description keywords"""
# Identify job skills first
skills = self.extract_skills(job_description, job_description)
# Extract sentences that contain skills
sentences = sent_tokenize(text)
skill_sentences = []
for sentence in sentences:
sentence_lower = sentence.lower()
for skill in skills:
if skill in sentence_lower:
# Append the sentence with the skill highlighted
highlighted = sentence.replace(skill, f"**{skill}**")
skill_sentences.append(highlighted)
break
# Get additional generic matches if we don't have enough skill sentences
if len(skill_sentences) < 5:
# Simple extraction based on job description keywords
job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3)
text_tokens = word_tokenize(text)
matches = []
for i, token in enumerate(text_tokens):
if token.lower() in job_tokens:
# Get a phrase context (5 words before and after)
start = max(0, i - 5)
end = min(len(text_tokens), i + 6)
phrase = " ".join(text_tokens[start:end])
matches.append(phrase)
# Add unique phrases to complement skill sentences
unique_matches = list(set(matches))
skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])
# Return unique phrases, up to 5
return skill_sentences[:5]
def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
"""Generate explanation for why a resume was ranked highly using QwQ-32B model"""
# Use the explanation generator if available
if use_explanation and self.explanation_generator:
return self.explanation_generator.generate_explanation(
resume_text,
job_description,
score,
semantic_score,
bm25_score,
skills
)
else:
# Fallback to simple explanation
matching_phrases = self.extract_key_phrases(resume_text, job_description)
explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. "
if skills:
explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. "
if matching_phrases:
explanation += f"Key matching elements include: {matching_phrases[0]}"
return explanation
# Function to create a download link for dataframe as CSV
def get_csv_download_link(df, filename="results.csv"):
csv = df.to_csv(index=False)
b64 = base64.b64encode(csv.encode()).decode()
href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
return href
# Add this new function after the get_csv_download_link function
def get_huggingface_spaces_datasets():
"""Check for datasets in Hugging Face Spaces environment"""
datasets = []
# Common dataset paths in Hugging Face Spaces
potential_paths = [
"/data", # Common mount point
"data", # Relative path
os.path.expanduser("~/data"), # Home directory
]
for path in potential_paths:
if os.path.exists(path) and os.path.isdir(path):
# Look for CSV files
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
for csv_file in csv_files:
datasets.append(os.path.join(path, csv_file))
# Look for directories that might contain PDFs
for subdir in os.listdir(path):
subdir_path = os.path.join(path, subdir)
if os.path.isdir(subdir_path):
pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
if pdf_count > 0:
datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
return datasets
# Main app UI
st.title("Resume Screener & Skill Extractor")
st.markdown("---")
# Initialize the resume screener
screener = ResumeScreener(embedding_model_name, explanation_model_name)
# Job description input
st.header("1. Enter Job Description")
job_description = st.text_area(
"Paste the job description or requirements here:",
height=200,
help="Enter the complete job description or a list of required skills and qualifications."
)
# Resume upload
st.header("2. Upload Resumes")
upload_option = st.radio(
"Choose upload method:",
["Upload Files", "Upload from Dataset", "Process Directory"]
)
uploaded_files = []
resume_texts = []
file_names = []
if upload_option == "Upload Files":
uploaded_files = st.file_uploader(
"Upload resume files",
type=["pdf", "docx", "txt", "csv"],
accept_multiple_files=True,
help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format."
)
if uploaded_files:
with st.spinner("Processing resumes..."):
for file in uploaded_files:
file_type = file.name.split('.')[-1].lower()
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
tmp_file.write(file.getvalue())
tmp_path = tmp_file.name
text = screener.extract_text_from_file(tmp_path, file_type)
if text:
resume_texts.append(text)
file_names.append(file.name)
# Clean up temp file
os.unlink(tmp_path)
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {len(resume_texts)} resumes.")
elif upload_option == "Process Directory":
st.write("Process resume files from a directory on the server.")
# Input for directory path
resume_dir = st.text_input(
"Enter the path to the directory containing resume files:",
help="For Hugging Face Spaces, this could be a mounted directory or dataset."
)
# Limit batch size
batch_size = st.number_input(
"Number of files to process per batch (lower for less memory usage):",
min_value=10,
max_value=1000,
value=100,
step=10
)
# File types to process
file_types = st.multiselect(
"Select file types to process:",
["pdf", "docx", "txt", "csv"],
default=["pdf"]
)
if resume_dir and st.button("Process Directory"):
if os.path.isdir(resume_dir):
# Get all files matching the selected types
all_files = []
for file_type in file_types:
all_files.extend([
os.path.join(resume_dir, f)
for f in os.listdir(resume_dir)
if f.lower().endswith(f'.{file_type}')
])
if all_files:
total_files = len(all_files)
st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
# Process in batches
processed_count = 0
progress_bar = st.progress(0)
status_text = st.empty()
for i in range(0, total_files, batch_size):
batch_files = all_files[i:i+batch_size]
for j, file_path in enumerate(batch_files):
try:
file_type = file_path.split('.')[-1].lower()
text = screener.extract_text_from_file(file_path, file_type)
if text:
resume_texts.append(text)
file_names.append(os.path.basename(file_path))
processed_count += 1
# Apply memory optimization if enabled
if memory_optimization and j % gc_collect_interval == 0 and j > 0:
import gc
gc.collect()
status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
except Exception as e:
st.warning(f"Error processing {file_path}: {str(e)}")
# Update progress
progress = min(1.0, (i + len(batch_files)) / total_files)
progress_bar.progress(progress)
status_text.text(f"Processed {processed_count}/{total_files} files...")
# Run garbage collection between batches if memory optimization is enabled
if memory_optimization:
import gc
gc.collect()
# Final garbage collection if memory optimization is enabled
if memory_optimization:
import gc
gc.collect()
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
else:
st.error(f"No matching files found in {resume_dir}")
else:
st.error(f"Directory {resume_dir} does not exist or is not accessible.")
elif upload_option == "Upload from Dataset":
# Upload from Dataset implementation
st.write("Upload a CSV file containing resume data or load from available datasets.")
# Check for available datasets in Hugging Face Spaces
hf_datasets = get_huggingface_spaces_datasets()
if hf_datasets:
st.subheader("Available Datasets in Hugging Face Spaces")
dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
selected_dataset = st.selectbox("Select a dataset:", dataset_options)
if selected_dataset != "None":
selected_index = dataset_options.index(selected_dataset) - 1 # Adjust for "None"
dataset_path = hf_datasets[selected_index]
if isinstance(dataset_path, tuple):
# It's a PDF directory
pdf_dir = dataset_path[0]
st.write(f"Selected PDF directory: {pdf_dir}")
batch_size = st.number_input(
"Number of files to process per batch:",
min_value=10,
max_value=1000,
value=100,
step=10
)
if st.button("Process PDF Directory"):
# Use the same processing logic as in the "Process Directory" option
if os.path.isdir(pdf_dir):
all_files = [
os.path.join(pdf_dir, f)
for f in os.listdir(pdf_dir)
if f.lower().endswith('.pdf')
]
if all_files:
total_files = len(all_files)
st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
# Process in batches
processed_count = 0
progress_bar = st.progress(0)
status_text = st.empty()
for i in range(0, total_files, batch_size):
batch_files = all_files[i:i+batch_size]
for j, file_path in enumerate(batch_files):
try:
text = screener.extract_text_from_file(file_path, "pdf")
if text:
resume_texts.append(text)
file_names.append(os.path.basename(file_path))
processed_count += 1
# Apply memory optimization if enabled
if memory_optimization and j % gc_collect_interval == 0 and j > 0:
import gc
gc.collect()
except Exception as e:
st.warning(f"Error processing {file_path}: {str(e)}")
# Update progress
progress = min(1.0, (i + len(batch_files)) / total_files)
progress_bar.progress(progress)
status_text.text(f"Processed {processed_count}/{total_files} files...")
# Memory optimization
if memory_optimization:
import gc
gc.collect()
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
else:
# It's a CSV file
st.write(f"Selected CSV dataset: {dataset_path}")
try:
# Read the CSV file
df = pd.read_csv(dataset_path)
# Let user select which column contains the resume text
text_column = st.selectbox(
"Select column containing resume text:",
df.columns.tolist()
)
if st.button("Process Selected CSV"):
# Extract text from the selected column
for i, row in df.iterrows():
text = str(row[text_column])
if text and not pd.isna(text):
resume_texts.append(text)
# Use index as filename if no filename column
file_name = f"resume_{i}.txt"
if 'filename' in df.columns:
file_name = row['filename']
file_names.append(file_name)
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
except Exception as e:
st.error(f"Error processing CSV: {str(e)}")
# Rest of the existing Upload from Dataset code
dataset_option = st.radio(
"Dataset source:",
["Upload CSV", "Use Hugging Face Dataset"]
)
if dataset_option == "Upload CSV":
csv_file = st.file_uploader(
"Upload CSV file containing resume data",
type=["csv"],
help="CSV should contain at least a column with resume text."
)
if csv_file:
with st.spinner("Processing CSV data..."):
# Read the CSV file
df = pd.read_csv(csv_file)
# Let user select which column contains the resume text
text_column = st.selectbox(
"Select column containing resume text:",
df.columns.tolist()
)
if st.button("Process Dataset"):
# Extract text from the selected column
for i, row in df.iterrows():
text = str(row[text_column])
if text and not pd.isna(text):
resume_texts.append(text)
# Use index as filename if no filename column
file_name = f"resume_{i}.txt"
if 'filename' in df.columns:
file_name = row['filename']
file_names.append(file_name)
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
else:
# Hugging Face Dataset option
dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
if dataset_name and st.button("Load Dataset"):
with st.spinner(f"Loading dataset {dataset_name}..."):
try:
from datasets import load_dataset
# Load the dataset
dataset = load_dataset(dataset_name, split=split)
# Display dataset info
st.write(f"Dataset loaded with {len(dataset)} entries.")
# Let user select which column contains the resume text
if len(dataset.column_names) > 0:
text_column = st.selectbox(
"Select column containing resume text:",
dataset.column_names
)
if st.button("Process Hugging Face Dataset"):
# Extract text from the selected column
for i, item in enumerate(dataset):
if text_column in item:
text = str(item[text_column])
if text:
resume_texts.append(text)
# Use index or id field as filename
file_name = f"resume_{i}.txt"
if 'id' in item:
file_name = f"resume_{item['id']}.txt"
file_names.append(file_name)
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
except Exception as e:
st.error(f"Error loading dataset: {str(e)}")
st.info("Make sure you have the 'datasets' library installed: pip install datasets")
# Process button
if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
with st.spinner("Processing job description and resumes..."):
# Get job description embedding
job_embedding = screener.get_embedding(job_description)
# Process resumes in batches to avoid OOM
resume_embeddings = []
batch_size = 10 # Process 10 resumes at a time
progress_bar = st.progress(0)
status_text = st.empty()
for i in range(0, len(resume_texts), batch_size):
batch = resume_texts[i:i+batch_size]
status_text.text(f"Processing resumes {i+1}-{min(i+batch_size, len(resume_texts))} of {len(resume_texts)}...")
batch_embeddings = []
for j, text in enumerate(batch):
embedding = screener.get_embedding(text)
batch_embeddings.append(embedding)
# Update progress after each resume
progress = (i + j + 1) / len(resume_texts)
progress_bar.progress(progress)
# Add batch embeddings to the full list
resume_embeddings.extend(batch_embeddings)
# Force garbage collection between batches
import gc
gc.collect()
torch.cuda.empty_cache()
status_text.text("Calculating similarity scores...")
# Calculate hybrid scores
hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
resume_texts,
resume_embeddings,
job_embedding,
semantic_weight,
use_faiss
)
# Get top candidates
combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
top_candidates = sorted_data[:int(top_k)]
# Create results with explanations if enabled
results = []
status_text.text("Generating explanations...")
for idx, (name, text, score, semantic_score, bm25_score) in enumerate(top_candidates):
# Extract skills for this resume
skills = screener.extract_skills(text, job_description)
result = {
"filename": name,
"score": score,
"semantic_score": semantic_score,
"keyword_score": bm25_score,
"text_preview": text[:500] + "...",
"matched_phrases": screener.extract_key_phrases(text, job_description),
"skills": skills
}
if use_explanation:
# Update progress to show explanation generation
progress_bar.progress((idx + 1) / len(top_candidates))
status_text.text(f"Generating explanation for candidate {idx+1}/{len(top_candidates)}...")
explanation = screener.generate_explanation(
text,
job_description,
score,
semantic_score,
bm25_score,
skills
)
result["explanation"] = explanation
# Clear cache after each explanation
torch.cuda.empty_cache()
else:
result["explanation"] = ""
results.append(result)
st.session_state.results = results
st.success(f"Found top {len(results)} candidates!")
# Display results
if st.session_state.results:
st.header("3. Results")
# Create a DataFrame for download
df_data = []
for result in st.session_state.results:
df_data.append({
"Filename": result["filename"],
"Score": result["score"],
"Semantic Score": result["semantic_score"],
"Keyword Score": result["keyword_score"],
"Skills": ", ".join(result["skills"]),
"Explanation": result["explanation"]
})
results_df = pd.DataFrame(df_data)
# Display download link
st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True)
# Display individual results
for i, result in enumerate(st.session_state.results):
with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"):
col1, col2 = st.columns([1, 1])
with col1:
st.subheader("Scores")
st.write(f"Total Score: {result['score']:.4f}")
st.write(f"Semantic Score: {result['semantic_score']:.4f}")
st.write(f"Keyword Score: {result['keyword_score']:.4f}")
st.subheader("Matched Skills")
if result["skills"]:
for skill in result["skills"]:
st.write(f"• {skill}")
else:
st.write("No specific skills matched.")
with col2:
st.subheader("Explanation")
st.write(result["explanation"])
st.subheader("Key Matches")
for phrase in result["matched_phrases"]:
st.markdown(f"• {phrase}")
st.subheader("Resume Preview")
st.text_area("", result["text_preview"], height=150, disabled=True)
# Visualization of scores
st.subheader("Score Comparison")
# Prepare data for visualization
chart_data = pd.DataFrame({
"Resume": [result["filename"] for result in st.session_state.results],
"Semantic Score": [result["semantic_score"] for result in st.session_state.results],
"Keyword Score": [result["keyword_score"] for result in st.session_state.results],
"Total Score": [result["score"] for result in st.session_state.results]
})
# Display as a bar chart
st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]])
# Footer
st.markdown("---")
st.markdown("Built with Streamlit and Hugging Face models (NV-Embed-v2 and QwQ-32B)")