import streamlit as st
import pdfplumber
import pandas as pd
import numpy as np
import torch
import nltk
import faiss
import os
import gc  # Used for explicit garbage collection during batch processing
import tempfile
import base64
from rank_bm25 import BM25Okapi
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import re
import io
import PyPDF2
from docx import Document
import csv
from explanation_generator import ExplanationGenerator
# Download NLTK resources
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
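# Hedge for newer NLTK releases, which split the Punkt sentence model into a
# separate 'punkt_tab' resource; this is a harmless no-op on older versions.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')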
# Initialize embedding model at startup
EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
try:
    # Configure 4-bit quantization for better memory efficiency
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )
    # Load embedding model and tokenizer with 4-bit quantization
    global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
    global_embedding_model = AutoModel.from_pretrained(
        EMBEDDING_MODEL_NAME,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.float16
    )
    print(f"Successfully loaded {EMBEDDING_MODEL_NAME} with 4-bit quantization")
except Exception as e:
    print(f"Error loading embedding model: {str(e)}")
    global_embedding_tokenizer = None
    global_embedding_model = None
# Set page configuration
st.set_page_config(
    page_title="Resume Screener & Skill Extractor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Sidebar for model selection and weights
with st.sidebar:
    st.title("Configuration")
    # Model selection
    embedding_model_name = st.selectbox(
        "Embedding Model",
        ["nvidia/NV-Embed-v2"],
        index=0
    )
    explanation_model_name = st.selectbox(
        "Explanation Model",
        ["Qwen/QwQ-32B"],
        index=0
    )
    # Ranking weights
    st.subheader("Ranking Weights")
    semantic_weight = st.slider("Semantic Similarity Weight", 0.0, 1.0, 0.7, 0.1)
    keyword_weight = 1.0 - semantic_weight
    st.write(f"Keyword Weight: {keyword_weight:.1f}")
    # Advanced options
    st.subheader("Advanced Options")
    top_k = st.number_input("Number of results to display", min_value=1, max_value=20, value=10, step=1)
    use_explanation = st.checkbox("Generate Explanations", value=True)
    use_faiss = st.checkbox("Use FAISS for fast search", value=True)
    # Memory optimization options
    st.subheader("Memory Optimization")
    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
    clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
    gc_collect_interval = st.number_input(
        "Garbage collection interval (files)",
        min_value=10,
        max_value=1000,
        value=100,
        step=10,
        help="Run garbage collection after processing this many files"
    )
    st.markdown("---")
    st.markdown("### About")
    st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
# Initialize session state variables
if 'resumes_uploaded' not in st.session_state:
    st.session_state.resumes_uploaded = False
if 'job_description' not in st.session_state:
    st.session_state.job_description = ""
if 'results' not in st.session_state:
    st.session_state.results = []
if 'embedding_model' not in st.session_state:
    st.session_state.embedding_model = global_embedding_model
if 'tokenizer' not in st.session_state:
    st.session_state.tokenizer = global_embedding_tokenizer
if 'faiss_index' not in st.session_state:
    st.session_state.faiss_index = None
if 'explanation_generator' not in st.session_state:
    st.session_state.explanation_generator = None
class ResumeScreener:
    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
        """Initialize the ResumeScreener with the specified embedding model."""
        self.embedding_model_name = embedding_model_name
        self.explanation_model_name = explanation_model_name
        # Initialize with the preloaded models
        self.model = st.session_state.embedding_model
        self.tokenizer = st.session_state.tokenizer
        self.faiss_index = None
        self.embedding_size = None
        self.explanation_generator = None
        # Initialize the explanation generator (use_explanation comes from the sidebar)
        if use_explanation and st.session_state.explanation_generator is None:
            with st.spinner("Initializing explanation generator..."):
                st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
            self.explanation_generator = st.session_state.explanation_generator
        elif use_explanation:
            self.explanation_generator = st.session_state.explanation_generator
    def extract_text_from_file(self, file, file_type):
        """Extract text from a file given as a path or a file-like object."""
        try:
            if file_type == "pdf":
                # Use pdfplumber for better text extraction
                with pdfplumber.open(file) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() or ""
                # If pdfplumber found nothing, try PyPDF2 as a fallback
                if not text.strip():
                    reader = PyPDF2.PdfReader(file)
                    text = ""
                    for page in reader.pages:
                        text += page.extract_text() or ""
                return text
            elif file_type == "docx":
                doc = Document(file)
                return " ".join(paragraph.text for paragraph in doc.paragraphs)
            elif file_type == "txt":
                # Callers pass file paths as well as uploaded file objects
                if isinstance(file, str):
                    with open(file, "r", encoding="utf-8", errors="ignore") as f:
                        return f.read()
                return file.read().decode("utf-8")
            elif file_type == "csv":
                if isinstance(file, str):
                    with open(file, "r", encoding="utf-8", errors="ignore", newline="") as f:
                        raw = f.read()
                else:
                    raw = file.read().decode("utf-8")
                csv_text = ""
                for row in csv.reader(io.StringIO(raw)):
                    csv_text += " ".join(row) + " "
                return csv_text
            else:
                st.error(f"Unsupported file type: {file_type}")
                return ""
        except Exception as e:
            st.error(f"Error extracting text from file: {str(e)}")
            return ""
    def get_embedding(self, text):
        """Generate a text embedding for the given text."""
        if self.model is None:
            st.error("Embedding model not available. Please check your environment.")
            return np.zeros(768)  # Fallback embedding size
        try:
            # For long texts, truncate aggressively to avoid OOM
            max_length = 256  # Reduced from the usual 512 to save memory
            # Truncate and tokenize
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
                padding=True
            )
            # Move inputs to the same device as the model
            device = next(self.model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            # Free up memory before inference
            torch.cuda.empty_cache()
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Use mean pooling or the model's own embedding output,
            # depending on the architecture
            if hasattr(outputs, "last_hidden_state"):
                # Mean pooling across the token dimension
                embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
                embedding_np = embeddings.detach().cpu().numpy()
                # Record the embedding size on first use
                if self.embedding_size is None:
                    self.embedding_size = embedding_np.shape[0]
                # Clear the cache after getting the embedding
                del outputs, embeddings
                torch.cuda.empty_cache()
                return embedding_np
            else:
                # For models that return the embedding directly
                embedding_np = outputs.detach().cpu().numpy()
                if self.embedding_size is None:
                    self.embedding_size = embedding_np.shape[0]
                del outputs
                torch.cuda.empty_cache()
                return embedding_np
        except Exception as e:
            st.error(f"Error generating embedding: {str(e)}")
            torch.cuda.empty_cache()  # Try to recover memory
            return np.zeros(768)  # Fallback embedding size
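    # Caveat: NV-Embed-v2's remote code also exposes an encode() helper with
    # task instructions; the mean pooling above is a generic fallback that
    # works for typical Hugging Face encoder outputs but may underperform the
    # model's intended encoding path.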
    def create_faiss_index(self, embeddings):
        """Create a FAISS index for fast similarity search."""
        # Get the dimension of the embeddings
        dimension = embeddings[0].shape[0]
        # Inner product over unit vectors is equivalent to cosine similarity
        index = faiss.IndexFlatIP(dimension)
        # Add normalized vectors to the index (FAISS requires float32)
        embeddings_normalized = np.vstack([emb / np.linalg.norm(emb) for emb in embeddings]).astype(np.float32)
        index.add(embeddings_normalized)
        return index
    def query_faiss_index(self, index, query_embedding, k=10):
        """Query the FAISS index with a query embedding."""
        # Normalize the query embedding; FAISS requires float32
        query_embedding = (query_embedding / np.linalg.norm(query_embedding)).astype(np.float32)
        # Reshape to a row vector if needed
        if len(query_embedding.shape) == 1:
            query_embedding = query_embedding.reshape(1, -1)
        # Query the index
        scores, indices = index.search(query_embedding, k)
        return scores[0], indices[0]  # Flat arrays of scores and indices
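    # Usage sketch (hypothetical values): since every stored vector is
    # unit-normalized, inner-product search returns cosine similarities:
    #   index = self.create_faiss_index([emb_a, emb_b, emb_c])
    #   scores, ids = self.query_faiss_index(index, query_emb, k=2)
    #   # scores[0] is the best match's cosine similarity, ids[0] its row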
    def calculate_bm25_scores(self, resume_texts, job_description):
        """Calculate BM25 scores for keyword matching."""
        # Tokenize the job description
        job_tokens = word_tokenize(job_description.lower())
        # Prepare the corpus from the resumes
        corpus = [word_tokenize(resume.lower()) for resume in resume_texts]
        # Initialize BM25
        bm25 = BM25Okapi(corpus)
        # Calculate scores
        scores = bm25.get_scores(job_tokens)
        return scores
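    # BM25 rewards resumes that repeat the job description's terms, with
    # diminishing returns per extra occurrence and a penalty for very long
    # documents. Raw scores are unbounded, so calculate_hybrid_scores divides
    # them by the batch maximum before blending.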
    def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, job_description, semantic_weight=0.7, use_faiss=True):
        """Calculate hybrid scores combining semantic similarity and BM25."""
        # Semantic similarity scores (cosine similarity)
        if use_faiss and len(resume_embeddings) > 10:
            # (Re)build the FAISS index if missing or stale for this batch
            if st.session_state.faiss_index is None or st.session_state.faiss_index.ntotal != len(resume_embeddings):
                index = self.create_faiss_index(resume_embeddings)
                st.session_state.faiss_index = index
            else:
                index = st.session_state.faiss_index
            # Query the index with the job embedding
            faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=len(resume_embeddings))
            # Scatter the scores back into corpus order
            semantic_scores = np.zeros(len(resume_embeddings))
            for i, idx in enumerate(faiss_indices):
                if idx < len(resume_embeddings):
                    semantic_scores[idx] = faiss_scores[i]
        else:
            # Direct cosine similarity for smaller datasets
            semantic_scores = []
            job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
            for emb in resume_embeddings:
                emb_norm = emb / np.linalg.norm(emb)
                semantic_scores.append(np.dot(emb_norm, job_emb_norm))
        # BM25 keyword scores (job_description is now an explicit parameter
        # rather than a module-level global)
        bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
        # Normalize BM25 scores
        if max(bm25_scores) > 0:
            bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
        # Blend the two signals
        keyword_weight = 1.0 - semantic_weight
        hybrid_scores = [
            (semantic_weight * sem_score) + (keyword_weight * bm25_score)
            for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
        ]
        return hybrid_scores, semantic_scores, bm25_scores
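    # The blend is a convex combination:
    #   hybrid = w * cosine(resume, job) + (1 - w) * bm25_normalized
    # e.g. with w = 0.7, a resume scoring 0.80 semantic and 0.50 keyword
    # receives 0.7 * 0.80 + 0.3 * 0.50 = 0.71.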
    def extract_skills(self, text, job_description):
        """Extract skills from text based on the job description."""
        # Simple skill extraction using regex and job description keywords;
        # this could be replaced with ML-based skill extraction
        potential_skills = set()
        # Common skill-related phrases that appear in job descriptions
        skill_indicators = ["experience with", "knowledge of", "familiar with", "proficient in",
                            "skills in", "expertise in", "background in", "capabilities in",
                            "years of experience in", "understanding of", "trained in"]
        # Extract skills from sentences containing skill indicators
        sentences = sent_tokenize(job_description)
        for sentence in sentences:
            sentence_lower = sentence.lower()
            for indicator in skill_indicators:
                if indicator in sentence_lower:
                    # Take the words after the indicator, up to the end of the sentence
                    skills_part = sentence_lower.split(indicator, 1)[1]
                    # Extract words, cleaning up symbols
                    words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part)
                    for word in words:
                        if len(word) >= 3:  # Only consider words of 3+ characters
                            potential_skills.add(word.lower())
        # Also look for explicit skill lists after headers like "Skills:"
        skill_lists = re.findall(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)', job_description.lower())
        for skill_list in skill_lists:
            words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list)
            for word in words:
                if len(word) >= 3:
                    potential_skills.add(word.lower())
        # Add common tech skills that appear in the job description
        common_tech_skills = ["python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript",
                              "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
                              "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"]
        for skill in common_tech_skills:
            if skill in job_description.lower():
                potential_skills.add(skill)
        # Find the potential skills in the resume
        matched_skills = []
        for skill in potential_skills:
            # Word-boundary search to avoid substring false positives
            pattern = r'\b' + re.escape(skill) + r'\b'
            if re.findall(pattern, text.lower()):
                matched_skills.append(skill)
        return list(set(matched_skills))
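    # Example: for a posting containing "experience with Python and AWS",
    # both the indicator-phrase pass and the common-skills list yield
    # "python" and "aws"; the word-boundary regex then avoids substring
    # false positives (e.g. the skill "java" will not match "javascript").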
    def extract_key_phrases(self, text, job_description):
        """Extract key phrases from the text that match job description keywords."""
        # Identify the job's skills first
        skills = self.extract_skills(job_description, job_description)
        # Collect sentences that contain those skills
        sentences = sent_tokenize(text)
        skill_sentences = []
        for sentence in sentences:
            sentence_lower = sentence.lower()
            for skill in skills:
                if skill in sentence_lower:
                    # Append the sentence with the skill highlighted
                    # (case-insensitive, so "Python" matches the skill "python")
                    highlighted = re.sub(re.escape(skill), f"**{skill}**", sentence, flags=re.IGNORECASE)
                    skill_sentences.append(highlighted)
                    break
        # Fall back to generic keyword context if there aren't enough skill sentences
        if len(skill_sentences) < 5:
            # Simple extraction based on job description keywords
            job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3)
            text_tokens = word_tokenize(text)
            matches = []
            for i, token in enumerate(text_tokens):
                if token.lower() in job_tokens:
                    # Keep a phrase of context (5 words before and after)
                    start = max(0, i - 5)
                    end = min(len(text_tokens), i + 6)
                    matches.append(" ".join(text_tokens[start:end]))
            # Add unique phrases to complement the skill sentences
            unique_matches = list(set(matches))
            skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])
        # Return up to 5 phrases
        return skill_sentences[:5]
    def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
        """Generate an explanation for why a resume was ranked highly, using the QwQ-32B model."""
        # Use the explanation generator if available
        if use_explanation and self.explanation_generator:
            return self.explanation_generator.generate_explanation(
                resume_text,
                job_description,
                score,
                semantic_score,
                bm25_score,
                skills
            )
        else:
            # Fall back to a simple templated explanation
            matching_phrases = self.extract_key_phrases(resume_text, job_description)
            explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. "
            if skills:
                explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. "
            if matching_phrases:
                explanation += f"Key matching elements include: {matching_phrases[0]}"
            return explanation
# Create a download link for a DataFrame as CSV
def get_csv_download_link(df, filename="results.csv"):
    # Named csv_data to avoid shadowing the imported csv module
    csv_data = df.to_csv(index=False)
    b64 = base64.b64encode(csv_data.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
    return href
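# The link embeds the CSV in the page as a base64 data URI, so nothing is
# written server-side; for large result sets, st.download_button would be
# the more scalable alternative.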
# Discover datasets available in the Hugging Face Spaces environment
def get_huggingface_spaces_datasets():
    """Check for datasets in the Hugging Face Spaces environment."""
    datasets = []
    # Common dataset paths in Hugging Face Spaces
    potential_paths = [
        "/data",  # Common mount point
        "data",  # Relative path
        os.path.expanduser("~/data"),  # Home directory
    ]
    for path in potential_paths:
        if os.path.exists(path) and os.path.isdir(path):
            # Look for CSV files
            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
            for csv_file in csv_files:
                datasets.append(os.path.join(path, csv_file))
            # Look for directories that might contain PDFs
            for subdir in os.listdir(path):
                subdir_path = os.path.join(path, subdir)
                if os.path.isdir(subdir_path):
                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
                    if pdf_count > 0:
                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
    return datasets
# Main app UI
st.title("Resume Screener & Skill Extractor")
st.markdown("---")
# Initialize the resume screener
screener = ResumeScreener(embedding_model_name, explanation_model_name)
# Job description input
st.header("1. Enter Job Description")
job_description = st.text_area(
    "Paste the job description or requirements here:",
    height=200,
    help="Enter the complete job description or a list of required skills and qualifications."
)
# Resume upload
st.header("2. Upload Resumes")
upload_option = st.radio(
    "Choose upload method:",
    ["Upload Files", "Upload from Dataset", "Process Directory"]
)
uploaded_files = []
resume_texts = []
file_names = []
if upload_option == "Upload Files":
uploaded_files = st.file_uploader(
"Upload resume files",
type=["pdf", "docx", "txt", "csv"],
accept_multiple_files=True,
help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format."
)
if uploaded_files:
with st.spinner("Processing resumes..."):
for file in uploaded_files:
file_type = file.name.split('.')[-1].lower()
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
tmp_file.write(file.getvalue())
tmp_path = tmp_file.name
text = screener.extract_text_from_file(tmp_path, file_type)
if text:
resume_texts.append(text)
file_names.append(file.name)
# Clean up temp file
os.unlink(tmp_path)
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {len(resume_texts)} resumes.")
elif upload_option == "Process Directory":
st.write("Process resume files from a directory on the server.")
# Input for directory path
resume_dir = st.text_input(
"Enter the path to the directory containing resume files:",
help="For Hugging Face Spaces, this could be a mounted directory or dataset."
)
# Limit batch size
batch_size = st.number_input(
"Number of files to process per batch (lower for less memory usage):",
min_value=10,
max_value=1000,
value=100,
step=10
)
# File types to process
file_types = st.multiselect(
"Select file types to process:",
["pdf", "docx", "txt", "csv"],
default=["pdf"]
)
if resume_dir and st.button("Process Directory"):
if os.path.isdir(resume_dir):
# Get all files matching the selected types
all_files = []
for file_type in file_types:
all_files.extend([
os.path.join(resume_dir, f)
for f in os.listdir(resume_dir)
if f.lower().endswith(f'.{file_type}')
])
if all_files:
total_files = len(all_files)
st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
# Process in batches
processed_count = 0
progress_bar = st.progress(0)
status_text = st.empty()
for i in range(0, total_files, batch_size):
batch_files = all_files[i:i+batch_size]
for j, file_path in enumerate(batch_files):
try:
file_type = file_path.split('.')[-1].lower()
text = screener.extract_text_from_file(file_path, file_type)
if text:
resume_texts.append(text)
file_names.append(os.path.basename(file_path))
processed_count += 1
# Apply memory optimization if enabled
if memory_optimization and j % gc_collect_interval == 0 and j > 0:
import gc
gc.collect()
status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
except Exception as e:
st.warning(f"Error processing {file_path}: {str(e)}")
# Update progress
progress = min(1.0, (i + len(batch_files)) / total_files)
progress_bar.progress(progress)
status_text.text(f"Processed {processed_count}/{total_files} files...")
# Run garbage collection between batches if memory optimization is enabled
if memory_optimization:
import gc
gc.collect()
# Final garbage collection if memory optimization is enabled
if memory_optimization:
import gc
gc.collect()
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
else:
st.error(f"No matching files found in {resume_dir}")
else:
st.error(f"Directory {resume_dir} does not exist or is not accessible.")
elif upload_option == "Upload from Dataset":
# Upload from Dataset implementation
st.write("Upload a CSV file containing resume data or load from available datasets.")
# Check for available datasets in Hugging Face Spaces
hf_datasets = get_huggingface_spaces_datasets()
if hf_datasets:
st.subheader("Available Datasets in Hugging Face Spaces")
dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
selected_dataset = st.selectbox("Select a dataset:", dataset_options)
if selected_dataset != "None":
selected_index = dataset_options.index(selected_dataset) - 1 # Adjust for "None"
dataset_path = hf_datasets[selected_index]
if isinstance(dataset_path, tuple):
# It's a PDF directory
pdf_dir = dataset_path[0]
st.write(f"Selected PDF directory: {pdf_dir}")
batch_size = st.number_input(
"Number of files to process per batch:",
min_value=10,
max_value=1000,
value=100,
step=10
)
if st.button("Process PDF Directory"):
# Use the same processing logic as in the "Process Directory" option
if os.path.isdir(pdf_dir):
all_files = [
os.path.join(pdf_dir, f)
for f in os.listdir(pdf_dir)
if f.lower().endswith('.pdf')
]
if all_files:
total_files = len(all_files)
st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
# Process in batches
processed_count = 0
progress_bar = st.progress(0)
status_text = st.empty()
for i in range(0, total_files, batch_size):
batch_files = all_files[i:i+batch_size]
for j, file_path in enumerate(batch_files):
try:
text = screener.extract_text_from_file(file_path, "pdf")
if text:
resume_texts.append(text)
file_names.append(os.path.basename(file_path))
processed_count += 1
# Apply memory optimization if enabled
if memory_optimization and j % gc_collect_interval == 0 and j > 0:
import gc
gc.collect()
except Exception as e:
st.warning(f"Error processing {file_path}: {str(e)}")
# Update progress
progress = min(1.0, (i + len(batch_files)) / total_files)
progress_bar.progress(progress)
status_text.text(f"Processed {processed_count}/{total_files} files...")
# Memory optimization
if memory_optimization:
import gc
gc.collect()
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
else:
# It's a CSV file
st.write(f"Selected CSV dataset: {dataset_path}")
try:
# Read the CSV file
df = pd.read_csv(dataset_path)
# Let user select which column contains the resume text
text_column = st.selectbox(
"Select column containing resume text:",
df.columns.tolist()
)
if st.button("Process Selected CSV"):
# Extract text from the selected column
for i, row in df.iterrows():
text = str(row[text_column])
if text and not pd.isna(text):
resume_texts.append(text)
# Use index as filename if no filename column
file_name = f"resume_{i}.txt"
if 'filename' in df.columns:
file_name = row['filename']
file_names.append(file_name)
st.session_state.resumes_uploaded = True
st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
except Exception as e:
st.error(f"Error processing CSV: {str(e)}")
    # Manual dataset sources
    dataset_option = st.radio(
        "Dataset source:",
        ["Upload CSV", "Use Hugging Face Dataset"]
    )
    if dataset_option == "Upload CSV":
        csv_file = st.file_uploader(
            "Upload CSV file containing resume data",
            type=["csv"],
            help="CSV should contain at least a column with resume text."
        )
        if csv_file:
            with st.spinner("Processing CSV data..."):
                # Read the CSV file
                df = pd.read_csv(csv_file)
            # Let the user select which column contains the resume text
            text_column = st.selectbox(
                "Select column containing resume text:",
                df.columns.tolist()
            )
            if st.button("Process Dataset"):
                # Extract text from the selected column
                for i, row in df.iterrows():
                    text = str(row[text_column])
                    if text and not pd.isna(text):
                        resume_texts.append(text)
                        # Use the index as the filename if there is no filename column
                        file_name = f"resume_{i}.txt"
                        if 'filename' in df.columns:
                            file_name = row['filename']
                        file_names.append(file_name)
                st.session_state.resumes_uploaded = True
                st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
    else:
        # Hugging Face Hub dataset option
        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
        if dataset_name and st.button("Load Dataset"):
            with st.spinner(f"Loading dataset {dataset_name}..."):
                try:
                    from datasets import load_dataset
                    # Load the dataset
                    dataset = load_dataset(dataset_name, split=split)
                    # Display dataset info
                    st.write(f"Dataset loaded with {len(dataset)} entries.")
                    # Let the user select which column contains the resume text
                    if len(dataset.column_names) > 0:
                        text_column = st.selectbox(
                            "Select column containing resume text:",
                            dataset.column_names
                        )
                        if st.button("Process Hugging Face Dataset"):
                            # Extract text from the selected column
                            for i, item in enumerate(dataset):
                                if text_column in item:
                                    text = str(item[text_column])
                                    if text:
                                        resume_texts.append(text)
                                        # Use the index or id field as the filename
                                        file_name = f"resume_{i}.txt"
                                        if 'id' in item:
                                            file_name = f"resume_{item['id']}.txt"
                                        file_names.append(file_name)
                            st.session_state.resumes_uploaded = True
                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
                except Exception as e:
                    st.error(f"Error loading dataset: {str(e)}")
                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")
# Process button
if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
with st.spinner("Processing job description and resumes..."):
# Get job description embedding
job_embedding = screener.get_embedding(job_description)
# Process resumes in batches to avoid OOM
resume_embeddings = []
batch_size = 10 # Process 10 resumes at a time
progress_bar = st.progress(0)
status_text = st.empty()
for i in range(0, len(resume_texts), batch_size):
batch = resume_texts[i:i+batch_size]
status_text.text(f"Processing resumes {i+1}-{min(i+batch_size, len(resume_texts))} of {len(resume_texts)}...")
batch_embeddings = []
for j, text in enumerate(batch):
embedding = screener.get_embedding(text)
batch_embeddings.append(embedding)
# Update progress after each resume
progress = (i + j + 1) / len(resume_texts)
progress_bar.progress(progress)
# Add batch embeddings to the full list
resume_embeddings.extend(batch_embeddings)
# Force garbage collection between batches
import gc
gc.collect()
torch.cuda.empty_cache()
status_text.text("Calculating similarity scores...")
# Calculate hybrid scores
hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
resume_texts,
resume_embeddings,
job_embedding,
semantic_weight,
use_faiss
)
# Get top candidates
combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
top_candidates = sorted_data[:int(top_k)]
# Create results with explanations if enabled
results = []
status_text.text("Generating explanations...")
for idx, (name, text, score, semantic_score, bm25_score) in enumerate(top_candidates):
# Extract skills for this resume
skills = screener.extract_skills(text, job_description)
result = {
"filename": name,
"score": score,
"semantic_score": semantic_score,
"keyword_score": bm25_score,
"text_preview": text[:500] + "...",
"matched_phrases": screener.extract_key_phrases(text, job_description),
"skills": skills
}
if use_explanation:
# Update progress to show explanation generation
progress_bar.progress((idx + 1) / len(top_candidates))
status_text.text(f"Generating explanation for candidate {idx+1}/{len(top_candidates)}...")
explanation = screener.generate_explanation(
text,
job_description,
score,
semantic_score,
bm25_score,
skills
)
result["explanation"] = explanation
# Clear cache after each explanation
torch.cuda.empty_cache()
else:
result["explanation"] = ""
results.append(result)
st.session_state.results = results
st.success(f"Found top {len(results)} candidates!")
# Display results
if st.session_state.results:
    st.header("3. Results")
    # Build a DataFrame for download
    df_data = []
    for result in st.session_state.results:
        df_data.append({
            "Filename": result["filename"],
            "Score": result["score"],
            "Semantic Score": result["semantic_score"],
            "Keyword Score": result["keyword_score"],
            "Skills": ", ".join(result["skills"]),
            "Explanation": result["explanation"]
        })
    results_df = pd.DataFrame(df_data)
    # Display the download link
    st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True)
    # Display individual results
    for i, result in enumerate(st.session_state.results):
        with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"):
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("Scores")
                st.write(f"Total Score: {result['score']:.4f}")
                st.write(f"Semantic Score: {result['semantic_score']:.4f}")
                st.write(f"Keyword Score: {result['keyword_score']:.4f}")
                st.subheader("Matched Skills")
                if result["skills"]:
                    for skill in result["skills"]:
                        st.write(f"• {skill}")
                else:
                    st.write("No specific skills matched.")
            with col2:
                st.subheader("Explanation")
                st.write(result["explanation"])
                st.subheader("Key Matches")
                for phrase in result["matched_phrases"]:
                    st.markdown(f"• {phrase}")
            st.subheader("Resume Preview")
            # A non-empty label and an explicit key avoid Streamlit's empty-label
            # warning and any duplicate-widget collisions inside the loop
            st.text_area("Resume text", result["text_preview"], height=150, disabled=True, key=f"preview_{i}")
    # Visualization of scores
    st.subheader("Score Comparison")
    # Prepare data for visualization
    chart_data = pd.DataFrame({
        "Resume": [result["filename"] for result in st.session_state.results],
        "Semantic Score": [result["semantic_score"] for result in st.session_state.results],
        "Keyword Score": [result["keyword_score"] for result in st.session_state.results],
        "Total Score": [result["score"] for result in st.session_state.results]
    })
    # Display as a bar chart
    st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]])
# Footer
st.markdown("---")
st.markdown("Built with Streamlit and Hugging Face models (NV-Embed-v2 and QwQ-32B)")