Spaces:

jacob-c
/

Resume_Screener_and_Skill_Extractor

Paused

File size: 42,566 Bytes

import streamlit as st
import pdfplumber
import pandas as pd
import numpy as np
import torch
import nltk
import faiss
import os
import tempfile
import base64
from rank_bm25 import BM25Okapi
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import re
import io
import PyPDF2
from docx import Document
import csv
from explanation_generator import ExplanationGenerator

# Download NLTK tokenizer data.
# Older NLTK releases read the pickled "punkt" package, while recent releases
# load the same models from "punkt_tab"; word_tokenize/sent_tokenize raise
# LookupError when the variant they expect is missing, so ensure both.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except LookupError:
        # NOTE(review): nltk.download is expected to report (not raise) when a
        # package is unknown to the installed index or the network is down.
        nltk.download(_nltk_resource)

# Initialize embedding model at startup
EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"


# Streamlit re-executes this script from the top on every widget interaction.
# Caching the loader keeps one model instance per server process instead of
# reloading the weights on each rerun. show_spinner=False keeps the cached call
# from rendering anything before st.set_page_config() runs further down.
@st.cache_resource(show_spinner=False)
def _load_embedding_model(model_name):
    """Load the tokenizer and the 4-bit quantized embedding model.

    Args:
        model_name: Hugging Face model id to load.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        Exception: whatever the underlying loaders raise. Exceptions are
            deliberately not swallowed here so that a failed load is not
            cached and is retried on the next rerun.
    """
    print(f"Loading embedding model {model_name}...")

    # Configure 4-bit quantization for better memory efficiency
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # Load embedding model and tokenizer with 4-bit quantization
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.float16
    )
    print(f"Successfully loaded {model_name} with 4-bit quantization")
    return tokenizer, model


try:
    global_embedding_tokenizer, global_embedding_model = _load_embedding_model(EMBEDDING_MODEL_NAME)
except Exception as e:
    # Best effort: the app still starts and reports the missing model later.
    print(f"Error loading embedding model: {str(e)}")
    global_embedding_tokenizer = None
    global_embedding_model = None

# Browser tab title/icon and overall layout of the page
_page_settings = {
    "page_title": "Resume Screener & Skill Extractor",
    "page_icon": "📄",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Sidebar: model choice, ranking weights and processing options.
# Every value read here is a module-level setting used by the rest of the script.
_sidebar = st.sidebar
_sidebar.title("Configuration")

# Which models to use (a single choice each for now)
embedding_model_name = _sidebar.selectbox(
    "Embedding Model", ["nvidia/NV-Embed-v2"], index=0
)
explanation_model_name = _sidebar.selectbox(
    "Explanation Model", ["Qwen/QwQ-32B"], index=0
)

# Semantic vs. keyword balance; the keyword weight is the complement
_sidebar.subheader("Ranking Weights")
semantic_weight = _sidebar.slider(
    "Semantic Similarity Weight",
    min_value=0.0,
    max_value=1.0,
    value=0.7,
    step=0.1,
)
keyword_weight = 1.0 - semantic_weight
_sidebar.write(f"Keyword Weight: {keyword_weight:.1f}")

# Result count and feature toggles
_sidebar.subheader("Advanced Options")
top_k = _sidebar.number_input(
    "Number of results to display", min_value=1, max_value=20, value=10, step=1
)
use_explanation = _sidebar.checkbox("Generate Explanations", value=True)
use_faiss = _sidebar.checkbox("Use FAISS for fast search", value=True)

# Knobs for keeping memory in check on large inputs
_sidebar.subheader("Memory Optimization")
memory_optimization = _sidebar.checkbox(
    "Enable memory optimization (for large datasets)", value=False
)
clear_embeddings = _sidebar.checkbox("Clear embeddings after processing", value=False)
gc_collect_interval = _sidebar.number_input(
    "Garbage collection interval (files)",
    min_value=10,
    max_value=1000,
    value=100,
    step=10,
    help="Run garbage collection after processing this many files",
)

_sidebar.markdown("---")
_sidebar.markdown("### About")
_sidebar.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")

# Seed session state once; keys that already exist keep their value across reruns
_session_defaults = {
    'resumes_uploaded': False,
    'job_description': "",
    'results': [],
    'embedding_model': global_embedding_model,
    'tokenizer': global_embedding_tokenizer,
    'faiss_index': None,
    'explanation_generator': None,
}
for _state_key, _state_default in _session_defaults.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

class ResumeScreener:
    """Rank resumes against a job description.

    Combines embedding cosine similarity with BM25 keyword scores and offers
    helpers for text extraction, skill matching and explanations. The embedding
    model and tokenizer come from ``st.session_state``; ``__init__`` and
    ``generate_explanation`` also read the module-level ``use_explanation``
    flag set by the sidebar.
    """

    # Width of the all-zero fallback vector used before any real embedding exists.
    DEFAULT_EMBEDDING_SIZE = 768

    # A candidate skill token: letters/digits, with "+", "#", "/" and "." allowed inside.
    _TOKEN_RE = re.compile(r'\b[a-zA-Z0-9+#/.]+\b')
    # "skills: a, b, c" style lists (text after the colon up to the end of the line).
    _SKILL_LIST_RE = re.compile(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)')

    # Phrases that usually introduce a skill in a job description.
    _SKILL_INDICATORS = (
        "experience with", "knowledge of", "familiar with", "proficient in",
        "skills in", "expertise in", "background in", "capabilities in",
        "years of experience in", "understanding of", "trained in",
    )

    # Well-known skills picked up whenever the job description mentions them.
    _COMMON_TECH_SKILLS = (
        "python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript",
        "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
        "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science",
    )

    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
        """Initialize the ResumeScreener with the preloaded embedding model.

        Args:
            embedding_model_name: Name of the embedding model (stored for reference).
            explanation_model_name: Model name handed to ``ExplanationGenerator``.
        """
        self.embedding_model_name = embedding_model_name
        self.explanation_model_name = explanation_model_name
        # Initialize with preloaded models
        self.model = st.session_state.embedding_model
        self.tokenizer = st.session_state.tokenizer
        self.faiss_index = None
        self.embedding_size = None
        self.explanation_generator = None

        # Build the explanation generator once per session and reuse it afterwards
        if use_explanation:
            if st.session_state.explanation_generator is None:
                with st.spinner("Initializing explanation generator..."):
                    st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
            self.explanation_generator = st.session_state.explanation_generator

    @staticmethod
    def _read_text(file):
        """Return the textual content of *file* (a filesystem path or a file-like object)."""
        if isinstance(file, (str, bytes, os.PathLike)):
            with open(file, "rb") as handle:
                raw = handle.read()
        else:
            raw = file.read()
        if isinstance(raw, str):  # text-mode file object
            return raw
        # utf-8-sig drops a leading BOM; undecodable bytes are replaced rather
        # than discarding the whole resume.
        return raw.decode("utf-8-sig", errors="replace")

    @staticmethod
    def _extract_pdf_text(file):
        """Extract PDF text with pdfplumber, falling back to PyPDF2 when it finds none."""
        with pdfplumber.open(file) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        if text.strip():
            return text

        # pdfplumber may have consumed a file-like object; rewind before re-reading
        if hasattr(file, "seek"):
            file.seek(0)
        reader = PyPDF2.PdfReader(file)
        return "".join(page.extract_text() or "" for page in reader.pages)

    def extract_text_from_file(self, file, file_type):
        """Extract text from various file types.

        Args:
            file: Filesystem path or binary file-like object.
            file_type: One of "pdf", "docx", "txt" or "csv".

        Returns:
            The extracted text, or "" when the type is unsupported or the
            extraction fails (the problem is reported through ``st.error``).
        """
        try:
            if file_type == "pdf":
                return self._extract_pdf_text(file)

            if file_type == "docx":
                doc = Document(file)
                return " ".join(paragraph.text for paragraph in doc.paragraphs)

            if file_type == "txt":
                return self._read_text(file)

            if file_type == "csv":
                rows = csv.reader(io.StringIO(self._read_text(file)))
                # every row is flattened to space-separated cells followed by a space
                return "".join(" ".join(row) + " " for row in rows)

            st.error(f"Unsupported file type: {file_type}")
            return ""

        except Exception as e:
            st.error(f"Error extracting text from file: {str(e)}")
            return ""

    def _zero_embedding(self):
        """Return an all-zero fallback vector matching the known embedding width."""
        size = getattr(self, "embedding_size", None) or self.DEFAULT_EMBEDDING_SIZE
        return np.zeros(size, dtype=np.float32)

    def get_embedding(self, text):
        """Generate a 1-D float32 embedding for *text*.

        The text is truncated to 256 tokens. On any failure, or when no model
        is loaded, a zero vector is returned and the error is shown in the UI.
        """
        if self.model is None:
            st.error("Embedding model not available. Please check your environment.")
            return self._zero_embedding()

        try:
            # Long inputs are truncated (not chunked) to keep memory usage low
            max_length = 256
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
                padding=True
            )

            # Move inputs to same device as model
            device = next(self.model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Free up memory before inference
            torch.cuda.empty_cache()

            with torch.no_grad():
                outputs = self.model(**inputs)

            if isinstance(outputs, dict) and "sentence_embeddings" in outputs:
                # NOTE(review): some remote-code embedding models (NV-Embed
                # appears to be one) return a plain dict with this key - confirm.
                pooled = outputs["sentence_embeddings"]
            elif hasattr(outputs, "last_hidden_state"):
                # Mean pooling across token dimension
                pooled = outputs.last_hidden_state.mean(dim=1)
            else:
                # For models that return the embedding tensor directly
                pooled = outputs

            # float32: the model runs in fp16, but numpy/FAISS maths expects float32.
            # reshape(-1): a single text is encoded, so drop the batch axis.
            embedding_np = pooled.detach().float().cpu().numpy().reshape(-1)

            # Set embedding size if not set
            if self.embedding_size is None:
                self.embedding_size = embedding_np.shape[0]

            # Clear cache after getting embedding
            del outputs, pooled
            torch.cuda.empty_cache()

            return embedding_np
        except Exception as e:
            st.error(f"Error generating embedding: {str(e)}")
            torch.cuda.empty_cache()  # Try to recover memory
            return self._zero_embedding()

    @staticmethod
    def _unit_vector(vector):
        """Return *vector* as a flat float32 array scaled to unit length.

        A zero vector is returned unchanged so no NaNs are produced.
        """
        flat = np.asarray(vector, dtype=np.float32).reshape(-1)
        norm = np.linalg.norm(flat)
        return flat / norm if norm > 0 else flat

    def create_faiss_index(self, embeddings):
        """Create a FAISS inner-product index over the L2-normalized embeddings.

        Raises:
            IndexError: if *embeddings* is empty.
        """
        # Get the dimension of the embeddings
        dimension = np.asarray(embeddings[0]).reshape(-1).shape[0]

        # Inner product equals cosine similarity on normalized vectors
        index = faiss.IndexFlatIP(dimension)

        # FAISS requires a contiguous float32 matrix
        matrix = np.ascontiguousarray(
            np.vstack([self._unit_vector(emb) for emb in embeddings]), dtype=np.float32
        )
        index.add(matrix)

        return index

    def query_faiss_index(self, index, query_embedding, k=10):
        """Query the FAISS index; return flat ``(scores, indices)`` arrays of length k."""
        # Normalize and shape the query as a single float32 row vector
        query = np.ascontiguousarray(self._unit_vector(query_embedding).reshape(1, -1))

        scores, indices = index.search(query, k)

        return scores[0], indices[0]

    def calculate_bm25_scores(self, resume_texts, job_description):
        """Calculate one BM25 keyword score per resume (empty array for no resumes)."""
        if len(resume_texts) == 0:
            # BM25Okapi cannot be built from an empty corpus
            return np.array([])

        # Tokenize job description
        job_tokens = word_tokenize(job_description.lower())

        # Prepare corpus from resumes
        corpus = [word_tokenize(resume.lower()) for resume in resume_texts]

        return BM25Okapi(corpus).get_scores(job_tokens)

    def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding,
                                semantic_weight=0.7, use_faiss=True, job_description=None):
        """Calculate hybrid scores combining semantic similarity and BM25.

        Args:
            resume_texts: Resume texts, aligned with *resume_embeddings*.
            resume_embeddings: One embedding per resume.
            job_embedding: Embedding of the job description.
            semantic_weight: Weight of the semantic score; BM25 gets the rest.
            use_faiss: Use FAISS when there are more than 10 resumes.
            job_description: Job description text for BM25. When omitted, the
                module-level ``job_description`` is used (previous behavior).

        Returns:
            ``(hybrid_scores, semantic_scores, bm25_scores)`` - a list, a numpy
            array and a list, each with one entry per resume.
        """
        if job_description is None:
            # Backward compatibility: older callers relied on the module global
            job_description = globals().get("job_description", "")

        resume_count = len(resume_embeddings)
        if resume_count == 0:
            return [], np.zeros(0), []

        # Calculate semantic similarity scores (cosine similarity)
        if use_faiss and resume_count > 10:
            # Always rebuild: a cached index may belong to a previous resume set,
            # and a flat index is only a copy of the vectors anyway.
            index = self.create_faiss_index(resume_embeddings)
            st.session_state.faiss_index = index

            faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=resume_count)

            semantic_scores = np.zeros(resume_count)
            for score, idx in zip(faiss_scores, faiss_indices):
                # FAISS pads missing hits with -1, which must not wrap around
                if 0 <= idx < resume_count:
                    semantic_scores[idx] = score
        else:
            # Direct cosine similarity calculation for smaller datasets
            job_unit = self._unit_vector(job_embedding)
            semantic_scores = np.array(
                [float(np.dot(self._unit_vector(emb), job_unit)) for emb in resume_embeddings]
            )

        # BM25 scores, scaled so the best keyword match is 1.0
        bm25_raw = np.asarray(self.calculate_bm25_scores(resume_texts, job_description), dtype=float)
        best = bm25_raw.max() if bm25_raw.size else 0.0
        bm25_scores = (bm25_raw / best).tolist() if best > 0 else bm25_raw.tolist()

        # Calculate hybrid scores
        keyword_weight = 1.0 - semantic_weight
        hybrid_scores = [
            (semantic_weight * sem_score) + (keyword_weight * bm25_score)
            for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
        ]

        return hybrid_scores, semantic_scores, bm25_scores

    @staticmethod
    def _term_pattern(term):
        """Compile a case-insensitive whole-term matcher for *term*.

        Lookarounds replace ``\\b`` because ``\\b`` can never match next to a
        non-word character, so terms such as "c++" were never found.
        """
        return re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)', re.IGNORECASE)

    @classmethod
    def _skill_tokens(cls, fragment):
        """Return the lowercase tokens of *fragment* that are at least 3 characters long."""
        return {word.lower() for word in cls._TOKEN_RE.findall(fragment) if len(word) >= 3}

    def extract_skills(self, text, job_description):
        """Extract skills named in the job description that also occur in *text*.

        Candidates come from three places in the job description: words that
        follow a skill-indicator phrase, words in "skills:/requirements:/
        qualifications:" lists, and a fixed list of common tech skills.

        Returns:
            Alphabetically sorted list of matched skills (lowercase).
        """
        jd_lower = job_description.lower()
        potential_skills = set()

        # Words following an indicator phrase, up to the end of the sentence
        for sentence in sent_tokenize(job_description):
            sentence_lower = sentence.lower()
            for indicator in self._SKILL_INDICATORS:
                if indicator in sentence_lower:
                    skills_part = sentence_lower.split(indicator, 1)[1]
                    potential_skills.update(self._skill_tokens(skills_part))

        # Explicit "skills: ..." style lists
        for skill_list in self._SKILL_LIST_RE.findall(jd_lower):
            potential_skills.update(self._skill_tokens(skill_list))

        # Common tech skills, matched as whole terms so "ai" is not found in
        # "maintain" and "ml" is not found in "html"
        potential_skills.update(
            skill for skill in self._COMMON_TECH_SKILLS
            if self._term_pattern(skill).search(jd_lower)
        )

        # Keep the candidates that appear in the resume; sorted for stable output
        text_lower = text.lower()
        return sorted(
            skill for skill in potential_skills
            if self._term_pattern(skill).search(text_lower)
        )

    def extract_key_phrases(self, text, job_description):
        """Extract up to 5 phrases from *text* that match job description keywords.

        Sentences mentioning a job skill come first, with the skill wrapped in
        ``**``; if there are fewer than 5, windows of 5 tokens either side of a
        job-description word are appended.
        """
        # Identify job skills first
        skills = self.extract_skills(job_description, job_description)
        skill_patterns = [self._term_pattern(skill) for skill in skills]

        # Sentences containing a skill, with the first matching skill highlighted
        skill_sentences = []
        for sentence in sent_tokenize(text):
            for pattern in skill_patterns:
                if pattern.search(sentence):
                    # case-insensitive, and the resume's own casing is kept
                    skill_sentences.append(
                        pattern.sub(lambda match: f"**{match.group(0)}**", sentence)
                    )
                    break

        # Get additional generic matches if we don't have enough skill sentences
        if len(skill_sentences) < 5:
            job_tokens = {word.lower() for word in word_tokenize(job_description) if len(word) > 3}
            text_tokens = word_tokenize(text)

            matches = []
            for i, token in enumerate(text_tokens):
                if token.lower() in job_tokens:
                    # Get a phrase context (5 words before and after)
                    start = max(0, i - 5)
                    end = min(len(text_tokens), i + 6)
                    matches.append(" ".join(text_tokens[start:end]))

            # De-duplicate while keeping document order (a set would shuffle it)
            unique_matches = list(dict.fromkeys(matches))
            skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])

        return skill_sentences[:5]

    def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
        """Explain why a resume received its score.

        Delegates to the explanation generator when explanations are enabled and
        one is available; otherwise builds a short template-based summary.
        """
        if use_explanation and self.explanation_generator:
            return self.explanation_generator.generate_explanation(
                resume_text,
                job_description,
                score,
                semantic_score,
                bm25_score,
                skills
            )

        # Fallback to simple explanation
        matching_phrases = self.extract_key_phrases(resume_text, job_description)

        explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. "

        if skills:
            explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. "

        if matching_phrases:
            explanation += f"Key matching elements include: {matching_phrases[0]}"

        return explanation

# Function to create a download link for dataframe as CSV
def get_csv_download_link(df, filename="results.csv"):
    """Return an HTML ``<a>`` tag that downloads *df* as a CSV file.

    Args:
        df: DataFrame to serialize (the index is not written).
        filename: Name suggested to the browser for the download.

    Returns:
        HTML string with the CSV embedded as a base64 ``data:`` URL.
    """
    import html  # local import: only needed here

    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode("utf-8")).decode("ascii")
    # The name ends up inside an HTML attribute, so quotes and angle brackets
    # must not be able to break out of it.
    safe_name = html.escape(str(filename), quote=True)
    # "text/csv" is the registered media type for CSV
    return f'<a href="data:text/csv;base64,{b64}" download="{safe_name}">Download CSV</a>'

# Discover CSV files and PDF folders available on the server
def get_huggingface_spaces_datasets():
    """Check for datasets in Hugging Face Spaces environment.

    Scans ``/data``, ``data`` (relative to the working directory) and
    ``~/data``. Locations that resolve to the same directory are scanned once,
    and unreadable directories are skipped.

    Returns:
        A list, in sorted order per location, containing
        - the path (str) of every ``.csv`` file (any letter case), followed by
        - a ``(directory, "PDF Directory (N files)")`` tuple for every direct
          sub-directory that holds at least one ``.pdf`` file.
    """
    datasets = []

    # Common dataset paths in Hugging Face Spaces
    potential_paths = [
        "/data",                 # Common mount point
        "data",                  # Relative path
        os.path.expanduser("~/data"),  # Home directory
    ]

    scanned = set()
    for path in potential_paths:
        if not os.path.isdir(path):
            continue

        # The three candidates can alias each other (e.g. cwd is "/")
        resolved = os.path.realpath(path)
        if resolved in scanned:
            continue
        scanned.add(resolved)

        try:
            entries = sorted(os.listdir(path))  # sorted: listdir order is arbitrary
        except OSError:
            continue

        # CSV files directly inside the location (real files only)
        for entry in entries:
            entry_path = os.path.join(path, entry)
            if entry.lower().endswith('.csv') and os.path.isfile(entry_path):
                datasets.append(entry_path)

        # Sub-directories that contain PDFs
        for entry in entries:
            subdir_path = os.path.join(path, entry)
            if not os.path.isdir(subdir_path):
                continue
            try:
                pdf_count = sum(
                    1 for name in os.listdir(subdir_path) if name.lower().endswith('.pdf')
                )
            except OSError:
                continue
            if pdf_count > 0:
                datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))

    return datasets

# Page heading
st.title("Resume Screener & Skill Extractor")
st.markdown("---")

# Screener built from the models chosen in the sidebar
screener = ResumeScreener(
    embedding_model_name=embedding_model_name,
    explanation_model_name=explanation_model_name,
)

# Step 1: the job description to rank against
st.header("1. Enter Job Description")
job_description = st.text_area(
    label="Paste the job description or requirements here:",
    height=200,
    help="Enter the complete job description or a list of required skills and qualifications.",
)

# Step 2: where the resumes come from
st.header("2. Upload Resumes")
_upload_methods = ["Upload Files", "Upload from Dataset", "Process Directory"]
upload_option = st.radio(label="Choose upload method:", options=_upload_methods)

# Filled by whichever upload method runs below; texts and names stay aligned
uploaded_files, resume_texts, file_names = [], [], []

if upload_option == "Upload Files":
    # Direct upload of one or more resume files through the browser
    uploaded_files = st.file_uploader(
        label="Upload resume files",
        type=["pdf", "docx", "txt", "csv"],
        accept_multiple_files=True,
        help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format.",
    )

    if uploaded_files:
        with st.spinner("Processing resumes..."):
            for upload in uploaded_files:
                # The text after the last dot selects the extractor
                extension = upload.name.rsplit('.', 1)[-1].lower()

                # The extractor is handed a path, so spill the upload to disk first
                with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{extension}') as spill:
                    spill.write(upload.getvalue())
                    spill_path = spill.name

                extracted = screener.extract_text_from_file(spill_path, extension)

                # The temporary copy is no longer needed
                os.unlink(spill_path)

                # Empty extractions are dropped so texts and names stay aligned
                if extracted:
                    resume_texts.append(extracted)
                    file_names.append(upload.name)

        st.session_state.resumes_uploaded = True
        st.success(f"Successfully processed {len(resume_texts)} resumes.")
elif upload_option == "Process Directory":
    st.write("Process resume files from a directory on the server.")
    
    # Input for directory path
    resume_dir = st.text_input(
        "Enter the path to the directory containing resume files:",
        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
    )
    
    # Limit batch size
    batch_size = st.number_input(
        "Number of files to process per batch (lower for less memory usage):",
        min_value=10,
        max_value=1000,
        value=100,
        step=10
    )
    
    # File types to process
    file_types = st.multiselect(
        "Select file types to process:",
        ["pdf", "docx", "txt", "csv"],
        default=["pdf"]
    )
    
    if resume_dir and st.button("Process Directory"):
        if os.path.isdir(resume_dir):
            # Get all files matching the selected types
            all_files = []
            for file_type in file_types:
                all_files.extend([
                    os.path.join(resume_dir, f) 
                    for f in os.listdir(resume_dir) 
                    if f.lower().endswith(f'.{file_type}')
                ])
            
            if all_files:
                total_files = len(all_files)
                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
                
                # Process in batches
                processed_count = 0
                progress_bar = st.progress(0)
                status_text = st.empty()
                
                for i in range(0, total_files, batch_size):
                    batch_files = all_files[i:i+batch_size]
                    
                    for j, file_path in enumerate(batch_files):
                        try:
                            file_type = file_path.split('.')[-1].lower()
                            text = screener.extract_text_from_file(file_path, file_type)
                            if text:
                                resume_texts.append(text)
                                file_names.append(os.path.basename(file_path))
                                processed_count += 1
                                
                                # Apply memory optimization if enabled
                                if memory_optimization and j % gc_collect_interval == 0 and j > 0:
                                    import gc
                                    gc.collect()
                                    status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
                        except Exception as e:
                            st.warning(f"Error processing {file_path}: {str(e)}")
                    
                    # Update progress
                    progress = min(1.0, (i + len(batch_files)) / total_files)
                    progress_bar.progress(progress)
                    status_text.text(f"Processed {processed_count}/{total_files} files...")
                    
                    # Run garbage collection between batches if memory optimization is enabled
                    if memory_optimization:
                        import gc
                        gc.collect()
                
                # Final garbage collection if memory optimization is enabled
                if memory_optimization:
                    import gc
                    gc.collect()
                
                st.session_state.resumes_uploaded = True
                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
            else:
                st.error(f"No matching files found in {resume_dir}")
        else:
            st.error(f"Directory {resume_dir} does not exist or is not accessible.")
elif upload_option == "Upload from Dataset":
    # Upload from Dataset implementation
    st.write("Upload a CSV file containing resume data or load from available datasets.")
    
    # Check for available datasets in Hugging Face Spaces
    hf_datasets = get_huggingface_spaces_datasets()
    
    if hf_datasets:
        st.subheader("Available Datasets in Hugging Face Spaces")
        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
        
        if selected_dataset != "None":
            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
            dataset_path = hf_datasets[selected_index]
            
            if isinstance(dataset_path, tuple):
                # It's a PDF directory
                pdf_dir = dataset_path[0]
                st.write(f"Selected PDF directory: {pdf_dir}")
                
                batch_size = st.number_input(
                    "Number of files to process per batch:",
                    min_value=10,
                    max_value=1000,
                    value=100,
                    step=10
                )
                
                if st.button("Process PDF Directory"):
                    # Use the same processing logic as in the "Process Directory" option
                    if os.path.isdir(pdf_dir):
                        all_files = [
                            os.path.join(pdf_dir, f) 
                            for f in os.listdir(pdf_dir) 
                            if f.lower().endswith('.pdf')
                        ]
                        
                        if all_files:
                            total_files = len(all_files)
                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
                            
                            # Process in batches
                            processed_count = 0
                            progress_bar = st.progress(0)
                            status_text = st.empty()
                            
                            for i in range(0, total_files, batch_size):
                                batch_files = all_files[i:i+batch_size]
                                
                                for j, file_path in enumerate(batch_files):
                                    try:
                                        text = screener.extract_text_from_file(file_path, "pdf")
                                        if text:
                                            resume_texts.append(text)
                                            file_names.append(os.path.basename(file_path))
                                            processed_count += 1
                                            
                                            # Apply memory optimization if enabled
                                            if memory_optimization and j % gc_collect_interval == 0 and j > 0:
                                                import gc
                                                gc.collect()
                                    except Exception as e:
                                        st.warning(f"Error processing {file_path}: {str(e)}")
                                
                                # Update progress
                                progress = min(1.0, (i + len(batch_files)) / total_files)
                                progress_bar.progress(progress)
                                status_text.text(f"Processed {processed_count}/{total_files} files...")
                                
                                # Memory optimization
                                if memory_optimization:
                                    import gc
                                    gc.collect()
                            
                            st.session_state.resumes_uploaded = True
                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
            else:
                # It's a CSV file
                st.write(f"Selected CSV dataset: {dataset_path}")
                
                try:
                    # Load the selected dataset file. Any read/parse failure is
                    # reported through the except clause at the bottom.
                    df = pd.read_csv(dataset_path)
                    
                    # Let user select which column contains the resume text
                    text_column = st.selectbox(
                        "Select column containing resume text:",
                        df.columns.tolist()
                    )
                    
                    if st.button("Process Selected CSV"):
                        has_filename_column = 'filename' in df.columns
                        
                        # Collect one resume per row that actually has text
                        for i, row in df.iterrows():
                            raw_text = row[text_column]
                            # Test for missing values BEFORE converting to str:
                            # str(NaN) is the non-empty string "nan", which would
                            # otherwise be accepted as a resume.
                            if pd.isna(raw_text):
                                continue
                            text = str(raw_text)
                            if not text.strip():
                                continue
                            
                            resume_texts.append(text)
                            # Prefer the 'filename' column when it holds a value;
                            # otherwise fall back to an index-based name.
                            file_name = f"resume_{i}.txt"
                            if has_filename_column and not pd.isna(row['filename']):
                                file_name = row['filename']
                            file_names.append(file_name)
                        
                        st.session_state.resumes_uploaded = True
                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
                except Exception as e:
                    st.error(f"Error processing CSV: {str(e)}")
    
    # Alternative dataset sources: an uploaded CSV file or a Hugging Face dataset.
    # Both paths append to resume_texts / file_names and set
    # st.session_state.resumes_uploaded once rows have been processed.
    dataset_option = st.radio(
        "Dataset source:",
        ["Upload CSV", "Use Hugging Face Dataset"]
    )
    
    if dataset_option == "Upload CSV":
        csv_file = st.file_uploader(
            "Upload CSV file containing resume data",
            type=["csv"],
            help="CSV should contain at least a column with resume text."
        )
        
        if csv_file:
            with st.spinner("Processing CSV data..."):
                # Report unreadable/malformed CSV files to the user instead of
                # letting the exception propagate out of the script.
                try:
                    # Read the CSV file
                    df = pd.read_csv(csv_file)
                    
                    # Let user select which column contains the resume text
                    text_column = st.selectbox(
                        "Select column containing resume text:",
                        df.columns.tolist()
                    )
                    
                    if st.button("Process Dataset"):
                        has_filename_column = 'filename' in df.columns
                        
                        # Collect one resume per row that actually has text
                        for i, row in df.iterrows():
                            raw_text = row[text_column]
                            # Test for missing values BEFORE converting to str:
                            # str(NaN) is the non-empty string "nan", which would
                            # otherwise be accepted as a resume.
                            if pd.isna(raw_text):
                                continue
                            text = str(raw_text)
                            if not text.strip():
                                continue
                            
                            resume_texts.append(text)
                            # Prefer the 'filename' column when it holds a value;
                            # otherwise fall back to an index-based name.
                            file_name = f"resume_{i}.txt"
                            if has_filename_column and not pd.isna(row['filename']):
                                file_name = row['filename']
                            file_names.append(file_name)
                        
                        st.session_state.resumes_uploaded = True
                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
                except Exception as e:
                    st.error(f"Error processing CSV: {str(e)}")
    else:
        # Hugging Face Dataset option
        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
        
        # NOTE(review): the "Process Hugging Face Dataset" button below is created
        # inside the "Load Dataset" button branch. Streamlit reruns the script on
        # every click and st.button is True only for the run its own click
        # triggered, so the inner branch may never execute - confirm, and if so
        # persist the loaded dataset in st.session_state.
        if dataset_name and st.button("Load Dataset"):
            with st.spinner(f"Loading dataset {dataset_name}..."):
                try:
                    # Imported lazily so the app still starts without 'datasets'
                    from datasets import load_dataset
                    
                    # Load the dataset
                    dataset = load_dataset(dataset_name, split=split)
                    
                    # Display dataset info
                    st.write(f"Dataset loaded with {len(dataset)} entries.")
                    
                    # Let user select which column contains the resume text
                    if len(dataset.column_names) > 0:
                        text_column = st.selectbox(
                            "Select column containing resume text:",
                            dataset.column_names
                        )
                        
                        if st.button("Process Hugging Face Dataset"):
                            # Collect one resume per entry that actually has text
                            for i, item in enumerate(dataset):
                                # Skip absent/None values up front: str(None) is
                                # the truthy string "None".
                                if text_column not in item or item[text_column] is None:
                                    continue
                                text = str(item[text_column])
                                if not text.strip():
                                    continue
                                
                                resume_texts.append(text)
                                # Use index or id field as filename
                                file_name = f"resume_{i}.txt"
                                if 'id' in item:
                                    file_name = f"resume_{item['id']}.txt"
                                file_names.append(file_name)
                            
                            st.session_state.resumes_uploaded = True
                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
                except Exception as e:
                    st.error(f"Error loading dataset: {str(e)}")
                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")

# Process button: embed the job description and every resume, rank the resumes
# with the hybrid score, and store the top-k results in st.session_state.results.
if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
    import gc  # imported once here rather than on every batch iteration

    with st.spinner("Processing job description and resumes..."):
        # Get job description embedding
        job_embedding = screener.get_embedding(job_description)
        
        # Process resumes in batches to avoid OOM
        resume_embeddings = []
        batch_size = 10  # Process 10 resumes at a time
        total_resumes = len(resume_texts)
        progress_bar = st.progress(0)
        status_text = st.empty()
        
        for batch_start in range(0, total_resumes, batch_size):
            batch_end = min(batch_start + batch_size, total_resumes)
            status_text.text(f"Processing resumes {batch_start+1}-{batch_end} of {total_resumes}...")
            
            for text in resume_texts[batch_start:batch_end]:
                resume_embeddings.append(screener.get_embedding(text))
                # Update progress after each resume
                progress_bar.progress(len(resume_embeddings) / total_resumes)
            
            # Force garbage collection between batches
            gc.collect()
            torch.cuda.empty_cache()
        
        status_text.text("Calculating similarity scores...")
        
        # Calculate hybrid scores
        hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
            resume_texts, 
            resume_embeddings, 
            job_embedding,
            semantic_weight,
            use_faiss
        )
        
        # Rank all resumes by hybrid score (descending) and keep the top_k best
        combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
        sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
        top_candidates = sorted_data[:int(top_k)]
        
        # Create results with explanations if enabled
        results = []
        preview_length = 500
        status_text.text("Generating explanations...")
        
        for idx, (name, text, score, semantic_score, bm25_score) in enumerate(top_candidates):
            # Extract skills and matching phrases for this resume
            skills = screener.extract_skills(text, job_description)
            matched_phrases = screener.extract_key_phrases(text, job_description)
            
            # Only mark the preview with an ellipsis when text was actually cut off
            text_preview = text[:preview_length]
            if len(text) > preview_length:
                text_preview += "..."
            
            result = {
                "filename": name,
                "score": score,
                "semantic_score": semantic_score,
                "keyword_score": bm25_score,
                "text_preview": text_preview,
                "matched_phrases": matched_phrases,
                "skills": skills
            }
            
            if use_explanation:
                # Update progress to show explanation generation
                progress_bar.progress((idx + 1) / len(top_candidates))
                status_text.text(f"Generating explanation for candidate {idx+1}/{len(top_candidates)}...")
                
                result["explanation"] = screener.generate_explanation(
                    text, 
                    job_description, 
                    score, 
                    semantic_score, 
                    bm25_score,
                    skills
                )
                
                # Clear cache after each explanation
                torch.cuda.empty_cache()
            else:
                result["explanation"] = ""
                
            results.append(result)
        
        # Remove the transient progress widgets so no stale status text lingers
        # next to the success message.
        progress_bar.empty()
        status_text.empty()
        
        st.session_state.results = results
        st.success(f"Found top {len(results)} candidates!")

# Display results: download link, one expander per candidate, and a score chart.
if st.session_state.results:
    st.header("3. Results")
    
    # Create a DataFrame for download
    results_df = pd.DataFrame([
        {
            "Filename": result["filename"],
            "Score": result["score"],
            "Semantic Score": result["semantic_score"],
            "Keyword Score": result["keyword_score"],
            "Skills": ", ".join(result["skills"]),
            "Explanation": result["explanation"]
        }
        for result in st.session_state.results
    ])
    
    # Display download link
    st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True)
    
    # Display individual results
    for i, result in enumerate(st.session_state.results):
        with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"):
            col1, col2 = st.columns([1, 1])
            
            with col1:
                st.subheader("Scores")
                st.write(f"Total Score: {result['score']:.4f}")
                st.write(f"Semantic Score: {result['semantic_score']:.4f}")
                st.write(f"Keyword Score: {result['keyword_score']:.4f}")
                
                st.subheader("Matched Skills")
                if result["skills"]:
                    for skill in result["skills"]:
                        st.write(f"• {skill}")
                else:
                    st.write("No specific skills matched.")
            
            with col2:
                st.subheader("Explanation")
                st.write(result["explanation"])
                
                st.subheader("Key Matches")
                for phrase in result["matched_phrases"]:
                    st.markdown(f"• {phrase}")
                
                st.subheader("Resume Preview")
                # A per-result key keeps the widgets distinct even when two
                # resumes share the same preview text (identical unkeyed widgets
                # raise a duplicate-element error). The label is hidden rather
                # than empty because the subheader above already names the field.
                st.text_area(
                    "Resume preview",
                    result["text_preview"],
                    height=150,
                    disabled=True,
                    key=f"resume_preview_{i}",
                    label_visibility="collapsed"
                )
    
    # Visualization of scores
    st.subheader("Score Comparison")
    
    # Prepare data for visualization
    chart_data = pd.DataFrame({
        "Resume": [result["filename"] for result in st.session_state.results],
        "Semantic Score": [result["semantic_score"] for result in st.session_state.results],
        "Keyword Score": [result["keyword_score"] for result in st.session_state.results],
        "Total Score": [result["score"] for result in st.session_state.results]
    })
    
    # Display as a bar chart
    st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]])

# Page footer: a horizontal rule followed by the attribution line, each
# rendered as its own markdown element.
for footer_markdown in (
    "---",
    "Built with Streamlit and Hugging Face models (NV-Embed-v2 and QwQ-32B)",
):
    st.markdown(footer_markdown)