import streamlit as st import pdfplumber import pandas as pd import numpy as np import torch import nltk import faiss import os import tempfile import base64 from rank_bm25 import BM25Okapi from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig from sentence_transformers import SentenceTransformer from nltk.tokenize import word_tokenize, sent_tokenize from tqdm import tqdm import re import io import PyPDF2 from docx import Document import csv from explanation_generator import ExplanationGenerator # Download NLTK resources try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt') # Initialize embedding model at startup EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2" print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...") try: # Configure 4-bit quantization for better memory efficiency quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True ) # Load embedding model and tokenizer with 4-bit quantization global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True) global_embedding_model = AutoModel.from_pretrained( EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto", quantization_config=quantization_config, torch_dtype=torch.float16 ) print(f"Successfully loaded {EMBEDDING_MODEL_NAME} with 4-bit quantization") except Exception as e: print(f"Error loading embedding model: {str(e)}") global_embedding_tokenizer = None global_embedding_model = None # Set page configuration st.set_page_config( page_title="Resume Screener & Skill Extractor", page_icon="📄", layout="wide", initial_sidebar_state="expanded" ) # Sidebar for model selection and weights with st.sidebar: st.title("Configuration") # Model selection embedding_model_name = st.selectbox( "Embedding Model", ["nvidia/NV-Embed-v2"], index=0 ) explanation_model_name = st.selectbox( "Explanation Model", ["Qwen/QwQ-32B"], index=0 ) # Ranking weights st.subheader("Ranking Weights") semantic_weight = st.slider("Semantic Similarity Weight", 0.0, 1.0, 0.7, 0.1) keyword_weight = 1.0 - semantic_weight st.write(f"Keyword Weight: {keyword_weight:.1f}") # Advanced options st.subheader("Advanced Options") top_k = st.number_input("Number of results to display", min_value=1, max_value=20, value=10, step=1) use_explanation = st.checkbox("Generate Explanations", value=True) use_faiss = st.checkbox("Use FAISS for fast search", value=True) # Memory optimization options st.subheader("Memory Optimization") memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False) clear_embeddings = st.checkbox("Clear embeddings after processing", value=False) gc_collect_interval = st.number_input( "Garbage collection interval (files)", min_value=10, max_value=1000, value=100, step=10, help="Run garbage collection after processing this many files" ) st.markdown("---") st.markdown("### About") st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.") # Initialize session state variables if 'resumes_uploaded' not in st.session_state: st.session_state.resumes_uploaded = False if 'job_description' not in st.session_state: st.session_state.job_description = "" if 'results' not in st.session_state: st.session_state.results = [] if 'embedding_model' not in st.session_state: st.session_state.embedding_model = global_embedding_model if 'tokenizer' not in st.session_state: st.session_state.tokenizer = global_embedding_tokenizer if 'faiss_index' not in st.session_state: st.session_state.faiss_index = None if 'explanation_generator' not in st.session_state: st.session_state.explanation_generator = None class ResumeScreener: def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"): """Initialize the ResumeScreener with the specified embedding model""" self.embedding_model_name = embedding_model_name self.explanation_model_name = explanation_model_name # Initialize with preloaded models self.model = st.session_state.embedding_model self.tokenizer = st.session_state.tokenizer self.faiss_index = None self.embedding_size = None self.explanation_generator = None # Initialize explanation generator if use_explanation and st.session_state.explanation_generator is None: with st.spinner("Initializing explanation generator..."): st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name) self.explanation_generator = st.session_state.explanation_generator elif use_explanation: self.explanation_generator = st.session_state.explanation_generator def extract_text_from_file(self, file, file_type): """Extract text from various file types""" try: if file_type == "pdf": # Use pdfplumber for better text extraction with pdfplumber.open(file) as pdf: text = "" for page in pdf.pages: text += page.extract_text() or "" # If pdfplumber fails, try PyPDF2 as fallback if not text.strip(): reader = PyPDF2.PdfReader(file) text = "" for page_num in range(len(reader.pages)): page = reader.pages[page_num] text += page.extract_text() or "" return text elif file_type == "docx": doc = Document(file) return " ".join([paragraph.text for paragraph in doc.paragraphs]) elif file_type == "txt": return file.read().decode("utf-8") elif file_type == "csv": csv_text = "" csv_reader = csv.reader(io.StringIO(file.read().decode("utf-8"))) for row in csv_reader: csv_text += " ".join(row) + " " return csv_text else: st.error(f"Unsupported file type: {file_type}") return "" except Exception as e: st.error(f"Error extracting text from file: {str(e)}") return "" def get_embedding(self, text): """Generate text embedding for a given text""" if self.model is None: st.error("Embedding model not available. Please check your environment.") return np.zeros(768) # Default embedding size as fallback try: # For long texts, split into smaller chunks to avoid OOM max_length = 256 # Reduced from default 512 to save memory # Truncate text and tokenize inputs = self.tokenizer( text, return_tensors="pt", truncation=True, max_length=max_length, padding=True ) # Move inputs to same device as model device = next(self.model.parameters()).device inputs = {k: v.to(device) for k, v in inputs.items()} # Free up memory before inference torch.cuda.empty_cache() with torch.no_grad(): outputs = self.model(**inputs) # Use [CLS] token embedding or mean pooling based on model architecture if hasattr(outputs, "last_hidden_state"): # Mean pooling across token dimension embeddings = outputs.last_hidden_state.mean(dim=1).squeeze() embedding_np = embeddings.detach().cpu().numpy() # Set embedding size if not set if self.embedding_size is None: self.embedding_size = embedding_np.shape[0] # Clear cache after getting embedding del outputs, embeddings torch.cuda.empty_cache() return embedding_np else: # For models that return a specific embedding embedding_np = outputs.detach().cpu().numpy() # Set embedding size if not set if self.embedding_size is None: self.embedding_size = embedding_np.shape[0] # Clear cache after getting embedding del outputs torch.cuda.empty_cache() return embedding_np except Exception as e: st.error(f"Error generating embedding: {str(e)}") torch.cuda.empty_cache() # Try to recover memory return np.zeros(768) # Default embedding size as fallback def create_faiss_index(self, embeddings): """Create a FAISS index for fast similarity search""" # Get the dimension of the embeddings dimension = embeddings[0].shape[0] # Create a FAISS index index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity with normalized vectors # Add normalized vectors to the index embeddings_normalized = np.vstack([emb / np.linalg.norm(emb) for emb in embeddings]) index.add(embeddings_normalized) return index def query_faiss_index(self, index, query_embedding, k=10): """Query the FAISS index with a query embedding""" # Normalize query embedding query_embedding = query_embedding / np.linalg.norm(query_embedding) # Reshape to a row vector if needed if len(query_embedding.shape) == 1: query_embedding = query_embedding.reshape(1, -1) # Query the index scores, indices = index.search(query_embedding, k) return scores[0], indices[0] # Return the scores and indices as flat arrays def calculate_bm25_scores(self, resume_texts, job_description): """Calculate BM25 scores for keyword matching""" # Tokenize job description job_tokens = word_tokenize(job_description.lower()) # Prepare corpus from resumes corpus = [word_tokenize(resume.lower()) for resume in resume_texts] # Initialize BM25 bm25 = BM25Okapi(corpus) # Calculate scores scores = bm25.get_scores(job_tokens) return scores def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True): """Calculate hybrid scores combining semantic similarity and BM25""" # Calculate semantic similarity scores (cosine similarity) if use_faiss and len(resume_embeddings) > 10: # Create FAISS index if not already created if st.session_state.faiss_index is None: index = self.create_faiss_index(resume_embeddings) st.session_state.faiss_index = index else: index = st.session_state.faiss_index # Query index with job embedding faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=len(resume_embeddings)) # Create full semantic scores array semantic_scores = np.zeros(len(resume_embeddings)) for i, idx in enumerate(faiss_indices): if idx < len(resume_embeddings): semantic_scores[idx] = faiss_scores[i] else: # Direct cosine similarity calculation for smaller datasets semantic_scores = [] for emb in resume_embeddings: # Normalize the embeddings for cosine similarity emb_norm = emb / np.linalg.norm(emb) job_emb_norm = job_embedding / np.linalg.norm(job_embedding) # Calculate cosine similarity similarity = np.dot(emb_norm, job_emb_norm) semantic_scores.append(similarity) # Calculate BM25 scores bm25_scores = self.calculate_bm25_scores(resume_texts, job_description) # Normalize BM25 scores if max(bm25_scores) > 0: bm25_scores = [score / max(bm25_scores) for score in bm25_scores] # Calculate hybrid scores keyword_weight = 1.0 - semantic_weight hybrid_scores = [ (semantic_weight * sem_score) + (keyword_weight * bm25_score) for sem_score, bm25_score in zip(semantic_scores, bm25_scores) ] return hybrid_scores, semantic_scores, bm25_scores def extract_skills(self, text, job_description): """Extract skills from text based on job description""" # Simple skill extraction using regex and job description keywords # In a real implementation, this could be enhanced with ML-based skill extraction # Extract potential skills from job description (words 3 letters or longer) potential_skills = set() # Common skill-related phrases that might appear in job descriptions skill_indicators = ["experience with", "knowledge of", "familiar with", "proficient in", "skills in", "expertise in", "background in", "capabilities in", "years of experience in", "understanding of", "trained in"] # Extract skills from sentences containing skill indicators sentences = sent_tokenize(job_description) for sentence in sentences: sentence_lower = sentence.lower() for indicator in skill_indicators: if indicator in sentence_lower: # Extract words after the indicator, possibly until end of sentence or punctuation skills_part = sentence_lower.split(indicator, 1)[1] # Extract words, cleaning up symbols words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part) for word in words: if len(word) >= 3: # Only consider words 3 letters or longer potential_skills.add(word.lower()) # Add explicit skills - look for comma-separated lists or bullet points skill_lists = re.findall(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)', job_description.lower()) for skill_list in skill_lists: words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list) for word in words: if len(word) >= 3: potential_skills.add(word.lower()) # Add common tech skills if they appear in the job description common_tech_skills = ["python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript", "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform", "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"] for skill in common_tech_skills: if skill in job_description.lower(): potential_skills.add(skill) # Find skills in the resume matched_skills = [] for skill in potential_skills: # Make it a word boundary search with regex pattern = r'\b' + re.escape(skill) + r'\b' matches = re.findall(pattern, text.lower()) if matches: matched_skills.append(skill) return list(set(matched_skills)) def extract_key_phrases(self, text, job_description): """Extract key phrases from text that match job description keywords""" # Identify job skills first skills = self.extract_skills(job_description, job_description) # Extract sentences that contain skills sentences = sent_tokenize(text) skill_sentences = [] for sentence in sentences: sentence_lower = sentence.lower() for skill in skills: if skill in sentence_lower: # Append the sentence with the skill highlighted highlighted = sentence.replace(skill, f"**{skill}**") skill_sentences.append(highlighted) break # Get additional generic matches if we don't have enough skill sentences if len(skill_sentences) < 5: # Simple extraction based on job description keywords job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3) text_tokens = word_tokenize(text) matches = [] for i, token in enumerate(text_tokens): if token.lower() in job_tokens: # Get a phrase context (5 words before and after) start = max(0, i - 5) end = min(len(text_tokens), i + 6) phrase = " ".join(text_tokens[start:end]) matches.append(phrase) # Add unique phrases to complement skill sentences unique_matches = list(set(matches)) skill_sentences.extend(unique_matches[:5 - len(skill_sentences)]) # Return unique phrases, up to 5 return skill_sentences[:5] def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills): """Generate explanation for why a resume was ranked highly using QwQ-32B model""" # Use the explanation generator if available if use_explanation and self.explanation_generator: return self.explanation_generator.generate_explanation( resume_text, job_description, score, semantic_score, bm25_score, skills ) else: # Fallback to simple explanation matching_phrases = self.extract_key_phrases(resume_text, job_description) explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. " if skills: explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. " if matching_phrases: explanation += f"Key matching elements include: {matching_phrases[0]}" return explanation # Function to create a download link for dataframe as CSV def get_csv_download_link(df, filename="results.csv"): csv = df.to_csv(index=False) b64 = base64.b64encode(csv.encode()).decode() href = f'Download CSV' return href # Add this new function after the get_csv_download_link function def get_huggingface_spaces_datasets(): """Check for datasets in Hugging Face Spaces environment""" datasets = [] # Common dataset paths in Hugging Face Spaces potential_paths = [ "/data", # Common mount point "data", # Relative path os.path.expanduser("~/data"), # Home directory ] for path in potential_paths: if os.path.exists(path) and os.path.isdir(path): # Look for CSV files csv_files = [f for f in os.listdir(path) if f.endswith('.csv')] for csv_file in csv_files: datasets.append(os.path.join(path, csv_file)) # Look for directories that might contain PDFs for subdir in os.listdir(path): subdir_path = os.path.join(path, subdir) if os.path.isdir(subdir_path): pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')]) if pdf_count > 0: datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)")) return datasets # Main app UI st.title("Resume Screener & Skill Extractor") st.markdown("---") # Initialize the resume screener screener = ResumeScreener(embedding_model_name, explanation_model_name) # Job description input st.header("1. Enter Job Description") job_description = st.text_area( "Paste the job description or requirements here:", height=200, help="Enter the complete job description or a list of required skills and qualifications." ) # Resume upload st.header("2. Upload Resumes") upload_option = st.radio( "Choose upload method:", ["Upload Files", "Upload from Dataset", "Process Directory"] ) uploaded_files = [] resume_texts = [] file_names = [] if upload_option == "Upload Files": uploaded_files = st.file_uploader( "Upload resume files", type=["pdf", "docx", "txt", "csv"], accept_multiple_files=True, help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format." ) if uploaded_files: with st.spinner("Processing resumes..."): for file in uploaded_files: file_type = file.name.split('.')[-1].lower() with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file: tmp_file.write(file.getvalue()) tmp_path = tmp_file.name text = screener.extract_text_from_file(tmp_path, file_type) if text: resume_texts.append(text) file_names.append(file.name) # Clean up temp file os.unlink(tmp_path) st.session_state.resumes_uploaded = True st.success(f"Successfully processed {len(resume_texts)} resumes.") elif upload_option == "Process Directory": st.write("Process resume files from a directory on the server.") # Input for directory path resume_dir = st.text_input( "Enter the path to the directory containing resume files:", help="For Hugging Face Spaces, this could be a mounted directory or dataset." ) # Limit batch size batch_size = st.number_input( "Number of files to process per batch (lower for less memory usage):", min_value=10, max_value=1000, value=100, step=10 ) # File types to process file_types = st.multiselect( "Select file types to process:", ["pdf", "docx", "txt", "csv"], default=["pdf"] ) if resume_dir and st.button("Process Directory"): if os.path.isdir(resume_dir): # Get all files matching the selected types all_files = [] for file_type in file_types: all_files.extend([ os.path.join(resume_dir, f) for f in os.listdir(resume_dir) if f.lower().endswith(f'.{file_type}') ]) if all_files: total_files = len(all_files) st.write(f"Found {total_files} files. Processing in batches of {batch_size}...") # Process in batches processed_count = 0 progress_bar = st.progress(0) status_text = st.empty() for i in range(0, total_files, batch_size): batch_files = all_files[i:i+batch_size] for j, file_path in enumerate(batch_files): try: file_type = file_path.split('.')[-1].lower() text = screener.extract_text_from_file(file_path, file_type) if text: resume_texts.append(text) file_names.append(os.path.basename(file_path)) processed_count += 1 # Apply memory optimization if enabled if memory_optimization and j % gc_collect_interval == 0 and j > 0: import gc gc.collect() status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)") except Exception as e: st.warning(f"Error processing {file_path}: {str(e)}") # Update progress progress = min(1.0, (i + len(batch_files)) / total_files) progress_bar.progress(progress) status_text.text(f"Processed {processed_count}/{total_files} files...") # Run garbage collection between batches if memory optimization is enabled if memory_optimization: import gc gc.collect() # Final garbage collection if memory optimization is enabled if memory_optimization: import gc gc.collect() st.session_state.resumes_uploaded = True st.success(f"Successfully processed {processed_count} out of {total_files} resume files.") else: st.error(f"No matching files found in {resume_dir}") else: st.error(f"Directory {resume_dir} does not exist or is not accessible.") elif upload_option == "Upload from Dataset": # Upload from Dataset implementation st.write("Upload a CSV file containing resume data or load from available datasets.") # Check for available datasets in Hugging Face Spaces hf_datasets = get_huggingface_spaces_datasets() if hf_datasets: st.subheader("Available Datasets in Hugging Face Spaces") dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets] selected_dataset = st.selectbox("Select a dataset:", dataset_options) if selected_dataset != "None": selected_index = dataset_options.index(selected_dataset) - 1 # Adjust for "None" dataset_path = hf_datasets[selected_index] if isinstance(dataset_path, tuple): # It's a PDF directory pdf_dir = dataset_path[0] st.write(f"Selected PDF directory: {pdf_dir}") batch_size = st.number_input( "Number of files to process per batch:", min_value=10, max_value=1000, value=100, step=10 ) if st.button("Process PDF Directory"): # Use the same processing logic as in the "Process Directory" option if os.path.isdir(pdf_dir): all_files = [ os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf') ] if all_files: total_files = len(all_files) st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...") # Process in batches processed_count = 0 progress_bar = st.progress(0) status_text = st.empty() for i in range(0, total_files, batch_size): batch_files = all_files[i:i+batch_size] for j, file_path in enumerate(batch_files): try: text = screener.extract_text_from_file(file_path, "pdf") if text: resume_texts.append(text) file_names.append(os.path.basename(file_path)) processed_count += 1 # Apply memory optimization if enabled if memory_optimization and j % gc_collect_interval == 0 and j > 0: import gc gc.collect() except Exception as e: st.warning(f"Error processing {file_path}: {str(e)}") # Update progress progress = min(1.0, (i + len(batch_files)) / total_files) progress_bar.progress(progress) status_text.text(f"Processed {processed_count}/{total_files} files...") # Memory optimization if memory_optimization: import gc gc.collect() st.session_state.resumes_uploaded = True st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.") else: # It's a CSV file st.write(f"Selected CSV dataset: {dataset_path}") try: # Read the CSV file df = pd.read_csv(dataset_path) # Let user select which column contains the resume text text_column = st.selectbox( "Select column containing resume text:", df.columns.tolist() ) if st.button("Process Selected CSV"): # Extract text from the selected column for i, row in df.iterrows(): text = str(row[text_column]) if text and not pd.isna(text): resume_texts.append(text) # Use index as filename if no filename column file_name = f"resume_{i}.txt" if 'filename' in df.columns: file_name = row['filename'] file_names.append(file_name) st.session_state.resumes_uploaded = True st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.") except Exception as e: st.error(f"Error processing CSV: {str(e)}") # Rest of the existing Upload from Dataset code dataset_option = st.radio( "Dataset source:", ["Upload CSV", "Use Hugging Face Dataset"] ) if dataset_option == "Upload CSV": csv_file = st.file_uploader( "Upload CSV file containing resume data", type=["csv"], help="CSV should contain at least a column with resume text." ) if csv_file: with st.spinner("Processing CSV data..."): # Read the CSV file df = pd.read_csv(csv_file) # Let user select which column contains the resume text text_column = st.selectbox( "Select column containing resume text:", df.columns.tolist() ) if st.button("Process Dataset"): # Extract text from the selected column for i, row in df.iterrows(): text = str(row[text_column]) if text and not pd.isna(text): resume_texts.append(text) # Use index as filename if no filename column file_name = f"resume_{i}.txt" if 'filename' in df.columns: file_name = row['filename'] file_names.append(file_name) st.session_state.resumes_uploaded = True st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.") else: # Hugging Face Dataset option dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):") split = st.text_input("Enter dataset split (e.g., 'train'):", "train") if dataset_name and st.button("Load Dataset"): with st.spinner(f"Loading dataset {dataset_name}..."): try: from datasets import load_dataset # Load the dataset dataset = load_dataset(dataset_name, split=split) # Display dataset info st.write(f"Dataset loaded with {len(dataset)} entries.") # Let user select which column contains the resume text if len(dataset.column_names) > 0: text_column = st.selectbox( "Select column containing resume text:", dataset.column_names ) if st.button("Process Hugging Face Dataset"): # Extract text from the selected column for i, item in enumerate(dataset): if text_column in item: text = str(item[text_column]) if text: resume_texts.append(text) # Use index or id field as filename file_name = f"resume_{i}.txt" if 'id' in item: file_name = f"resume_{item['id']}.txt" file_names.append(file_name) st.session_state.resumes_uploaded = True st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.") except Exception as e: st.error(f"Error loading dataset: {str(e)}") st.info("Make sure you have the 'datasets' library installed: pip install datasets") # Process button if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)): with st.spinner("Processing job description and resumes..."): # Get job description embedding job_embedding = screener.get_embedding(job_description) # Process resumes in batches to avoid OOM resume_embeddings = [] batch_size = 10 # Process 10 resumes at a time progress_bar = st.progress(0) status_text = st.empty() for i in range(0, len(resume_texts), batch_size): batch = resume_texts[i:i+batch_size] status_text.text(f"Processing resumes {i+1}-{min(i+batch_size, len(resume_texts))} of {len(resume_texts)}...") batch_embeddings = [] for j, text in enumerate(batch): embedding = screener.get_embedding(text) batch_embeddings.append(embedding) # Update progress after each resume progress = (i + j + 1) / len(resume_texts) progress_bar.progress(progress) # Add batch embeddings to the full list resume_embeddings.extend(batch_embeddings) # Force garbage collection between batches import gc gc.collect() torch.cuda.empty_cache() status_text.text("Calculating similarity scores...") # Calculate hybrid scores hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores( resume_texts, resume_embeddings, job_embedding, semantic_weight, use_faiss ) # Get top candidates combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores)) sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True) top_candidates = sorted_data[:int(top_k)] # Create results with explanations if enabled results = [] status_text.text("Generating explanations...") for idx, (name, text, score, semantic_score, bm25_score) in enumerate(top_candidates): # Extract skills for this resume skills = screener.extract_skills(text, job_description) result = { "filename": name, "score": score, "semantic_score": semantic_score, "keyword_score": bm25_score, "text_preview": text[:500] + "...", "matched_phrases": screener.extract_key_phrases(text, job_description), "skills": skills } if use_explanation: # Update progress to show explanation generation progress_bar.progress((idx + 1) / len(top_candidates)) status_text.text(f"Generating explanation for candidate {idx+1}/{len(top_candidates)}...") explanation = screener.generate_explanation( text, job_description, score, semantic_score, bm25_score, skills ) result["explanation"] = explanation # Clear cache after each explanation torch.cuda.empty_cache() else: result["explanation"] = "" results.append(result) st.session_state.results = results st.success(f"Found top {len(results)} candidates!") # Display results if st.session_state.results: st.header("3. Results") # Create a DataFrame for download df_data = [] for result in st.session_state.results: df_data.append({ "Filename": result["filename"], "Score": result["score"], "Semantic Score": result["semantic_score"], "Keyword Score": result["keyword_score"], "Skills": ", ".join(result["skills"]), "Explanation": result["explanation"] }) results_df = pd.DataFrame(df_data) # Display download link st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True) # Display individual results for i, result in enumerate(st.session_state.results): with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"): col1, col2 = st.columns([1, 1]) with col1: st.subheader("Scores") st.write(f"Total Score: {result['score']:.4f}") st.write(f"Semantic Score: {result['semantic_score']:.4f}") st.write(f"Keyword Score: {result['keyword_score']:.4f}") st.subheader("Matched Skills") if result["skills"]: for skill in result["skills"]: st.write(f"• {skill}") else: st.write("No specific skills matched.") with col2: st.subheader("Explanation") st.write(result["explanation"]) st.subheader("Key Matches") for phrase in result["matched_phrases"]: st.markdown(f"• {phrase}") st.subheader("Resume Preview") st.text_area("", result["text_preview"], height=150, disabled=True) # Visualization of scores st.subheader("Score Comparison") # Prepare data for visualization chart_data = pd.DataFrame({ "Resume": [result["filename"] for result in st.session_state.results], "Semantic Score": [result["semantic_score"] for result in st.session_state.results], "Keyword Score": [result["keyword_score"] for result in st.session_state.results], "Total Score": [result["score"] for result in st.session_state.results] }) # Display as a bar chart st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]]) # Footer st.markdown("---") st.markdown("Built with Streamlit and Hugging Face models (NV-Embed-v2 and QwQ-32B)")