Commit 6ff5e82 · root committed · 1 parent: 2e8072e
Files changed (5)
  1. alt_models.py +0 -159
  2. app.py +497 -932
  3. explanation_generator.py +0 -223
  4. fix_dependencies.py +0 -76
  5. requirements.txt +8 -11
alt_models.py DELETED
@@ -1,159 +0,0 @@
-"""
-Alternative model loading implementation without sys.modules patching
-"""
-
-import torch
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-
-def count_gpus():
-    """Count the number of available GPUs"""
-    if torch.cuda.is_available():
-        return torch.cuda.device_count()
-    return 0
-
-def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
-    """Load the embedding model with a try-except approach instead of module patching"""
-    try:
-        print(f"Loading embedding model {model_name}...")
-
-        # Create a simple Replicate class that may be needed
-        class Replicate(torch.nn.Module):
-            def __init__(self, module, num_replicas=1):
-                super().__init__()
-                self.module = module
-                self.num_replicas = num_replicas
-
-            def forward(self, *args, **kwargs):
-                return self.module(*args, **kwargs)
-
-        # Get number of GPUs
-        num_gpus = count_gpus()
-        print(f"Found {num_gpus} GPUs")
-
-        # Choose device map strategy based on GPU count
-        if num_gpus > 1:
-            # For multi-GPU setup, use balanced distribution
-            device_map = "balanced"
-            print(f"Using balanced device mapping across {num_gpus} GPUs")
-        else:
-            # For single GPU, use auto or specific mapping based on memory
-            device_map = "auto"
-            print("Using automatic device mapping")
-
-        # Try the standard loading approach
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        model = AutoModel.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            device_map=device_map
-        )
-
-        print(f"Successfully loaded {model_name}")
-        return model, tokenizer
-    except Exception as e:
-        # If the first approach fails, try with module.__dict__
-        try:
-            print(f"First loading approach failed: {str(e)}")
-            print("Trying alternative loading approach...")
-
-            # Import the module
-            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-            # Dynamically get the module
-            model_class = AutoModel._MODEL_MAPPING[AutoModel._model_mapping[model_name]]
-
-            # Add Replicate to the module's namespace
-            model_class.__module_dict__ = {}
-            model_class.__module_dict__["Replicate"] = Replicate
-
-            # Get number of GPUs
-            num_gpus = count_gpus()
-
-            # Choose device map strategy based on GPU count
-            if num_gpus > 1:
-                device_map = "balanced"
-            else:
-                device_map = "auto"
-
-            # Try loading with the augmented namespace
-            model = model_class.from_pretrained(
-                model_name,
-                trust_remote_code=True,
-                device_map=device_map
-            )
-
-            print(f"Successfully loaded {model_name} with alternative approach")
-            return model, tokenizer
-        except Exception as e2:
-            print(f"Alternative loading approach also failed: {str(e2)}")
-            print(f"Could not load embedding model {model_name}")
-            return None, None
-
-def load_explanation_model(model_name="Qwen/QwQ-32B"):
-    """Load the explanation model with a try-except approach instead of module patching"""
-    try:
-        print(f"Loading explanation model {model_name}...")
-
-        # Get number of GPUs
-        num_gpus = count_gpus()
-        print(f"Found {num_gpus} GPUs")
-
-        # Choose quantization and device strategy based on GPU count and memory
-        if num_gpus > 1:
-            # For multi-GPU, use 4-bit quantization and balanced distribution
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-            device_map = "balanced"
-            print(f"Using 4-bit quantization with balanced device mapping across {num_gpus} GPUs")
-        else:
-            # For single GPU, use more aggressive 4-bit quantization
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-            device_map = "auto"
-            print("Using 4-bit quantization with automatic device mapping")
-
-        # Create a simple Replicate class that may be needed
-        class Replicate(torch.nn.Module):
-            def __init__(self, module, num_replicas=1):
-                super().__init__()
-                self.module = module
-                self.num_replicas = num_replicas
-
-            def forward(self, *args, **kwargs):
-                return self.module(*args, **kwargs)
-
-        # Try the standard loading approach
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-        # Check if we have enough resources to load the model
-        if torch.cuda.is_available():
-            total_gpu_memory = sum([torch.cuda.get_device_properties(i).total_memory for i in range(num_gpus)]) / (1024**3)
-            if num_gpus > 1 or total_gpu_memory >= 16:  # 16 GB (reduced thanks to quantization)
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    quantization_config=quantization_config,
-                    device_map=device_map,
-                    trust_remote_code=True,
-                    torch_dtype=torch.float16,
-                    max_memory={i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB" for i in range(num_gpus)}
-                )
-                print(f"Successfully loaded {model_name}")
-                return model, tokenizer
-            else:
-                print("Not enough GPU memory, using template-based explanations")
-                return None, tokenizer
-        else:
-            print("CUDA not available, using template-based explanations")
-            return None, tokenizer
-    except Exception as e:
-        print(f"Error loading explanation model: {str(e)}")
-        print("Falling back to template-based explanations.")
-        return None, None
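For reference, the pattern that replaces these deleted loaders in app.py (visible in the diff below) caches model construction with Streamlit's `st.cache_resource`, so script reruns reuse the already-loaded weights instead of reloading them. A minimal, self-contained sketch of that caching pattern, assuming the same model name and `device_map` used in this commit:

```python
import streamlit as st
from transformers import AutoModel, AutoTokenizer

@st.cache_resource  # runs once per process; subsequent reruns reuse the cached objects
def load_embedding_model(model_name: str = "nvidia/NV-Embed-v2"):
    """Load tokenizer and model once and share them across Streamlit reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
    return model, tokenizer

model, tokenizer = load_embedding_model()  # cheap after the first call
```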
app.py CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-import pdfplumber
 import pandas as pd
 import numpy as np
 import torch
@@ -8,59 +7,18 @@ import faiss
 import os
 import tempfile
 import base64
 from rank_bm25 import BM25Okapi
-from transformers import AutoModel, AutoTokenizer
-from sentence_transformers import SentenceTransformer
 from nltk.tokenize import word_tokenize, sent_tokenize
 from tqdm import tqdm
-import re
-import io
 import PyPDF2
 from docx import Document
 import csv
-import sys
-
-# Use the alternative model loading approach
-try:
-    # Try to import the functions from alt_models.py
-    from alt_models import load_embedding_model, load_explanation_model
-    USE_ALT_MODELS = True
-except ImportError:
-    USE_ALT_MODELS = False
-    # If import fails, we'll use the original approach
-    # Add Replicate class workaround
-    class Replicate(torch.nn.Module):
-        """Workaround class for missing Replicate in NV-Embed and Qwen models"""
-        def __init__(self, module, num_replicas=1):
-            super().__init__()
-            self.module = module
-            self.num_replicas = num_replicas
-
-        def forward(self, *args, **kwargs):
-            return self.module(*args, **kwargs)
-
-    # Create module structure if it doesn't exist yet
-    # Handle NVIDIA module
-    if "transformers.models.nvembed.modeling_nvembed" not in sys.modules:
-        # Create parent modules if they don't exist
-        if "transformers.models.nvembed" not in sys.modules:
-            sys.modules["transformers.models.nvembed"] = type('', (), {})
-        # Create the module we need
-        sys.modules["transformers.models.nvembed.modeling_nvembed"] = type('', (), {})
-
-    # Handle Qwen module
-    if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
-        # Create parent modules if they don't exist
-        if "transformers.models.qwen2" not in sys.modules:
-            sys.modules["transformers.models.qwen2"] = type('', (), {})
-        # Create the module we need
-        sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
-
-    # Add the class to modules
-    sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
-    sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
-
-from explanation_generator import ExplanationGenerator

 # Download NLTK resources
 try:
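The fallback removed in the hunk above worked by pre-registering stub entries in `sys.modules` so that remote modeling code loaded with `trust_remote_code=True` could still import a `Replicate` symbol. A minimal sketch of the same technique, using `types.ModuleType` in place of the original anonymous `type('', (), {})` stubs — the module path and class shape are copied from the deleted code, and this is an illustrative workaround, not a supported transformers API:

```python
import sys
import types

import torch

class Replicate(torch.nn.Module):
    """Stand-in for a symbol the remote modeling code expects to import."""
    def __init__(self, module, num_replicas=1):
        super().__init__()
        self.module = module
        self.num_replicas = num_replicas

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

# Pre-register a stub module so `from ...modeling_nvembed import Replicate` resolves.
stub = types.ModuleType("transformers.models.nvembed.modeling_nvembed")
stub.Replicate = Replicate
sys.modules[stub.__name__] = stub
```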
@@ -68,49 +26,17 @@ try:
 except LookupError:
     nltk.download('punkt')

-# Initialize embedding model at startup
-EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
-
-if USE_ALT_MODELS:
-    # Use the alternative loading approach
-    global_embedding_model, global_embedding_tokenizer = load_embedding_model(EMBEDDING_MODEL_NAME)
-else:
-    # Use the original approach
-    print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
-    try:
-        # Load embedding model and tokenizer
-        global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
-        global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
-        print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
-    except Exception as e:
-        print(f"Error loading embedding model: {str(e)}")
-        global_embedding_tokenizer = None
-        global_embedding_model = None
-
 # Set page configuration
 st.set_page_config(
-    page_title="Resume Screener & Skill Extractor",
-    page_icon="📄",
     layout="wide",
     initial_sidebar_state="expanded"
 )

-# Sidebar for model selection and weights
 with st.sidebar:
-    st.title("Configuration")
-
-    # Model selection
-    embedding_model_name = st.selectbox(
-        "Embedding Model",
-        ["nvidia/NV-Embed-v2"],
-        index=0
-    )
-
-    explanation_model_name = st.selectbox(
-        "Explanation Model",
-        ["Qwen/Qwen3-14B"],
-        index=0
-    )

     # Ranking weights
     st.subheader("Ranking Weights")
120
 
121
  # Advanced options
122
  st.subheader("Advanced Options")
123
- top_k = st.number_input("Number of results to display", min_value=1, max_value=20, value=10, step=1)
124
- use_explanation = st.checkbox("Generate Explanations", value=True)
125
- use_faiss = st.checkbox("Use FAISS for fast search", value=True)
126
-
127
- # Memory optimization options
128
- st.subheader("Memory Optimization")
129
- memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
130
- clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
131
- gc_collect_interval = st.number_input(
132
- "Garbage collection interval (files)",
133
- min_value=10,
134
- max_value=1000,
135
- value=100,
136
- step=10,
137
- help="Run garbage collection after processing this many files"
138
- )
139
 
140
  st.markdown("---")
141
- st.markdown("### About")
142
- st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
 
 
 
143
 
144
- # Initialize session state variables
145
- if 'resumes_uploaded' not in st.session_state:
146
- st.session_state.resumes_uploaded = False
147
- if 'job_description' not in st.session_state:
148
- st.session_state.job_description = ""
149
  if 'results' not in st.session_state:
150
  st.session_state.results = []
151
- if 'embedding_model' not in st.session_state:
152
- st.session_state.embedding_model = global_embedding_model
153
- if 'tokenizer' not in st.session_state:
154
- st.session_state.tokenizer = global_embedding_tokenizer
155
- if 'faiss_index' not in st.session_state:
156
- st.session_state.faiss_index = None
157
- if 'explanation_generator' not in st.session_state:
158
- st.session_state.explanation_generator = None
159
 
160
- class ResumeScreener:
161
- def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/Qwen3-14B"):
162
- """Initialize the ResumeScreener with the specified embedding model"""
163
- self.embedding_model_name = embedding_model_name
164
- self.explanation_model_name = explanation_model_name
165
- # Initialize with preloaded models
166
- self.model = st.session_state.embedding_model
167
- self.tokenizer = st.session_state.tokenizer
168
- self.faiss_index = None
169
- self.embedding_size = None
170
- self.explanation_generator = None
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- # Initialize explanation generator
173
- if use_explanation and st.session_state.explanation_generator is None:
174
- with st.spinner("Initializing explanation generator..."):
175
- st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
176
- self.explanation_generator = st.session_state.explanation_generator
177
- elif use_explanation:
178
- self.explanation_generator = st.session_state.explanation_generator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
- def extract_text_from_file(self, file, file_type):
181
  """Extract text from various file types"""
182
  try:
183
  if file_type == "pdf":
184
- # Use pdfplumber for better text extraction
185
- with pdfplumber.open(file) as pdf:
186
- text = ""
187
- for page in pdf.pages:
188
- text += page.extract_text() or ""
189
-
190
- # If pdfplumber fails, try PyPDF2 as fallback
191
- if not text.strip():
192
- reader = PyPDF2.PdfReader(file)
193
  text = ""
194
- for page_num in range(len(reader.pages)):
195
- page = reader.pages[page_num]
196
  text += page.extract_text() or ""
197
-
198
- return text
199
-
 
 
 
 
 
 
 
200
  elif file_type == "docx":
201
- doc = Document(file)
202
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
203
 
204
  elif file_type == "txt":
205
- return file.read().decode("utf-8")
206
-
 
207
  elif file_type == "csv":
208
- csv_text = ""
209
- csv_reader = csv.reader(io.StringIO(file.read().decode("utf-8")))
210
- for row in csv_reader:
211
- csv_text += " ".join(row) + " "
212
- return csv_text
213
-
214
- else:
215
- st.error(f"Unsupported file type: {file_type}")
216
- return ""
217
-
218
  except Exception as e:
219
- st.error(f"Error extracting text from file: {str(e)}")
220
  return ""
221
 
222
  def get_embedding(self, text):
223
- """Generate text embedding for a given text"""
224
- if self.model is None:
225
- st.error("Embedding model not available. Please check your environment.")
226
- return np.zeros(768) # Default embedding size as fallback
227
 
228
  try:
229
- # For HuggingFace models
230
- inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
 
 
 
 
 
 
 
 
231
 
232
- # Move inputs to same device as model
233
- device = next(self.model.parameters()).device
234
  inputs = {k: v.to(device) for k, v in inputs.items()}
235
 
236
  with torch.no_grad():
237
- outputs = self.model(**inputs)
238
-
239
- # Handle specific case for NV-Embed-v2 which returns a nested structure
240
- if self.embedding_model_name == "nvidia/NV-Embed-v2":
241
- # Access the embedding from the NV-Embed specific output format
242
- if hasattr(outputs, "pooler_output"):
243
- embeddings = outputs.pooler_output
244
- embedding_np = embeddings.cpu().detach().numpy()
245
- if self.embedding_size is None:
246
- self.embedding_size = embedding_np.shape[1]
247
- return embedding_np[0] # Return the first embedding
248
- # Try to handle multi-level dictionary if the model changed output format
249
- elif isinstance(outputs, dict) and "embedding" in outputs:
250
- embeddings = outputs["embedding"]
251
- embedding_np = embeddings.cpu().detach().numpy()
252
- if self.embedding_size is None:
253
- self.embedding_size = embedding_np.shape[1]
254
- return embedding_np[0]
255
-
256
- # Handle different output structures
257
- if hasattr(outputs, "last_hidden_state"):
258
- # Mean pooling across token dimension
259
- embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
260
- embedding_np = embeddings.cpu().detach().numpy()
261
-
262
- # Set embedding size if not set
263
- if self.embedding_size is None:
264
- self.embedding_size = embedding_np.shape[0]
265
-
266
- return embedding_np
267
- elif isinstance(outputs, dict) and "embeddings" in outputs:
268
- # For models that return a dictionary with embeddings
269
- embeddings = outputs["embeddings"]
270
- embedding_np = embeddings.cpu().detach().numpy()
271
 
272
- # Set embedding size if not set
273
- if self.embedding_size is None:
274
- self.embedding_size = embedding_np.shape[1] # Use correct dimension
275
-
276
- return embedding_np[0] # Return the first embedding
277
- elif isinstance(outputs, torch.Tensor):
278
- # For models that return a tensor directly
279
- embedding_np = outputs.cpu().detach().numpy()
280
-
281
- # Set embedding size if not set
282
- if self.embedding_size is None:
283
- self.embedding_size = embedding_np.shape[-1]
284
-
285
- return embedding_np.squeeze()
286
  else:
287
- # If we can't determine the output structure, try to inspect it for debugging
288
- st.warning(f"Unexpected output structure from model: {type(outputs)}")
289
- if hasattr(outputs, "__dict__"):
290
- for attr_name in dir(outputs):
291
- if not attr_name.startswith('_'):
292
- attr = getattr(outputs, attr_name)
293
- if isinstance(attr, torch.Tensor):
294
- st.info(f"Found tensor attribute '{attr_name}' with shape {attr.shape}")
295
- embedding_np = attr.cpu().detach().numpy()
296
- if self.embedding_size is None:
297
- self.embedding_size = embedding_np.shape[-1]
298
- return embedding_np.squeeze()
299
 
300
- # Last resort: return zeros
301
- if self.embedding_size is None:
302
- self.embedding_size = 768 # Default size
303
- return np.zeros(self.embedding_size)
304
  except Exception as e:
305
  st.error(f"Error generating embedding: {str(e)}")
306
- if self.embedding_size is None:
307
- self.embedding_size = 768 # Default size
308
- return np.zeros(self.embedding_size)
309
-
310
- def create_faiss_index(self, embeddings):
311
- """Create a FAISS index for fast similarity search"""
312
- # Get the dimension of the embeddings
313
- dimension = embeddings[0].shape[0]
314
-
315
- # Create a FAISS index
316
- index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity with normalized vectors
317
-
318
- # Add normalized vectors to the index
319
- embeddings_normalized = np.vstack([emb / np.linalg.norm(emb) for emb in embeddings])
320
- index.add(embeddings_normalized)
321
-
322
- return index
323
-
324
- def query_faiss_index(self, index, query_embedding, k=10):
325
- """Query the FAISS index with a query embedding"""
326
- # Normalize query embedding
327
- query_embedding = query_embedding / np.linalg.norm(query_embedding)
328
-
329
- # Reshape to a row vector if needed
330
- if len(query_embedding.shape) == 1:
331
- query_embedding = query_embedding.reshape(1, -1)
332
-
333
- # Query the index
334
- scores, indices = index.search(query_embedding, k)
335
-
336
- return scores[0], indices[0] # Return the scores and indices as flat arrays
337
 
338
  def calculate_bm25_scores(self, resume_texts, job_description):
339
  """Calculate BM25 scores for keyword matching"""
340
- # Tokenize job description
341
- job_tokens = word_tokenize(job_description.lower())
342
-
343
- # Prepare corpus from resumes
344
- corpus = [word_tokenize(resume.lower()) for resume in resume_texts]
345
-
346
- # Check if corpus is empty
347
- if not corpus or len(corpus) == 0:
348
- st.error("No resume texts provided for BM25 calculation")
349
- return [0.0] * len(resume_texts)
350
-
351
- # Check for empty documents in corpus
352
- filtered_corpus = [doc for doc in corpus if len(doc) > 0]
353
- if not filtered_corpus:
354
- st.error("All resume texts are empty after tokenization")
355
- return [0.0] * len(resume_texts)
356
-
357
- # Initialize BM25
358
  try:
359
- bm25 = BM25Okapi(filtered_corpus)
360
-
361
- # Calculate scores
362
- scores = bm25.get_scores(job_tokens)
363
-
364
- # If we filtered out empty documents, we need to reconstruct the scores array
365
- if len(filtered_corpus) != len(corpus):
366
- full_scores = []
367
- filtered_idx = 0
368
- for i in range(len(corpus)):
369
- if len(corpus[i]) > 0:
370
- full_scores.append(scores[filtered_idx])
371
- filtered_idx += 1
372
- else:
373
- full_scores.append(0.0)
374
- return full_scores
375
- else:
376
- return scores
377
  except Exception as e:
378
- st.error(f"Error in BM25 calculation: {str(e)}")
379
  return [0.0] * len(resume_texts)
380
 
381
- def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True):
382
- """Calculate hybrid scores combining semantic similarity and BM25"""
383
- # Calculate semantic similarity scores (cosine similarity)
384
- if use_faiss and len(resume_embeddings) > 10:
385
- # Create FAISS index if not already created
386
- if st.session_state.faiss_index is None:
387
- index = self.create_faiss_index(resume_embeddings)
388
- st.session_state.faiss_index = index
389
- else:
390
- index = st.session_state.faiss_index
391
-
392
- # Query index with job embedding
393
- faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=len(resume_embeddings))
394
-
395
- # Create full semantic scores array
396
- semantic_scores = np.zeros(len(resume_embeddings))
397
- for i, idx in enumerate(faiss_indices):
398
- if idx < len(resume_embeddings):
399
- semantic_scores[idx] = faiss_scores[i]
400
- else:
401
- # Direct cosine similarity calculation for smaller datasets
402
- semantic_scores = []
403
- for emb in resume_embeddings:
404
- # Normalize the embeddings for cosine similarity
405
- emb_norm = emb / np.linalg.norm(emb)
406
- job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
407
-
408
- # Calculate cosine similarity
409
- similarity = np.dot(emb_norm, job_emb_norm)
410
- semantic_scores.append(similarity)
411
 
412
  # Calculate BM25 scores
413
  bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
414
 
415
  # Normalize BM25 scores
416
- if max(bm25_scores) > 0:
417
- bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
 
418
 
419
  # Calculate hybrid scores
420
- keyword_weight = 1.0 - semantic_weight
421
  hybrid_scores = [
422
  (semantic_weight * sem_score) + (keyword_weight * bm25_score)
423
  for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
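The `create_faiss_index`/`query_faiss_index` helpers deleted in the hunk above rely on one identity: the inner product of L2-normalized vectors is their cosine similarity, which is why `faiss.IndexFlatIP` is built over normalized embeddings. A small self-contained sketch of that equivalence with toy dimensions (all names and sizes here are illustrative):

```python
import faiss
import numpy as np

rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 64)).astype("float32")   # toy resume embeddings
query = rng.normal(size=(1, 64)).astype("float32")    # toy job embedding

# L2-normalize in place so inner product == cosine similarity
faiss.normalize_L2(docs)
faiss.normalize_L2(query)

index = faiss.IndexFlatIP(64)            # exact inner-product search
index.add(docs)
scores, ids = index.search(query, 10)    # top-10 cosine scores and row ids
print(ids[0], scores[0])
```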
@@ -426,682 +250,423 @@ class ResumeScreener:
         return hybrid_scores, semantic_scores, bm25_scores

     def extract_skills(self, text, job_description):
-        """Extract skills from text based on job description"""
-        # Simple skill extraction using regex and job description keywords
-        # In a real implementation, this could be enhanced with ML-based skill extraction
-
-        # Extract potential skills from job description (words 3 letters or longer)
-        potential_skills = set()
-
-        # Common skill-related phrases that might appear in job descriptions
-        skill_indicators = ["experience with", "knowledge of", "familiar with", "proficient in",
-                            "skills in", "expertise in", "background in", "capabilities in",
-                            "years of experience in", "understanding of", "trained in"]
-
-        # Extract skills from sentences containing skill indicators
-        sentences = sent_tokenize(job_description)
-        for sentence in sentences:
-            sentence_lower = sentence.lower()
-            for indicator in skill_indicators:
-                if indicator in sentence_lower:
-                    # Extract words after the indicator, possibly until end of sentence or punctuation
-                    skills_part = sentence_lower.split(indicator, 1)[1]
-
-                    # Extract words, cleaning up symbols
-                    words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part)
-                    for word in words:
-                        if len(word) >= 3:  # Only consider words 3 letters or longer
-                            potential_skills.add(word.lower())

-        # Add explicit skills - look for comma-separated lists or bullet points
-        skill_lists = re.findall(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)', job_description.lower())
-        for skill_list in skill_lists:
-            words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list)
-            for word in words:
-                if len(word) >= 3:
-                    potential_skills.add(word.lower())

-        # Add common tech skills if they appear in the job description
-        common_tech_skills = ["python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript",
-                              "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
-                              "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"]

-        for skill in common_tech_skills:
-            if skill in job_description.lower():
-                potential_skills.add(skill)

-        # Find skills in the resume
-        matched_skills = []
-        for skill in potential_skills:
-            # Make it a word boundary search with regex
-            pattern = r'\b' + re.escape(skill) + r'\b'
-            matches = re.findall(pattern, text.lower())
-            if matches:
-                matched_skills.append(skill)

-        return list(set(matched_skills))
-
-    def extract_key_phrases(self, text, job_description):
-        """Extract key phrases from text that match job description keywords"""
-        # Identify job skills first
-        skills = self.extract_skills(job_description, job_description)
-
-        # Extract sentences that contain skills
-        sentences = sent_tokenize(text)
-        skill_sentences = []
-
-        for sentence in sentences:
-            sentence_lower = sentence.lower()
-            for skill in skills:
-                if skill in sentence_lower:
-                    # Append the sentence with the skill highlighted
-                    highlighted = sentence.replace(skill, f"**{skill}**")
-                    skill_sentences.append(highlighted)
-                    break
-
-        # Get additional generic matches if we don't have enough skill sentences
-        if len(skill_sentences) < 5:
-            # Simple extraction based on job description keywords
-            job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3)
-            text_tokens = word_tokenize(text)
-
-            matches = []
-            for i, token in enumerate(text_tokens):
-                if token.lower() in job_tokens:
-                    # Get a phrase context (5 words before and after)
-                    start = max(0, i - 5)
-                    end = min(len(text_tokens), i + 6)
-                    phrase = " ".join(text_tokens[start:end])
-                    matches.append(phrase)
-
-            # Add unique phrases to complement skill sentences
-            unique_matches = list(set(matches))
-            skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])
-
-        # Return unique phrases, up to 5
-        return skill_sentences[:5]

     def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
-        """Generate explanation for why a resume was ranked highly using QwQ-32B model"""
-        # Use the explanation generator if available
-        if use_explanation and self.explanation_generator:
-            return self.explanation_generator.generate_explanation(
-                resume_text,
-                job_description,
-                score,
-                semantic_score,
-                bm25_score,
-                skills
             )
-        else:
-            # Fallback to simple explanation
-            matching_phrases = self.extract_key_phrases(resume_text, job_description)

-            explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. "

-            if skills:
-                explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. "

-            if matching_phrases:
-                explanation += f"Key matching elements include: {matching_phrases[0]}"

-            return explanation

-# Function to create a download link for dataframe as CSV
-def get_csv_download_link(df, filename="results.csv"):
     csv = df.to_csv(index=False)
     b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
-    return href

-# Add this new function after the get_csv_download_link function
-def get_huggingface_spaces_datasets():
-    """Check for datasets in Hugging Face Spaces environment"""
-    datasets = []
-
-    # Common dataset paths in Hugging Face Spaces
-    potential_paths = [
-        "/data",  # Common mount point
-        "data",  # Relative path
-        os.path.expanduser("~/data"),  # Home directory
-    ]
-
-    for path in potential_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            # Look for CSV files
-            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
-            for csv_file in csv_files:
-                datasets.append(os.path.join(path, csv_file))
-
-            # Look for directories that might contain PDFs
-            for subdir in os.listdir(path):
-                subdir_path = os.path.join(path, subdir)
-                if os.path.isdir(subdir_path):
-                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
-                    if pdf_count > 0:
-                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
-
-    return datasets
-
-# Main app UI
-st.title("Resume Screener & Skill Extractor")
 st.markdown("---")

-# Initialize the resume screener
-screener = ResumeScreener(embedding_model_name, explanation_model_name)

-# Job description input
-st.header("1. Enter Job Description")
 job_description = st.text_area(
-    "Paste the job description or requirements here:",
-    height=200,
-    help="Enter the complete job description or a list of required skills and qualifications."
 )

-# Resume upload
-st.header("2. Upload Resumes")
-upload_option = st.radio(
-    "Choose upload method:",
-    ["Upload Files", "Upload from Dataset", "Process Directory"]
 )

-uploaded_files = []
 resume_texts = []
 file_names = []

-if upload_option == "Upload Files":
     uploaded_files = st.file_uploader(
         "Upload resume files",
-        type=["pdf", "docx", "txt", "csv"],
         accept_multiple_files=True,
-        help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format."
     )

     if uploaded_files:
-        with st.spinner("Processing resumes..."):
             for file in uploaded_files:
                 file_type = file.name.split('.')[-1].lower()

                 with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
                     tmp_file.write(file.getvalue())
                     tmp_path = tmp_file.name

                 text = screener.extract_text_from_file(tmp_path, file_type)
-                if text:
                     resume_texts.append(text)
                     file_names.append(file.name)

-                # Clean up temp file
                 os.unlink(tmp_path)
-
-            st.session_state.resumes_uploaded = True
-            st.success(f"Successfully processed {len(resume_texts)} resumes.")
-elif upload_option == "Process Directory":
-    st.write("Process resume files from a directory on the server.")
-
-    # Input for directory path
-    resume_dir = st.text_input(
-        "Enter the path to the directory containing resume files:",
-        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
-    )
-
-    # Limit batch size
-    batch_size = st.number_input(
-        "Number of files to process per batch (lower for less memory usage):",
-        min_value=10,
-        max_value=1000,
-        value=100,
-        step=10
-    )
-
-    # File types to process
-    file_types = st.multiselect(
-        "Select file types to process:",
-        ["pdf", "docx", "txt", "csv"],
-        default=["pdf"]
-    )

-    if resume_dir and st.button("Process Directory"):
-        if os.path.isdir(resume_dir):
-            # Get all files matching the selected types
-            all_files = []
-            for file_type in file_types:
-                all_files.extend([
-                    os.path.join(resume_dir, f)
-                    for f in os.listdir(resume_dir)
-                    if f.lower().endswith(f'.{file_type}')
-                ])

-            if all_files:
-                total_files = len(all_files)
-                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
-
-                # Process in batches
-                processed_count = 0
-                progress_bar = st.progress(0)
-                status_text = st.empty()
-
-                for i in range(0, total_files, batch_size):
-                    batch_files = all_files[i:i+batch_size]
-
-                    for j, file_path in enumerate(batch_files):
-                        try:
-                            file_type = file_path.split('.')[-1].lower()
-                            text = screener.extract_text_from_file(file_path, file_type)
-                            if text:
-                                resume_texts.append(text)
-                                file_names.append(os.path.basename(file_path))
-                                processed_count += 1
-
-                            # Apply memory optimization if enabled
-                            if memory_optimization and j % gc_collect_interval == 0 and j > 0:
-                                import gc
-                                gc.collect()
-                                status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
-                        except Exception as e:
-                            st.warning(f"Error processing {file_path}: {str(e)}")
-
-                    # Update progress
-                    progress = min(1.0, (i + len(batch_files)) / total_files)
-                    progress_bar.progress(progress)
-                    status_text.text(f"Processed {processed_count}/{total_files} files...")
-
-                    # Run garbage collection between batches if memory optimization is enabled
-                    if memory_optimization:
-                        import gc
-                        gc.collect()
-
-                # Final garbage collection if memory optimization is enabled
-                if memory_optimization:
-                    import gc
-                    gc.collect()
-
-                st.session_state.resumes_uploaded = True
-                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
-            else:
-                st.error(f"No matching files found in {resume_dir}")
-        else:
-            st.error(f"Directory {resume_dir} does not exist or is not accessible.")
-elif upload_option == "Upload from Dataset":
-    # Upload from Dataset implementation
-    st.write("Upload a CSV file containing resume data or load from available datasets.")
-
-    # Check for available datasets in Hugging Face Spaces
-    hf_datasets = get_huggingface_spaces_datasets()
-
-    if hf_datasets:
-        st.subheader("Available Datasets in Hugging Face Spaces")
-        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
-        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
-
-        if selected_dataset != "None":
-            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
-            dataset_path = hf_datasets[selected_index]

-            if isinstance(dataset_path, tuple):
-                # It's a PDF directory
-                pdf_dir = dataset_path[0]
-                st.write(f"Selected PDF directory: {pdf_dir}")
-
-                batch_size = st.number_input(
-                    "Number of files to process per batch:",
-                    min_value=10,
-                    max_value=1000,
-                    value=100,
-                    step=10
-                )
-
-                if st.button("Process PDF Directory"):
-                    # Use the same processing logic as in the "Process Directory" option
-                    if os.path.isdir(pdf_dir):
-                        all_files = [
-                            os.path.join(pdf_dir, f)
-                            for f in os.listdir(pdf_dir)
-                            if f.lower().endswith('.pdf')
-                        ]
-
-                        if all_files:
-                            total_files = len(all_files)
-                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
-
-                            # Process in batches
-                            processed_count = 0
-                            progress_bar = st.progress(0)
-                            status_text = st.empty()
-
-                            for i in range(0, total_files, batch_size):
-                                batch_files = all_files[i:i+batch_size]
-
-                                for j, file_path in enumerate(batch_files):
-                                    try:
-                                        text = screener.extract_text_from_file(file_path, "pdf")
-                                        if text:
-                                            resume_texts.append(text)
-                                            file_names.append(os.path.basename(file_path))
-                                            processed_count += 1
-
-                                        # Apply memory optimization if enabled
-                                        if memory_optimization and j % gc_collect_interval == 0 and j > 0:
-                                            import gc
-                                            gc.collect()
-                                    except Exception as e:
-                                        st.warning(f"Error processing {file_path}: {str(e)}")
-
-                                # Update progress
-                                progress = min(1.0, (i + len(batch_files)) / total_files)
-                                progress_bar.progress(progress)
-                                status_text.text(f"Processed {processed_count}/{total_files} files...")
-
-                                # Memory optimization
-                                if memory_optimization:
-                                    import gc
-                                    gc.collect()

-                            st.session_state.resumes_uploaded = True
-                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
-            else:
-                # It's a CSV file
-                st.write(f"Selected CSV dataset: {dataset_path}")

-                try:
-                    # Read the CSV file
-                    df = pd.read_csv(dataset_path)

-                    # Let user select which column contains the resume text
-                    text_column = st.selectbox(
-                        "Select column containing resume text:",
-                        df.columns.tolist()
-                    )
-
-                    if st.button("Process Selected CSV"):
-                        # Extract text from the selected column
-                        for i, row in df.iterrows():
-                            text = str(row[text_column])
-                            if text and not pd.isna(text):
-                                resume_texts.append(text)
-                                # Use index as filename if no filename column
-                                file_name = f"resume_{i}.txt"
-                                if 'filename' in df.columns:
-                                    file_name = row['filename']
-                                file_names.append(file_name)
-
-                        st.session_state.resumes_uploaded = True
-                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
-                except Exception as e:
-                    st.error(f"Error processing CSV: {str(e)}")
-
-    # Rest of the existing Upload from Dataset code
-    dataset_option = st.radio(
-        "Dataset source:",
-        ["Upload CSV", "Use Hugging Face Dataset"]
-    )
-
-    if dataset_option == "Upload CSV":
-        csv_file = st.file_uploader(
-            "Upload CSV file containing resume data",
-            type=["csv"],
-            help="CSV should contain at least a column with resume text."
         )
-
-        if csv_file:
-            with st.spinner("Processing CSV data..."):
-                # Read the CSV file
-                df = pd.read_csv(csv_file)

-                # Let user select which column contains the resume text
-                text_column = st.selectbox(
-                    "Select column containing resume text:",
-                    df.columns.tolist()
                 )
-
-            if st.button("Process Dataset"):
-                # Extract text from the selected column
-                for i, row in df.iterrows():
-                    text = str(row[text_column])
-                    if text and not pd.isna(text):
-                        resume_texts.append(text)
-                        # Use index as filename if no filename column
-                        file_name = f"resume_{i}.txt"
-                        if 'filename' in df.columns:
-                            file_name = row['filename']
-                        file_names.append(file_name)
-
-                st.session_state.resumes_uploaded = True
-                st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
-    else:
-        # Hugging Face Dataset option
-        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
-        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
-
-        if dataset_name and st.button("Load Dataset"):
-            with st.spinner(f"Loading dataset {dataset_name}..."):
-                try:
-                    from datasets import load_dataset

-                    # Load the dataset
-                    dataset = load_dataset(dataset_name, split=split)

-                    # Display dataset info
-                    st.write(f"Dataset loaded with {len(dataset)} entries.")

-                    # Let user select which column contains the resume text
-                    if len(dataset.column_names) > 0:
-                        text_column = st.selectbox(
-                            "Select column containing resume text:",
-                            dataset.column_names
-                        )

-                        if st.button("Process Hugging Face Dataset"):
-                            # Extract text from the selected column
-                            for i, item in enumerate(dataset):
-                                if text_column in item:
-                                    text = str(item[text_column])
-                                    if text:
-                                        resume_texts.append(text)
-                                        # Use index or id field as filename
-                                        file_name = f"resume_{i}.txt"
-                                        if 'id' in item:
-                                            file_name = f"resume_{item['id']}.txt"
-                                        file_names.append(file_name)

-                            st.session_state.resumes_uploaded = True
-                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
-                except Exception as e:
-                    st.error(f"Error loading dataset: {str(e)}")
-                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")

-# Process button
-if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
-    with st.spinner("Processing job description and resumes..."):
-        # Check if we have too many resumes to process at once
-        large_dataset = len(resume_texts) > 1000
-
-        # Get job description embedding
-        job_embedding = screener.get_embedding(job_description)
-
-        # For large datasets, we need to process in batches
-        if large_dataset:
-            st.info(f"Processing {len(resume_texts)} resumes in batches to manage memory usage")
-
-            # Process in batches of 500 resumes
-            batch_size = 500
-            all_hybrid_scores = []
-            all_semantic_scores = []
-            all_bm25_scores = []
-
-            # Calculate BM25 scores first (doesn't require GPU)
-            bm25_scores = screener.calculate_bm25_scores(resume_texts, job_description)

-            # Process embeddings in batches
-            progress_bar = st.progress(0)
-            for i in range(0, len(resume_texts), batch_size):
-                batch_end = min(i + batch_size, len(resume_texts))
-                batch_texts = resume_texts[i:batch_end]
-
-                st.info(f"Processing batch {i//batch_size + 1}/{(len(resume_texts) + batch_size - 1)//batch_size} " +
-                        f"(resumes {i+1}-{batch_end})")
-
-                # Get resume embeddings for this batch
-                batch_embeddings = []
-                for j, text in enumerate(batch_texts):
-                    embedding = screener.get_embedding(text)
-                    batch_embeddings.append(embedding)
-                    progress = (i + j + 1) / len(resume_texts)
-                    progress_bar.progress(progress)
-
-                # Calculate semantic scores for this batch
-                batch_semantic_scores = []
-                for emb in batch_embeddings:
-                    # Normalize the embeddings for cosine similarity
-                    emb_norm = emb / np.linalg.norm(emb)
-                    job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
-
-                    # Calculate cosine similarity
-                    similarity = np.dot(emb_norm, job_emb_norm)
-                    batch_semantic_scores.append(similarity)
-
-                # Store scores for this batch
-                all_semantic_scores.extend(batch_semantic_scores)

-                # Explicitly clear GPU memory after processing each batch
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
-            # Calculate hybrid scores
-            semantic_scores = all_semantic_scores
-            keyword_weight = 1.0 - semantic_weight

-            # Normalize BM25 scores if they're not all zeros
-            if bm25_scores and max(bm25_scores) > 0:
-                bm25_scores = [score / max(bm25_scores) for score in bm25_scores]

-            # Calculate final hybrid scores
-            hybrid_scores = [
-                (semantic_weight * sem_score) + (keyword_weight * bm25_score)
-                for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
-            ]
-        else:
-            # Regular processing for smaller datasets
-            # Get resume embeddings
-            resume_embeddings = []
-            progress_bar = st.progress(0)
-            for i, text in enumerate(resume_texts):
-                embedding = screener.get_embedding(text)
-                resume_embeddings.append(embedding)
-                progress_bar.progress((i + 1) / len(resume_texts))
-
-            # Calculate hybrid scores
-            hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
-                resume_texts,
-                resume_embeddings,
-                job_embedding,
-                semantic_weight,
-                use_faiss
-            )
-
-        # Get top candidates
-        combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
-        sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
-        top_candidates = sorted_data[:int(top_k)]
-
-        # Create results with explanations if enabled
-        results = []
-        for name, text, score, semantic_score, bm25_score in top_candidates:
-            # Extract skills for this resume
-            skills = screener.extract_skills(text, job_description)

-            result = {
-                "filename": name,
-                "score": score,
-                "semantic_score": semantic_score,
-                "keyword_score": bm25_score,
-                "text_preview": text[:500] + "...",
-                "matched_phrases": screener.extract_key_phrases(text, job_description),
-                "skills": skills
-            }

-            if use_explanation:
-                explanation = screener.generate_explanation(
-                    text,
-                    job_description,
-                    score,
-                    semantic_score,
-                    bm25_score,
-                    skills
-                )
-                result["explanation"] = explanation
-            else:
-                result["explanation"] = ""
-
-            results.append(result)
-
-        st.session_state.results = results
-        st.success(f"Found top {len(results)} candidates!")

-# Display results
 if st.session_state.results:
-    st.header("3. Results")

-    # Create a DataFrame for download
-    df_data = []
     for result in st.session_state.results:
-        df_data.append({
-            "Filename": result["filename"],
-            "Score": result["score"],
-            "Semantic Score": result["semantic_score"],
-            "Keyword Score": result["keyword_score"],
-            "Skills": ", ".join(result["skills"]),
-            "Explanation": result["explanation"]
         })

-    results_df = pd.DataFrame(df_data)

-    # Display download link
-    st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True)

-    # Display individual results
-    for i, result in enumerate(st.session_state.results):
-        with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"):
-            col1, col2 = st.columns([1, 1])

             with col1:
-                st.subheader("Scores")
-                st.write(f"Total Score: {result['score']:.4f}")
-                st.write(f"Semantic Score: {result['semantic_score']:.4f}")
-                st.write(f"Keyword Score: {result['keyword_score']:.4f}")

-                st.subheader("Matched Skills")
-                if result["skills"]:
-                    for skill in result["skills"]:
-                        st.write(f"• {skill}")
-                else:
-                    st.write("No specific skills matched.")

             with col2:
-                st.subheader("Explanation")
-                st.write(result["explanation"])
-
-                st.subheader("Key Matches")
-                for phrase in result["matched_phrases"]:
-                    st.markdown(f"{phrase}")
-
-                st.subheader("Resume Preview")
-                st.text_area("", result["text_preview"], height=150, disabled=True)
-
-    # Visualization of scores
-    st.subheader("Score Comparison")
-
-    # Prepare data for visualization
-    chart_data = pd.DataFrame({
-        "Resume": [result["filename"] for result in st.session_state.results],
-        "Semantic Score": [result["semantic_score"] for result in st.session_state.results],
-        "Keyword Score": [result["keyword_score"] for result in st.session_state.results],
-        "Total Score": [result["score"] for result in st.session_state.results]
-    })
-
-    # Display as a bar chart
-    st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]])

 # Footer
 st.markdown("---")
-st.markdown("Built with Streamlit and Hugging Face models (NV-Embed-v2 and Qwen3-14B)")
 import streamlit as st
 import pandas as pd
 import numpy as np
 import torch
 import os
 import tempfile
 import base64
+import re
+import io
 from rank_bm25 import BM25Okapi
+from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from nltk.tokenize import word_tokenize, sent_tokenize
 from tqdm import tqdm
+import pdfplumber
 import PyPDF2
 from docx import Document
 import csv
+from datasets import load_dataset
+import gc

 # Download NLTK resources
 try:
 except LookupError:
     nltk.download('punkt')

 # Set page configuration
 st.set_page_config(
+    page_title="AI Resume Screener",
+    page_icon="🎯",
     layout="wide",
     initial_sidebar_state="expanded"
 )

+# Sidebar configuration
 with st.sidebar:
+    st.title("⚙️ Configuration")

     # Ranking weights
     st.subheader("Ranking Weights")

     # Advanced options
     st.subheader("Advanced Options")
+    top_k = st.number_input("Number of results to display", min_value=1, max_value=50, value=10, step=1)
+    use_explanation = st.checkbox("Generate AI Explanations", value=True)

     st.markdown("---")
+    st.markdown("### 🤖 Models Used")
+    st.markdown("- **Embedding**: NVIDIA NV-Embed-v2")
+    st.markdown("- **Explanation**: Qwen3-14B (4-bit)")
+    st.markdown("### 📊 About")
+    st.markdown("This app uses hybrid ranking combining semantic similarity with keyword matching to find the best candidates for job positions.")

+# Initialize session state
+if 'embedding_model' not in st.session_state:
+    st.session_state.embedding_model = None
+if 'explanation_model' not in st.session_state:
+    st.session_state.explanation_model = None
 if 'results' not in st.session_state:
     st.session_state.results = []
+@st.cache_resource
+def load_embedding_model():
+    """Load and cache the embedding model"""
+    try:
+        with st.spinner("🔄 Loading NVIDIA NV-Embed-v2 model..."):
+            tokenizer = AutoTokenizer.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True)
+            model = AutoModel.from_pretrained(
+                "nvidia/NV-Embed-v2",
+                trust_remote_code=True,
+                device_map="auto",
+                torch_dtype=torch.float16
+            )
+        st.success("✅ Embedding model loaded successfully!")
+        return model, tokenizer
+    except Exception as e:
+        st.error(f"❌ Error loading embedding model: {str(e)}")
+        return None, None
+
+@st.cache_resource
+def load_explanation_model():
+    """Load and cache the explanation model with quantization"""
+    if not use_explanation:
+        return None, None

+    try:
+        with st.spinner("🔄 Loading Qwen3-14B model with 4-bit quantization..."):
+            # Configure 4-bit quantization
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct", trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(
+                "Qwen/Qwen2.5-14B-Instruct",
+                quantization_config=quantization_config,
+                device_map="auto",
+                trust_remote_code=True,
+                torch_dtype=torch.float16
+            )
+        st.success("✅ Explanation model loaded successfully!")
+        return model, tokenizer
+    except Exception as e:
+        st.error(f"❌ Error loading explanation model: {str(e)}")
+        return None, None
+
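As a rough sanity check on why the NF4 config above makes a 14B-parameter model fit on a single GPU: 4-bit weights take about half a byte per parameter, versus two bytes for float16. Back-of-the-envelope only — this ignores activations, the KV cache, and quantization metadata:

```python
params = 14e9               # ~14B parameters
bytes_fp16 = params * 2     # ≈ 26 GiB of weights as float16
bytes_nf4 = params * 0.5    # ≈ 6.5 GiB of weights at 4 bits each
print(f"fp16 ≈ {bytes_fp16 / 1024**3:.1f} GiB, nf4 ≈ {bytes_nf4 / 1024**3:.1f} GiB")
```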
115
+ class ResumeScreener:
116
+ def __init__(self):
117
+ # Load models
118
+ self.embedding_model, self.embedding_tokenizer = load_embedding_model()
119
+ if use_explanation:
120
+ self.explanation_model, self.explanation_tokenizer = load_explanation_model()
121
+ else:
122
+ self.explanation_model, self.explanation_tokenizer = None, None
123
 
124
+ def extract_text_from_file(self, file_path, file_type):
125
  """Extract text from various file types"""
126
  try:
127
  if file_type == "pdf":
128
+ with open(file_path, 'rb') as file:
129
+ with pdfplumber.open(file) as pdf:
 
 
 
 
 
 
 
130
  text = ""
131
+ for page in pdf.pages:
 
132
  text += page.extract_text() or ""
133
+
134
+ if not text.strip():
135
+ # Fallback to PyPDF2
136
+ file.seek(0)
137
+ reader = PyPDF2.PdfReader(file)
138
+ text = ""
139
+ for page in reader.pages:
140
+ text += page.extract_text() or ""
141
+ return text
142
+
143
  elif file_type == "docx":
144
+ doc = Document(file_path)
145
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
146
 
147
  elif file_type == "txt":
148
+ with open(file_path, 'r', encoding='utf-8') as file:
149
+ return file.read()
150
+
151
  elif file_type == "csv":
152
+ with open(file_path, 'r', encoding='utf-8') as file:
153
+ csv_reader = csv.reader(file)
154
+ return " ".join([" ".join(row) for row in csv_reader])
155
+
 
 
 
 
 
 
156
  except Exception as e:
157
+ st.error(f"Error extracting text from {file_path}: {str(e)}")
158
  return ""
159
 
160
  def get_embedding(self, text):
161
+ """Generate embedding for text"""
162
+ if self.embedding_model is None:
163
+ return np.zeros(4096) # NV-Embed-v2 dimension
 
164
 
165
  try:
166
+ # Truncate text to avoid memory issues
167
+ text = text[:8192] # Reasonable limit for NV-Embed-v2
168
+
169
+ inputs = self.embedding_tokenizer(
170
+ text,
171
+ return_tensors="pt",
172
+ truncation=True,
173
+ max_length=512,
174
+ padding=True
175
+ )
176
 
177
+ # Move to same device as model
178
+ device = next(self.embedding_model.parameters()).device
179
  inputs = {k: v.to(device) for k, v in inputs.items()}
180
 
181
  with torch.no_grad():
182
+ outputs = self.embedding_model(**inputs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ # Extract embeddings - NV-Embed-v2 specific
185
+ if hasattr(outputs, 'pooler_output'):
186
+ embeddings = outputs.pooler_output
187
+ elif hasattr(outputs, 'last_hidden_state'):
188
+ embeddings = outputs.last_hidden_state.mean(dim=1)
 
 
 
 
 
 
 
 
 
189
  else:
190
+ embeddings = outputs[0].mean(dim=1)
 
 
 
 
 
 
 
 
 
 
 
191
 
192
+ return embeddings.cpu().numpy().squeeze()
193
+
 
 
194
  except Exception as e:
195
  st.error(f"Error generating embedding: {str(e)}")
196
+ return np.zeros(4096)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  def calculate_bm25_scores(self, resume_texts, job_description):
199
  """Calculate BM25 scores for keyword matching"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  try:
201
+ job_tokens = word_tokenize(job_description.lower())
202
+ corpus = [word_tokenize(text.lower()) for text in resume_texts if text.strip()]
203
+
204
+ if not corpus:
205
+ return [0.0] * len(resume_texts)
206
+
207
+ bm25 = BM25Okapi(corpus)
208
+ scores = bm25.get_scores(job_tokens)
209
+ return scores.tolist()
210
+
 
 
 
 
 
 
 
 
211
  except Exception as e:
212
+ st.error(f"Error calculating BM25 scores: {str(e)}")
213
  return [0.0] * len(resume_texts)
214
 
215
+ def calculate_hybrid_scores(self, resume_texts, job_description):
216
+ """Calculate hybrid scores combining semantic and keyword matching"""
217
+ # Get job embedding
218
+ job_embedding = self.get_embedding(job_description)
219
+
220
+ # Get resume embeddings
221
+ resume_embeddings = []
222
+ progress_bar = st.progress(0)
223
+ for i, text in enumerate(resume_texts):
224
+ embedding = self.get_embedding(text)
225
+ resume_embeddings.append(embedding)
226
+ progress_bar.progress((i + 1) / len(resume_texts))
227
+
228
+ # Calculate semantic scores (cosine similarity)
229
+ semantic_scores = []
230
+ for resume_emb in resume_embeddings:
231
+ job_norm = job_embedding / (np.linalg.norm(job_embedding) + 1e-8)
232
+ resume_norm = resume_emb / (np.linalg.norm(resume_emb) + 1e-8)
233
+ similarity = np.dot(job_norm, resume_norm)
234
+ semantic_scores.append(float(similarity))
 
 
 
 
 
 
 
 
 
 
235
 
236
  # Calculate BM25 scores
237
  bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
238
 
239
  # Normalize BM25 scores
240
+ if bm25_scores and max(bm25_scores) > 0:
241
+ max_bm25 = max(bm25_scores)
242
+ bm25_scores = [score / max_bm25 for score in bm25_scores]
243
 
244
  # Calculate hybrid scores
 
245
  hybrid_scores = [
246
  (semantic_weight * sem_score) + (keyword_weight * bm25_score)
247
  for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
 
250
  return hybrid_scores, semantic_scores, bm25_scores
251
 
252
  def extract_skills(self, text, job_description):
253
+ """Extract skills from resume based on job description"""
254
+ # Common tech skills and job-related terms
255
+ common_skills = [
256
+ "python", "java", "javascript", "react", "node.js", "sql", "html", "css",
257
+ "aws", "azure", "docker", "kubernetes", "git", "agile", "scrum", "ci/cd",
258
+ "machine learning", "data science", "artificial intelligence", "tensorflow",
259
+ "pytorch", "pandas", "numpy", "scikit-learn", "mysql", "postgresql",
260
+ "mongodb", "redis", "elasticsearch", "spark", "hadoop", "tableau", "powerbi"
261
+ ]
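+ # This list is only a heuristic seed; matching below is case-insensitive
+ # and limited to exact substring hits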
 
 
262
 
263
+ # Extract skills from job description
264
+ job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
 
 
265
 
266
+ # Find matching skills
267
+ found_skills = []
268
+ text_lower = text.lower()
 
269
 
270
+ # Check common skills
271
+ for skill in common_skills:
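+ # match against the raw job description text, since joining a set of
+ # tokens has arbitrary order and would miss multi-word skills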
272
+ if skill in text_lower and skill in job_description.lower():
273
+ found_skills.append(skill)
274
 
275
+ # Check job-specific terms
276
+ for word in job_words:
277
+ if len(word) > 3 and word in text_lower:
278
+ found_skills.append(word)
 
 
279
 
280
+ return list(set(found_skills))[:10] # up to 10 unique skills (set order is arbitrary)
 
 
281
 
282
  def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
283
+ """Generate explanation using Qwen model"""
284
+ if self.explanation_model is None or self.explanation_tokenizer is None:
285
+ return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
286
+
287
+ try:
288
+ # Create prompt
289
+ prompt = f"""As a recruitment AI assistant, explain why this resume scored {score:.2f} for the given job position.
290
+
291
+ Job Requirements:
292
+ {job_description[:500]}...
293
+
294
+ Resume Summary:
295
+ {resume_text[:800]}...
296
+
297
+ Scores:
298
+ - Overall: {score:.2f}/1.0
299
+ - Semantic Match: {semantic_score:.2f}/1.0
300
+ - Keyword Match: {bm25_score:.2f}/1.0
301
+ - Key Skills: {', '.join(skills[:5])}
302
+
303
+ Provide a concise 2-3 sentence explanation of the match quality and key strengths."""
304
+
305
+ # Generate response
306
+ messages = [{"role": "user", "content": prompt}]
307
+ text = self.explanation_tokenizer.apply_chat_template(
308
+ messages, tokenize=False, add_generation_prompt=True
309
  )
 
 
 
310
 
311
+ inputs = self.explanation_tokenizer(text, return_tensors="pt").to(self.explanation_model.device)
312
 
313
+ with torch.no_grad():
314
+ outputs = self.explanation_model.generate(
315
+ **inputs,
316
+ max_new_tokens=150,
317
+ temperature=0.7,
318
+ do_sample=True,
319
+ pad_token_id=self.explanation_tokenizer.eos_token_id
320
+ )
321
+
322
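+ # Slice off the prompt tokens so only the newly generated text is decoded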
+ response = self.explanation_tokenizer.decode(
323
+ outputs[0][inputs.input_ids.shape[1]:],
324
+ skip_special_tokens=True
325
+ )
326
 
327
+ return response.strip()[:400] # Limit length
 
328
 
329
+ except Exception as e:
330
+ st.warning(f"AI explanation failed: {str(e)}")
331
+ return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
332
+
333
+ def _generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
334
+ """Fallback explanation generation"""
335
+ if score > 0.8:
336
+ quality = "excellent"
337
+ elif score > 0.6:
338
+ quality = "good"
339
+ elif score > 0.4:
340
+ quality = "moderate"
341
+ else:
342
+ quality = "limited"
343
+
344
+ explanation = f"This resume shows {quality} alignment with the job requirements (score: {score:.2f}). "
345
+
346
+ if semantic_score > bm25_score:
347
+ explanation += f"Strong conceptual match ({semantic_score:.2f}) with relevant experience. "
348
+ else:
349
+ explanation += f"Good keyword coverage ({bm25_score:.2f}) of job requirements. "
350
+
351
+ if skills:
352
+ explanation += f"Key matching skills: {', '.join(skills[:3])}."
353
+
354
+ return explanation
355
 
356
+ def create_download_link(df, filename="resume_screening_results.csv"):
357
+ """Create download link for results"""
358
  csv = df.to_csv(index=False)
359
  b64 = base64.b64encode(csv.encode()).decode()
360
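+ # Embedding the CSV as a base64 data URI avoids writing any file server-side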
+ return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-btn">📥 Download Results CSV</a>'
 
361
 
362
+ # Main App Interface
363
+ st.title("🎯 AI-Powered Resume Screener")
364
+ st.markdown("*Find the perfect candidates using advanced AI matching*")
 
 
 
365
  st.markdown("---")
366
 
367
+ # Initialize screener
368
+ if st.session_state.embedding_model is None:
369
+ screener = ResumeScreener()
370
+ st.session_state.embedding_model = screener.embedding_model
371
+ st.session_state.explanation_model = screener.explanation_model
372
+ else:
373
+ screener = ResumeScreener()
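+ # (assumes ResumeScreener.__init__ picks the already-loaded models back up
+ # from st.session_state on reruns; otherwise they would reload every time)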
374
 
375
+ # Job Description Input
376
+ st.header("📝 Step 1: Enter Job Description")
377
  job_description = st.text_area(
378
+ "Enter the complete job description or requirements:",
379
+ height=150,
380
+ placeholder="Paste the job description here, including required skills, experience, and qualifications..."
381
  )
382
 
383
+ # Resume Input Options
384
+ st.header("📄 Step 2: Upload Resumes")
385
+
386
+ input_method = st.radio(
387
+ "Choose input method:",
388
+ ["📁 Upload Files", "🗂️ Load from CSV Dataset", "🔗 Load from Hugging Face Dataset"]
389
  )
390
 
 
391
  resume_texts = []
392
  file_names = []
393
 
394
+ if input_method == "📁 Upload Files":
395
  uploaded_files = st.file_uploader(
396
  "Upload resume files",
397
+ type=["pdf", "docx", "txt"],
398
  accept_multiple_files=True,
399
+ help="Supported formats: PDF, DOCX, TXT"
400
  )
401
 
402
  if uploaded_files:
403
+ with st.spinner(f"🔄 Processing {len(uploaded_files)} files..."):
404
  for file in uploaded_files:
405
  file_type = file.name.split('.')[-1].lower()
406
 
407
+ # Save temporary file
408
  with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
409
  tmp_file.write(file.getvalue())
410
  tmp_path = tmp_file.name
411
 
412
+ # Extract text
413
  text = screener.extract_text_from_file(tmp_path, file_type)
414
+ if text.strip():
415
  resume_texts.append(text)
416
  file_names.append(file.name)
417
 
418
+ # Cleanup
419
  os.unlink(tmp_path)
420
+
421
+ if resume_texts:
422
+ st.success(f"Successfully processed {len(resume_texts)} resumes")
423
+
424
+ elif input_method == "🗂️ Load from CSV Dataset":
425
+ csv_file = st.file_uploader("Upload CSV file with resume data", type=["csv"])
 
 
 
426
 
427
+ if csv_file:
428
+ try:
429
+ df = pd.read_csv(csv_file)
430
+ st.write("**CSV Preview:**")
431
+ st.dataframe(df.head())
 
 
 
432
 
433
+ text_column = st.selectbox(
434
+ "Select column containing resume text:",
435
+ df.columns.tolist()
436
+ )
 
 
 
437
 
438
+ name_column = st.selectbox(
439
+ "Select column for candidate names/IDs (optional):",
440
+ ["Use Index"] + df.columns.tolist()
441
+ )
442
+
443
+ if st.button("🚀 Process CSV Data"):
444
+ with st.spinner("🔄 Processing CSV data..."):
445
+ for idx, row in df.iterrows():
446
+ text = str(row[text_column])
447
+ if text and text.strip() and text.lower() != 'nan':
448
+ resume_texts.append(text)
 
 
 
449
 
450
+ if name_column == "Use Index":
451
+ file_names.append(f"Resume_{idx}")
452
+ else:
453
+ file_names.append(str(row[name_column]))
 
454
 
455
+ if resume_texts:
456
+ st.success(f"✅ Successfully loaded {len(resume_texts)} resumes from CSV")
 
457
 
458
+ except Exception as e:
459
+ st.error(f"❌ Error processing CSV: {str(e)}")
460
+
461
+ elif input_method == "🔗 Load from Hugging Face Dataset":
462
+ st.markdown("**Quick Load:** [Resume Atlas Dataset](https://huggingface.co/datasets/ahmedheakl/resume-atlas)")
463
+
464
+ col1, col2 = st.columns([2, 1])
465
+ with col1:
466
+ dataset_name = st.text_input(
467
+ "Dataset name:",
468
+ value="ahmedheakl/resume-atlas",
469
+ help="Enter Hugging Face dataset name"
 
 
470
  )
471
+ with col2:
472
+ dataset_split = st.selectbox("Split:", ["train", "test", "validation"], index=0)
473
+
474
+ if st.button("🔗 Load from Hugging Face"):
475
+ try:
476
+ with st.spinner(f"🔄 Loading {dataset_name}..."):
477
+ dataset = load_dataset(dataset_name, split=dataset_split)
478
 
479
+ st.success(f"✅ Loaded dataset with {len(dataset)} entries")
480
+ st.write("**Dataset Preview:**")
481
+
482
+ # Show first few examples
483
+ preview_df = pd.DataFrame(dataset[:5])
484
+ st.dataframe(preview_df)
485
+
486
+ # Column selection
487
+ text_column = st.selectbox(
488
+ "Select column with resume text:",
489
+ dataset.column_names,
490
+ index=dataset.column_names.index('resume_text') if 'resume_text' in dataset.column_names else 0
491
+ )
492
+
493
+ category_column = None
494
+ if 'category' in dataset.column_names:
495
+ category_column = st.selectbox(
496
+ "Filter by category (optional):",
497
+ ["All"] + list(set(dataset['category']))
498
  )
499
+
500
+ max_samples = st.slider("Maximum samples to load:", 10, min(1000, len(dataset)), 100)
501
+
502
+ if st.button("🚀 Process Dataset"):
503
+ with st.spinner("🔄 Processing dataset..."):
504
+ filtered_dataset = dataset
 
 
 
505
 
506
+ # Apply category filter
507
+ if category_column and category_column != "All":
508
+ filtered_dataset = dataset.filter(lambda x: x['category'] == category_column)
509
 
510
+ # Limit samples
511
+ sample_indices = list(range(min(max_samples, len(filtered_dataset))))
512
 
513
+ for idx in sample_indices:
514
+ item = filtered_dataset[idx]
515
+ text = str(item[text_column])
 
 
 
516
 
517
+ if text and text.strip() and text.lower() != 'nan':
518
+ resume_texts.append(text)
 
 
 
519
 
520
+ # Use ID or index for naming
521
+ if 'id' in item:
522
+ file_names.append(f"Resume_{item['id']}")
523
+ else:
524
+ file_names.append(f"Resume_{idx}")
525
+
526
+ if resume_texts:
527
+ st.success(f"✅ Successfully loaded {len(resume_texts)} resumes")
528
+
529
+ except Exception as e:
530
+ st.error(f"❌ Error loading dataset: {str(e)}")
531
 
532
+ # Processing and Results
533
+ if st.button("🔍 Find Best Candidates", disabled=not (job_description and resume_texts)):
534
+ if len(resume_texts) == 0:
535
+ st.error("❌ Please upload resumes first!")
536
+ elif not job_description.strip():
537
+ st.error("❌ Please enter a job description!")
538
+ else:
539
+ with st.spinner("🧠 AI is analyzing resumes..."):
540
+ # Calculate scores
541
+ hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
542
+ resume_texts, job_description
543
+ )
 
 
544
 
545
+ # Prepare results
546
+ results = []
547
+ for i, (name, text, hybrid_score, semantic_score, bm25_score) in enumerate(
548
+ zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores)
549
+ ):
550
+ # Extract skills
551
+ skills = screener.extract_skills(text, job_description)
552
+
553
+ # Generate explanation
554
+ explanation = ""
555
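+ # use_explanation is assumed to be a sidebar toggle defined earlier in app.py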
+ if use_explanation:
556
+ explanation = screener.generate_explanation(
557
+ text, job_description, hybrid_score, semantic_score, bm25_score, skills
558
+ )
 
 
 
559
 
560
+ results.append({
561
+ 'rank': i + 1,
562
+ 'name': name,
563
+ 'score': hybrid_score,
564
+ 'semantic_score': semantic_score,
565
+ 'keyword_score': bm25_score,
566
+ 'skills': skills,
567
+ 'explanation': explanation,
568
+ 'text_preview': text[:300] + "..." if len(text) > 300 else text
569
+ })
570
 
571
+ # Sort by score
572
+ results.sort(key=lambda x: x['score'], reverse=True)
 
573
 
574
+ # Update ranks
575
+ for i, result in enumerate(results):
576
+ result['rank'] = i + 1
 
 
 
577
 
578
+ # Store in session state
579
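+ # top_k is assumed to be a sidebar control defined earlier in app.py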
+ st.session_state.results = results[:top_k]
 
 
580
 
581
+ st.success(f"🎉 Analysis complete! Found top {len(st.session_state.results)} candidates")
 
 
 
582
 
583
+ # Display Results
584
  if st.session_state.results:
585
+ st.header("🏆 Top Candidates")
586
 
587
+ # Create summary dataframe
588
+ summary_data = []
589
  for result in st.session_state.results:
590
+ summary_data.append({
591
+ "Rank": result['rank'],
592
+ "Candidate": result['name'],
593
+ "Overall Score": f"{result['score']:.3f}",
594
+ "Semantic Score": f"{result['semantic_score']:.3f}",
595
+ "Keyword Score": f"{result['keyword_score']:.3f}",
596
+ "Key Skills": ", ".join(result['skills'][:3]) + ("..." if len(result['skills']) > 3 else ""),
597
  })
598
 
599
+ summary_df = pd.DataFrame(summary_data)
600
+ st.dataframe(summary_df, use_container_width=True)
601
+
602
+ # Download link
603
+ detailed_data = []
604
+ for result in st.session_state.results:
605
+ detailed_data.append({
606
+ "Rank": result['rank'],
607
+ "Candidate": result['name'],
608
+ "Overall_Score": result['score'],
609
+ "Semantic_Score": result['semantic_score'],
610
+ "Keyword_Score": result['keyword_score'],
611
+ "Skills": "; ".join(result['skills']),
612
+ "Explanation": result['explanation'],
613
+ "Resume_Preview": result['text_preview']
614
+ })
615
 
616
+ download_df = pd.DataFrame(detailed_data)
617
+ st.markdown(create_download_link(download_df), unsafe_allow_html=True)
618
 
619
+ # Detailed results
620
+ st.subheader("📋 Detailed Analysis")
621
+
622
+ for result in st.session_state.results:
623
+ with st.expander(f"🥇 #{result['rank']}: {result['name']} (Score: {result['score']:.3f})"):
624
+ col1, col2 = st.columns([1, 2])
625
 
626
  with col1:
627
+ st.metric("Overall Score", f"{result['score']:.3f}")
628
+ st.metric("Semantic Match", f"{result['semantic_score']:.3f}")
629
+ st.metric("Keyword Match", f"{result['keyword_score']:.3f}")
 
630
 
631
+ st.write("**🎯 Key Skills:**")
632
+ for skill in result['skills'][:8]:
633
+ st.write(f"• {skill}")
 
 
 
634
 
635
  with col2:
636
+ if result['explanation']:
637
+ st.write("**🤖 AI Analysis:**")
638
+ st.info(result['explanation'])
639
+
640
+ st.write("**📄 Resume Preview:**")
641
+ st.text_area("", result['text_preview'], height=150, disabled=True, key=f"preview_{result['rank']}")
642
+
643
+ # Score visualization
644
+ if len(st.session_state.results) > 1:
645
+ st.subheader("📊 Score Visualization")
646
+
647
+ chart_data = pd.DataFrame({
648
+ 'Candidate': [r['name'] for r in st.session_state.results],
649
+ 'Overall Score': [r['score'] for r in st.session_state.results],
650
+ 'Semantic Score': [r['semantic_score'] for r in st.session_state.results],
651
+ 'Keyword Score': [r['keyword_score'] for r in st.session_state.results]
652
+ })
653
+
654
+ st.bar_chart(chart_data.set_index('Candidate'))
655
+
656
+ # Memory cleanup
657
+ if st.button("🧹 Clear Memory"):
658
+ if torch.cuda.is_available():
659
+ torch.cuda.empty_cache()
660
+ gc.collect()
661
+ st.success("✅ Memory cleared!")
662
 
663
  # Footer
664
  st.markdown("---")
665
+ st.markdown(
666
+ """
667
+ <div style='text-align: center; color: #666;'>
668
+ 🚀 Powered by NVIDIA NV-Embed-v2 & Qwen3-14B | Built with Streamlit
669
+ </div>
670
+ """,
671
+ unsafe_allow_html=True
672
+ )
explanation_generator.py DELETED
@@ -1,223 +0,0 @@
1
- """
2
- Explanation Generator Module
3
-
4
- This module handles the generation of explanations for resume rankings
5
- using the Qwen3-14B model from Hugging Face.
6
- """
7
-
8
- import torch
9
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
10
- import os
11
- import re
12
- import sys
13
-
14
- # Use the alternative model loading approach
15
- try:
16
- # Try to import the functions from alt_models.py
17
- from alt_models import load_explanation_model
18
- USE_ALT_MODELS = True
19
- except ImportError:
20
- USE_ALT_MODELS = False
21
- # If import fails, we'll use the original approach
22
- # Add Replicate class workaround if not already defined
23
- try:
24
- from transformers.models.qwen2.modeling_qwen2 import Replicate
25
- except (ImportError, AttributeError):
26
- class Replicate(torch.nn.Module):
27
- """Workaround class for missing Replicate in Qwen models"""
28
- def __init__(self, module, num_replicas=1):
29
- super().__init__()
30
- self.module = module
31
- self.num_replicas = num_replicas
32
-
33
- def forward(self, *args, **kwargs):
34
- return self.module(*args, **kwargs)
35
-
36
- # Create module structure if it doesn't exist yet
37
- parent_modules = [
38
- "transformers.models",
39
- "transformers.models.qwen2",
40
- ]
41
-
42
- # Create all parent modules
43
- for module_path in parent_modules:
44
- if module_path not in sys.modules:
45
- sys.modules[module_path] = type('', (), {})
46
-
47
- # Create and add the Replicate class
48
- if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
49
- sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
50
- sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
51
-
52
- # Load Qwen3 model at initialization time
53
- print("Loading Qwen/Qwen3-14B model with 4-bit quantization...")
54
- QWEN_MODEL_NAME = "Qwen/Qwen3-14B"
55
-
56
- if USE_ALT_MODELS:
57
- # Use the alternative loading approach
58
- global_qwen_model, global_qwen_tokenizer = load_explanation_model(QWEN_MODEL_NAME)
59
- else:
60
- # Use original approach
61
- try:
62
- # Configure 4-bit quantization for better performance
63
- quantization_config = BitsAndBytesConfig(
64
- load_in_4bit=True,
65
- bnb_4bit_quant_type="nf4",
66
- bnb_4bit_compute_dtype=torch.float16,
67
- bnb_4bit_use_double_quant=True
68
- )
69
-
70
- # Load Qwen3 model and tokenizer
71
- global_qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, trust_remote_code=True)
72
- global_qwen_model = None
73
-
74
- # Check if we have enough resources to load the model
75
- if torch.cuda.is_available():
76
- gpu_memory = torch.cuda.get_device_properties(0).total_memory
77
- if gpu_memory >= 12 * (1024**3): # 12 GB (reduced memory requirement compared to 32B model)
78
- global_qwen_model = AutoModelForCausalLM.from_pretrained(
79
- QWEN_MODEL_NAME,
80
- quantization_config=quantization_config,
81
- device_map="auto",
82
- trust_remote_code=True,
83
- torch_dtype=torch.float16
84
- )
85
- print("Successfully loaded Qwen3-14B with 4-bit quantization")
86
- else:
87
- print("Not enough GPU memory, using template-based explanations")
88
- else:
89
- print("CUDA not available, using template-based explanations")
90
-
91
- except Exception as e:
92
- print(f"Error loading Qwen3-14B model: {str(e)}")
93
- print("Falling back to template-based explanations.")
94
- global_qwen_tokenizer = None
95
- global_qwen_model = None
96
-
97
- class ExplanationGenerator:
98
- def __init__(self, model_name="Qwen/Qwen3-14B"):
99
- """Initialize the explanation generator with the specified model"""
100
- self.model_name = model_name
101
- # Use globally pre-loaded model and tokenizer
102
- self.model = global_qwen_model
103
- self.tokenizer = global_qwen_tokenizer
104
- self.initialized = True
105
-
106
- def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
107
- """Generate explanation for why a resume was ranked highly"""
108
- # Use the model if it's available
109
- if self.model is not None and self.tokenizer is not None:
110
- try:
111
- # Prepare prompt for Qwen3-14B
112
- prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
113
-
114
- # Create messages for chat format
115
- messages = [
116
- {"role": "user", "content": prompt}
117
- ]
118
-
119
- # Apply chat template with thinking mode enabled
120
- text = self.tokenizer.apply_chat_template(
121
- messages,
122
- tokenize=False,
123
- add_generation_prompt=True,
124
- enable_thinking=True
125
- )
126
-
127
- # Tokenize
128
- inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
129
-
130
- # Generate response with recommended parameters for thinking mode
131
- output_ids = self.model.generate(
132
- **inputs,
133
- max_new_tokens=500,
134
- temperature=0.6,
135
- top_p=0.95,
136
- top_k=20
137
- )
138
-
139
- # Decode the response
140
- response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
141
-
142
- # Clean up the response
143
- cleaned_response = self._clean_response(response)
144
-
145
- return cleaned_response
146
-
147
- except Exception as e:
148
- print(f"Error generating explanation with Qwen3-14B: {str(e)}")
149
- # Fall back to template-based explanation
150
- return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
151
- else:
152
- # Use template-based explanation if model is not available
153
- return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
154
-
155
- def _create_prompt(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
156
- """Create a prompt for the explanation generation"""
157
- # Use only the first 1000 characters of the resume to keep prompt size manageable
158
- resume_excerpt = resume_text[:1000] + "..." if len(resume_text) > 1000 else resume_text
159
-
160
- prompt = f"""You are an AI assistant helping a recruiter understand why a candidate's resume was matched with a job posting.
161
-
162
- The resume has been assigned the following scores:
163
- - Overall Match Score: {score:.2f} out of 1.0
164
- - Semantic Relevance Score: {semantic_score:.2f} out of 1.0
165
- - Keyword Match Score: {keyword_score:.2f} out of 1.0
166
-
167
- The job description is:
168
- ```
169
- {job_description}
170
- ```
171
-
172
- Based on analysis, the resume contains these skills relevant to the job: {', '.join(skills)}
173
-
174
- Resume excerpt:
175
- ```
176
- {resume_excerpt}
177
- ```
178
-
179
- Please provide a short explanation (3-5 sentences) of why this resume received these scores and how well it matches the job requirements. Focus on the relationship between the candidate's experience and the job requirements."""
180
-
181
- return prompt
182
-
183
- def _clean_response(self, response):
184
- """Clean the response from the model"""
185
- # Remove any thinking or internal processing tokens
186
- response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
187
-
188
- # Limit to a reasonable length
189
- if len(response) > 500:
190
- sentences = response.split('.')
191
- shortened = '.'.join(sentences[:5]) + '.'
192
- return shortened
193
-
194
- return response
195
-
196
- def _generate_template_explanation(self, score, semantic_score, keyword_score, skills):
197
- """Generate a template-based explanation when the model is not available"""
198
- # Simple template-based explanation
199
- if score > 0.8:
200
- quality = "excellent"
201
- elif score > 0.6:
202
- quality = "good"
203
- elif score > 0.4:
204
- quality = "moderate"
205
- else:
206
- quality = "limited"
207
-
208
- explanation = f"This resume shows {quality} alignment with the job requirements, with an overall score of {score:.2f}. "
209
-
210
- if semantic_score > keyword_score:
211
- explanation += f"The candidate's experience demonstrates strong semantic relevance ({semantic_score:.2f}) to the position, though specific keyword matches ({keyword_score:.2f}) could be improved. "
212
- else:
213
- explanation += f"The resume contains many relevant keywords ({keyword_score:.2f}), but could benefit from better contextual alignment ({semantic_score:.2f}) with the job requirements. "
214
-
215
- if skills:
216
- if len(skills) > 3:
217
- explanation += f"Key skills identified include {', '.join(skills[:3])}, and {len(skills)-3} others that match the job requirements."
218
- else:
219
- explanation += f"Key skills identified include {', '.join(skills)}."
220
- else:
221
- explanation += "No specific skills were identified that directly match the requirements."
222
-
223
- return explanation
 
 
fix_dependencies.py DELETED
@@ -1,76 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Dependency fixer for Resume Screener and Skill Extractor
4
- This script ensures all dependencies are properly installed with compatible versions.
5
- """
6
-
7
- import sys
8
- import subprocess
9
- import pkg_resources
10
- import os
11
-
12
- def install(package):
13
- """Install a package using pip"""
14
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
15
-
16
- def install_with_message(package, message=None):
17
- """Install a package with an optional message"""
18
- if message:
19
- print(f"\n{message}")
20
- print(f"Installing {package}...")
21
- install(package)
22
-
23
- def main():
24
- print("Running dependency fixer for Resume Screener and Skill Extractor...")
25
-
26
- # Install core dependencies first
27
- install_with_message("pip==23.1.2", "Upgrading pip to ensure compatibility")
28
- install_with_message("setuptools==68.0.0", "Installing compatible setuptools")
29
-
30
- # Check if we're in a Hugging Face Space
31
- in_hf_space = os.environ.get("SPACE_ID") is not None
32
-
33
- # Install key libraries with specific versions to ensure compatibility
34
- dependencies = [
35
- ("streamlit==1.31.0", "Installing Streamlit for the web interface"),
36
- ("pdfplumber==0.10.1", "Installing PDF processing libraries"),
37
- ("PyPDF2==3.0.1", None),
38
- ("python-docx==1.0.1", None),
39
- ("rank-bm25==0.2.2", "Installing BM25 ranking library"),
40
- ("tqdm==4.66.1", "Installing progress bar utility"),
41
- ("faiss-cpu==1.7.4", "Installing FAISS for vector similarity search"),
42
- ("huggingface-hub==0.20.3", "Installing Hugging Face Hub"),
43
- ("transformers==4.36.2", "Installing Transformers"),
44
- ("sentence-transformers==2.2.2", "Installing Sentence Transformers"),
45
- ("torch==2.1.2", "Installing PyTorch"),
46
- ("nltk==3.8.1", "Installing NLTK for text processing"),
47
- ("pandas==2.1.3", "Installing data processing libraries"),
48
- ("numpy==1.24.3", None),
49
- ("plotly==5.18.0", "Installing visualization libraries"),
50
- ("spacy==3.7.2", "Installing spaCy for NLP"),
51
- ]
52
-
53
- # Install all dependencies
54
- for package, message in dependencies:
55
- install_with_message(package, message)
56
-
57
- # Download required NLTK data
58
- print("\nDownloading NLTK data...")
59
- install("nltk")
60
- import nltk
61
- nltk.download('punkt')
62
-
63
- # Download spaCy model if not in a Hugging Face Space
64
- # (Spaces should include this in the requirements.txt)
65
- if not in_hf_space:
66
- print("\nDownloading spaCy model...")
67
- try:
68
- subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
69
- except:
70
- install("https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz")
71
-
72
- print("\nDependency installation complete!")
73
- print("You can now run the Resume Screener with: streamlit run app.py")
74
-
75
- if __name__ == "__main__":
76
- main()
 
 
requirements.txt CHANGED
@@ -1,22 +1,18 @@
1
  streamlit==1.31.0
 
 
2
  pdfplumber==0.10.1
3
  PyPDF2==3.0.1
4
  python-docx==1.0.1
5
- spacy==3.7.2
6
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
7
- transformers==4.48.0
8
- torch==2.1.2
9
  nltk==3.8.1
10
  faiss-cpu==1.7.4
11
  rank-bm25==0.2.2
12
- sentence-transformers==2.7.0
13
- plotly==5.18.0
14
  pandas==2.1.3
15
  numpy==1.24.3
16
  tqdm==4.66.1
17
  huggingface-hub==0.27.1
18
- einops
19
- bitsandbytes>=0.41.0
20
- accelerate>=0.23.0
21
- optimum>=1.13.1
22
- safetensors>=0.3.1
 
 
1
  streamlit==1.31.0
2
+ transformers==4.48.0
3
+ torch==2.1.2
4
  pdfplumber==0.10.1
5
  PyPDF2==3.0.1
6
  python-docx==1.0.1
 
 
7
  nltk==3.8.1
8
  faiss-cpu==1.7.4
9
  rank-bm25==0.2.2
 
 
10
  pandas==2.1.3
11
  numpy==1.24.3
12
  tqdm==4.66.1
13
  huggingface-hub==0.27.1
14
+ bitsandbytes==0.44.1
15
+ accelerate==0.27.2
16
+ datasets==2.18.0
17
+ sentence-transformers==2.7.0
18
+ plotly==5.18.0