root committed on
Commit 0bfe6dd · 1 Parent(s): ba2dfe6
Files changed (3)
  1. app.py +39 -153
  2. explanation_generator.py +74 -132
  3. requirements.txt +1 -1
app.py CHANGED
@@ -26,6 +26,20 @@ try:
 except LookupError:
     nltk.download('punkt')
 
+# Initialize embedding model at startup
+EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
+print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
+
+try:
+    # Load embedding model and tokenizer
+    global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
+    global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
+    print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
+except Exception as e:
+    print(f"Error loading embedding model: {str(e)}")
+    global_embedding_tokenizer = None
+    global_embedding_model = None
+
 # Set page configuration
 st.set_page_config(
     page_title="Resume Screener & Skill Extractor",
@@ -34,38 +48,6 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 
-# Hugging Face Spaces optimization
-RUNNING_ON_SPACES = os.environ.get('SPACE_ID') is not None
-if RUNNING_ON_SPACES:
-    st.sidebar.info("🚀 Running on Hugging Face Spaces")
-
-    # Set up cache directory structure
-    CACHE_DIR = os.path.join(os.getcwd(), ".cache")
-    HF_HOME = os.path.join(CACHE_DIR, "huggingface")
-    os.environ['TRANSFORMERS_CACHE'] = os.path.join(HF_HOME, "transformers")
-    os.environ['HF_HOME'] = HF_HOME
-    os.environ['HF_DATASETS_CACHE'] = os.path.join(HF_HOME, "datasets")
-
-    # Create cache directories if they don't exist
-    for dir_path in [CACHE_DIR, HF_HOME, os.environ['TRANSFORMERS_CACHE'], os.environ['HF_DATASETS_CACHE']]:
-        if not os.path.exists(dir_path):
-            os.makedirs(dir_path)
-
-    # Use downloaded models if available (avoid downloading on every run)
-    os.environ['TRANSFORMERS_OFFLINE'] = '1'
-
-    # Spaces optimization flags
-    USE_PIPELINE = True
-    OPTIMIZE_MEMORY = True
-
-    # Print setup information
-    print(f"Running on Hugging Face Spaces: {os.environ.get('SPACE_ID')}")
-    print(f"Cache directory: {CACHE_DIR}")
-    print(f"HF Home: {HF_HOME}")
-else:
-    USE_PIPELINE = False
-    OPTIMIZE_MEMORY = False
-
 # Sidebar for model selection and weights
 with st.sidebar:
     st.title("Configuration")
@@ -95,15 +77,9 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)
 
-    # Hugging Face Spaces optimization options
-    if not RUNNING_ON_SPACES:
-        st.subheader("Hugging Face Spaces Optimization")
-        USE_PIPELINE = st.checkbox("Use pipeline API for faster loading", value=USE_PIPELINE)
-        OPTIMIZE_MEMORY = st.checkbox("Optimize memory usage", value=OPTIMIZE_MEMORY)
-
     # Memory optimization options
     st.subheader("Memory Optimization")
-    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=OPTIMIZE_MEMORY)
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
     clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
     gc_collect_interval = st.number_input(
         "Garbage collection interval (files)",
@@ -126,92 +102,31 @@ if 'job_description' not in st.session_state:
 if 'results' not in st.session_state:
     st.session_state.results = []
 if 'embedding_model' not in st.session_state:
-    st.session_state.embedding_model = None
+    st.session_state.embedding_model = global_embedding_model
 if 'tokenizer' not in st.session_state:
-    st.session_state.tokenizer = None
+    st.session_state.tokenizer = global_embedding_tokenizer
 if 'faiss_index' not in st.session_state:
     st.session_state.faiss_index = None
 if 'explanation_generator' not in st.session_state:
     st.session_state.explanation_generator = None
-if 'screener' not in st.session_state:
-    st.session_state.screener = None
 
 class ResumeScreener:
-    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B", load_immediately=True):
+    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
         """Initialize the ResumeScreener with the specified embedding model"""
         self.embedding_model_name = embedding_model_name
         self.explanation_model_name = explanation_model_name
-        self.model = None
-        self.tokenizer = None
-        self.embedding_pipeline = None
+        # Initialize with preloaded models
+        self.model = st.session_state.embedding_model
+        self.tokenizer = st.session_state.tokenizer
         self.faiss_index = None
         self.embedding_size = None
         self.explanation_generator = None
 
-        # Load models immediately if requested
-        if load_immediately:
-            with st.spinner("Loading models at startup..."):
-                self.load_model()
-                if use_explanation:
-                    self.load_explanation_generator()
-
-    def load_model(self):
-        """Load the embedding model from Hugging Face"""
-        if st.session_state.embedding_model is None:
-            with st.spinner(f"Loading model {self.embedding_model_name}..."):
-                try:
-                    # First try to use pipeline for more efficient loading
-                    try:
-                        from transformers import pipeline
-                        self.embedding_pipeline = pipeline(
-                            "feature-extraction",
-                            model=self.embedding_model_name,
-                            trust_remote_code=True,
-                            device_map="auto"
-                        )
-                        print(f"Successfully loaded {self.embedding_model_name} with pipeline API")
-                        self.model = self.embedding_pipeline.model
-                        self.tokenizer = self.embedding_pipeline.tokenizer
-                    except Exception as pipe_e:
-                        print(f"Error loading with pipeline API: {str(pipe_e)}")
-                        print("Falling back to direct model loading...")
-
-                        if "sentence-transformers" in self.embedding_model_name:
-                            self.model = SentenceTransformer(self.embedding_model_name)
-                        else:
-                            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
-                            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
-
-                    st.session_state.embedding_model = self.model
-                    st.session_state.tokenizer = self.tokenizer
-                    if self.embedding_pipeline:
-                        st.session_state.embedding_pipeline = self.embedding_pipeline
-
-                    # Get embedding size
-                    if "sentence-transformers" in self.embedding_model_name:
-                        self.embedding_size = self.model.get_sentence_embedding_dimension()
-                    else:
-                        # For non-sentence-transformers, we'll determine this after first embedding
-                        pass
-
-                except Exception as e:
-                    st.error(f"Error loading model: {str(e)}")
-                    st.stop()
-        else:
-            self.model = st.session_state.embedding_model
-            self.tokenizer = st.session_state.tokenizer
-            if 'embedding_pipeline' in st.session_state:
-                self.embedding_pipeline = st.session_state.embedding_pipeline
-
-    def load_explanation_generator(self):
-        """Load the explanation generator if needed"""
+        # Initialize explanation generator
         if use_explanation and st.session_state.explanation_generator is None:
-            with st.spinner(f"Loading explanation model {self.explanation_model_name}..."):
-                st.session_state.explanation_generator = ExplanationGenerator(
-                    self.explanation_model_name,
-                    load_immediately=True
-                )
-            self.explanation_generator = st.session_state.explanation_generator
+            with st.spinner("Initializing explanation generator..."):
+                st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
+            self.explanation_generator = st.session_state.explanation_generator
         elif use_explanation:
             self.explanation_generator = st.session_state.explanation_generator
 
@@ -259,42 +174,18 @@ class ResumeScreener:
 
     def get_embedding(self, text):
         """Generate text embedding for a given text"""
-        # Try using pipeline first if available
-        if self.embedding_pipeline:
-            try:
-                # Pipeline returns list of list of embeddings, we want just one vector
-                embeddings = self.embedding_pipeline(
-                    text,
-                    padding=True,
-                    truncation=True,
-                    max_length=512
-                )
-                # Mean pooling across token dimension for BERT-like models
-                embedding_np = np.mean(embeddings[0], axis=0)
-
-                # Set embedding size if not set
-                if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[0]
-
-                return embedding_np
-            except Exception as e:
-                print(f"Error using embedding pipeline: {str(e)}")
-                print("Falling back to direct embedding method...")
-
-        # Fall back to original method
-        if "sentence-transformers" in self.embedding_model_name:
-            # For sentence-transformers models
-            embedding = self.model.encode([text], convert_to_tensor=True, show_progress_bar=False)[0]
-            embedding_np = embedding.cpu().detach().numpy()
+        if self.model is None:
+            st.error("Embedding model not available. Please check your environment.")
+            return np.zeros(768)  # Default embedding size as fallback
 
-            # Set embedding size if not set
-            if self.embedding_size is None:
-                self.embedding_size = embedding_np.shape[0]
-
-            return embedding_np
-        else:
+        try:
             # For HuggingFace models
             inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
+
+            # Move inputs to same device as model
+            device = next(self.model.parameters()).device
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+
             with torch.no_grad():
                 outputs = self.model(**inputs)
 
@@ -318,6 +209,9 @@ class ResumeScreener:
                 self.embedding_size = embedding_np.shape[0]
 
             return embedding_np
+        except Exception as e:
+            st.error(f"Error generating embedding: {str(e)}")
+            return np.zeros(768)  # Default embedding size as fallback
 
     def create_faiss_index(self, embeddings):
         """Create a FAISS index for fast similarity search"""
@@ -572,13 +466,8 @@ def get_huggingface_spaces_datasets():
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
 
-# Initialize the resume screener at startup
-if st.session_state.screener is None:
-    with st.spinner("Initializing Resume Screener..."):
-        screener = ResumeScreener(embedding_model_name, explanation_model_name, load_immediately=True)
-        st.session_state.screener = screener
-else:
-    screener = st.session_state.screener
+# Initialize the resume screener
+screener = ResumeScreener(embedding_model_name, explanation_model_name)
 
 # Job description input
 st.header("1. Enter Job Description")
@@ -902,9 +791,6 @@ elif upload_option == "Upload from Dataset":
 
 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
-    with st.spinner("Loading embedding model..."):
-        screener.load_model()
-
    with st.spinner("Processing job description and resumes..."):
        # Get job description embedding
        job_embedding = screener.get_embedding(job_description)
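For reference, a minimal standalone sketch of the load-once-then-reuse pattern this diff adopts in app.py (illustrative only; the helper name and the MiniLM model ID below are hypothetical stand-ins, not part of the commit):

# Illustrative sketch: load a model once per process and reuse it everywhere.
# `load_model_once` and the model ID are hypothetical stand-ins.
from functools import lru_cache
from transformers import AutoModel, AutoTokenizer

@lru_cache(maxsize=1)
def load_model_once(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Load (tokenizer, model) a single time; later calls return the cached pair."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model

# Every caller shares the same objects instead of reloading per request.
tokenizer, model = load_model_once()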
explanation_generator.py CHANGED
@@ -6,151 +6,93 @@ using the QwQ-32B model from Hugging Face.
 """
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import os
 import re
 
+# Load QwQ model at initialization time
+print("Loading Qwen/QwQ-32B model with 4-bit quantization...")
+QWQ_MODEL_NAME = "Qwen/QwQ-32B"
+
+try:
+    # Configure 4-bit quantization for better performance
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True
+    )
+
+    # Load QwQ model and tokenizer
+    global_qwq_tokenizer = AutoTokenizer.from_pretrained(QWQ_MODEL_NAME, trust_remote_code=True)
+    global_qwq_model = None
+
+    # Check if we have enough resources to load the model
+    if torch.cuda.is_available():
+        gpu_memory = torch.cuda.get_device_properties(0).total_memory
+        if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+            global_qwq_model = AutoModelForCausalLM.from_pretrained(
+                QWQ_MODEL_NAME,
+                quantization_config=quantization_config,
+                device_map="auto",
+                trust_remote_code=True,
+                torch_dtype=torch.float16
+            )
+            print("Successfully loaded QwQ-32B with 4-bit quantization")
+        else:
+            print("Not enough GPU memory, using template-based explanations")
+    else:
+        print("CUDA not available, using template-based explanations")
+
+except Exception as e:
+    print(f"Error loading QwQ-32B model: {str(e)}")
+    print("Falling back to template-based explanations.")
+    global_qwq_tokenizer = None
+    global_qwq_model = None
+
 class ExplanationGenerator:
-    def __init__(self, model_name="Qwen/QwQ-32B", load_immediately=True):
+    def __init__(self, model_name="Qwen/QwQ-32B"):
         """Initialize the explanation generator with the specified model"""
         self.model_name = model_name
-        self.model = None
-        self.tokenizer = None
-        self.text_generation_pipeline = None
-        self.initialized = False
+        # Use globally pre-loaded model and tokenizer
+        self.model = global_qwq_model
+        self.tokenizer = global_qwq_tokenizer
+        self.initialized = True
 
-        # Load model immediately if requested
-        if load_immediately:
-            self.load_model()
-
-    def load_model(self):
-        """Load the model and tokenizer if not already loaded"""
-        if not self.initialized:
+    def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
+        """Generate explanation for why a resume was ranked highly"""
+        # Use the model if it's available
+        if self.model is not None and self.tokenizer is not None:
             try:
-                print(f"Loading explanation model: {self.model_name}")
-
-                # Set up 4-bit quantization configuration
-                quantization_config = BitsAndBytesConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_compute_dtype=torch.bfloat16,
-                    bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type="nf4"
-                )
+                # Prepare prompt for QwQ-32B
+                prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
 
-                # Try using pipeline API for more efficient loading in Spaces
-                try:
-                    print("Attempting to load model with pipeline API...")
-                    self.text_generation_pipeline = pipeline(
-                        "text-generation",
-                        model=self.model_name,
-                        torch_dtype=torch.bfloat16,
-                        device_map="auto",
-                        trust_remote_code=True,
-                        quantization_config=quantization_config,
-                        model_kwargs={"attn_implementation": "eager"}  # Uses less memory
-                    )
-                    print(f"Successfully loaded {self.model_name} with pipeline API")
-                    # Pipeline includes both model and tokenizer
-                    self.tokenizer = self.text_generation_pipeline.tokenizer
-                    self.model = self.text_generation_pipeline.model
-                    self.initialized = True
-                    return
-                except Exception as pipe_e:
-                    print(f"Error loading with pipeline API: {str(pipe_e)}")
-                    print("Falling back to direct model loading...")
+                # Create messages for chat format
+                messages = [
+                    {"role": "user", "content": prompt}
+                ]
 
-                # Load tokenizer
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.model_name,
-                    trust_remote_code=True
+                # Apply chat template
+                text = self.tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
                 )
 
-                # Try to load model with 4-bit quantization
-                try:
-                    self.model = AutoModelForCausalLM.from_pretrained(
-                        self.model_name,
-                        device_map="auto",
-                        trust_remote_code=True,
-                        quantization_config=quantization_config
-                    )
-                    print(f"Successfully loaded {self.model_name} with 4-bit quantization")
-                except Exception as quant_e:
-                    print(f"Error loading with 4-bit quantization: {str(quant_e)}")
-                    print("Trying to load model with 8-bit quantization...")
-
-                    # Fall back to 8-bit or CPU if 4-bit fails
-                    if torch.cuda.is_available():
-                        self.model = AutoModelForCausalLM.from_pretrained(
-                            self.model_name,
-                            device_map="auto",
-                            trust_remote_code=True,
-                            load_in_8bit=True
-                        )
-                        print(f"Successfully loaded {self.model_name} with 8-bit quantization")
-                    else:
-                        # Fall back to template-based solution if no GPU
-                        self.model = None
-                        print(f"Warning: Loading {self.model_name} on CPU is not recommended. Using template-based explanations instead.")
+                # Tokenize
+                inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
 
-                self.initialized = True
-            except Exception as e:
-                print(f"Error loading explanation model: {str(e)}")
-                print("Falling back to template-based explanations.")
-                self.model = None
-                self.initialized = True
-
-    def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
-        """Generate explanation for why a resume was ranked highly"""
-        # Check if we need to load the model
-        if not self.initialized:
-            self.load_model()
-
-        # If the model is loaded and available, use it for generating explanations
-        if self.model is not None:
-            try:
-                # Prepare prompt for QwQ-32B
-                prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
+                # Generate response
+                output_ids = self.model.generate(
+                    **inputs,
+                    max_new_tokens=300,
+                    temperature=0.6,
+                    top_p=0.95,
+                    top_k=30
+                )
 
-                # Use pipeline API if available
-                if self.text_generation_pipeline is not None:
-                    outputs = self.text_generation_pipeline(
-                        prompt,
-                        max_new_tokens=300,
-                        temperature=0.6,
-                        top_p=0.95,
-                        top_k=30,
-                        do_sample=True,
-                        return_full_text=False
-                    )
-                    response = outputs[0]['generated_text']
-
-                else:
-                    # Create messages for chat format
-                    messages = [
-                        {"role": "user", "content": prompt}
-                    ]
-
-                    # Apply chat template
-                    text = self.tokenizer.apply_chat_template(
-                        messages,
-                        tokenize=False,
-                        add_generation_prompt=True
-                    )
-
-                    # Tokenize
-                    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
-
-                    # Generate response
-                    output_ids = self.model.generate(
-                        **inputs,
-                        max_new_tokens=300,
-                        temperature=0.6,
-                        top_p=0.95,
-                        top_k=30
-                    )
-
-                    # Decode the response
-                    response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+                # Decode the response
+                response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
                 # Clean up the response
                 cleaned_response = self._clean_response(response)
@@ -158,7 +100,7 @@ class ExplanationGenerator:
             return cleaned_response
 
         except Exception as e:
-            print(f"Error generating explanation with model: {str(e)}")
+            print(f"Error generating explanation with QwQ-32B: {str(e)}")
             # Fall back to template-based explanation
             return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
         else:
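A minimal sketch of the resource gate that decides between the QwQ-backed and template-based paths, assuming the same 16 GB threshold the diff above uses for the 4-bit quantized model (the helper name is a hypothetical stand-in, not part of the commit):

# Illustrative sketch: only use the large model when a CUDA GPU with enough
# memory is present; otherwise callers fall back to template-based explanations.
# `can_load_quantized_llm` is a hypothetical name.
import torch

def can_load_quantized_llm(min_gpu_gb: int = 16) -> bool:
    """Return True when a CUDA GPU with at least `min_gpu_gb` of memory exists."""
    if not torch.cuda.is_available():
        return False
    total = torch.cuda.get_device_properties(0).total_memory
    return total >= min_gpu_gb * (1024 ** 3)

print("use LLM explanations:", can_load_quantized_llm())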
requirements.txt CHANGED
@@ -17,4 +17,4 @@ tqdm==4.66.1
 huggingface-hub==0.25.0
 einops
 bitsandbytes>=0.41.0
-accelerate>=0.21.0
+accelerate>=0.23.0