root committed · Commit ba2dfe6 · 1 Parent(s): 53cdf96

Files changed (3):
  1. app.py  +114 -13
  2. explanation_generator.py  +106 -49
  3. requirements.txt  +2 -0
app.py CHANGED
@@ -34,6 +34,38 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 
+# Hugging Face Spaces optimization
+RUNNING_ON_SPACES = os.environ.get('SPACE_ID') is not None
+if RUNNING_ON_SPACES:
+    st.sidebar.info("🚀 Running on Hugging Face Spaces")
+
+    # Set up cache directory structure
+    CACHE_DIR = os.path.join(os.getcwd(), ".cache")
+    HF_HOME = os.path.join(CACHE_DIR, "huggingface")
+    os.environ['TRANSFORMERS_CACHE'] = os.path.join(HF_HOME, "transformers")
+    os.environ['HF_HOME'] = HF_HOME
+    os.environ['HF_DATASETS_CACHE'] = os.path.join(HF_HOME, "datasets")
+
+    # Create cache directories if they don't exist
+    for dir_path in [CACHE_DIR, HF_HOME, os.environ['TRANSFORMERS_CACHE'], os.environ['HF_DATASETS_CACHE']]:
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+
+    # Use downloaded models if available (avoid downloading on every run)
+    os.environ['TRANSFORMERS_OFFLINE'] = '1'
+
+    # Spaces optimization flags
+    USE_PIPELINE = True
+    OPTIMIZE_MEMORY = True
+
+    # Print setup information
+    print(f"Running on Hugging Face Spaces: {os.environ.get('SPACE_ID')}")
+    print(f"Cache directory: {CACHE_DIR}")
+    print(f"HF Home: {HF_HOME}")
+else:
+    USE_PIPELINE = False
+    OPTIMIZE_MEMORY = False
+
 # Sidebar for model selection and weights
 with st.sidebar:
     st.title("Configuration")
@@ -63,9 +95,15 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)
 
+    # Hugging Face Spaces optimization options
+    if not RUNNING_ON_SPACES:
+        st.subheader("Hugging Face Spaces Optimization")
+        USE_PIPELINE = st.checkbox("Use pipeline API for faster loading", value=USE_PIPELINE)
+        OPTIMIZE_MEMORY = st.checkbox("Optimize memory usage", value=OPTIMIZE_MEMORY)
+
     # Memory optimization options
     st.subheader("Memory Optimization")
-    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=OPTIMIZE_MEMORY)
     clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
     gc_collect_interval = st.number_input(
         "Garbage collection interval (files)",
@@ -95,31 +133,59 @@ if 'faiss_index' not in st.session_state:
     st.session_state.faiss_index = None
 if 'explanation_generator' not in st.session_state:
     st.session_state.explanation_generator = None
+if 'screener' not in st.session_state:
+    st.session_state.screener = None
 
 class ResumeScreener:
-    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
+    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B", load_immediately=True):
         """Initialize the ResumeScreener with the specified embedding model"""
         self.embedding_model_name = embedding_model_name
         self.explanation_model_name = explanation_model_name
         self.model = None
         self.tokenizer = None
+        self.embedding_pipeline = None
         self.faiss_index = None
         self.embedding_size = None
         self.explanation_generator = None
 
+        # Load models immediately if requested
+        if load_immediately:
+            with st.spinner("Loading models at startup..."):
+                self.load_model()
+                if use_explanation:
+                    self.load_explanation_generator()
+
     def load_model(self):
         """Load the embedding model from Hugging Face"""
         if st.session_state.embedding_model is None:
             with st.spinner(f"Loading model {self.embedding_model_name}..."):
                 try:
-                    if "sentence-transformers" in self.embedding_model_name:
-                        self.model = SentenceTransformer(self.embedding_model_name)
-                    else:
-                        self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
-                        self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+                    # First try to use pipeline for more efficient loading
+                    try:
+                        from transformers import pipeline
+                        self.embedding_pipeline = pipeline(
+                            "feature-extraction",
+                            model=self.embedding_model_name,
+                            trust_remote_code=True,
+                            device_map="auto"
+                        )
+                        print(f"Successfully loaded {self.embedding_model_name} with pipeline API")
+                        self.model = self.embedding_pipeline.model
+                        self.tokenizer = self.embedding_pipeline.tokenizer
+                    except Exception as pipe_e:
+                        print(f"Error loading with pipeline API: {str(pipe_e)}")
+                        print("Falling back to direct model loading...")
+
+                        if "sentence-transformers" in self.embedding_model_name:
+                            self.model = SentenceTransformer(self.embedding_model_name)
+                        else:
+                            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+                            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
 
                     st.session_state.embedding_model = self.model
                     st.session_state.tokenizer = self.tokenizer
+                    if self.embedding_pipeline:
+                        st.session_state.embedding_pipeline = self.embedding_pipeline
 
                     # Get embedding size
                     if "sentence-transformers" in self.embedding_model_name:
@@ -134,11 +200,18 @@ class ResumeScreener:
         else:
             self.model = st.session_state.embedding_model
             self.tokenizer = st.session_state.tokenizer
-
-        # Initialize explanation generator if needed
+            if 'embedding_pipeline' in st.session_state:
+                self.embedding_pipeline = st.session_state.embedding_pipeline
+
+    def load_explanation_generator(self):
+        """Load the explanation generator if needed"""
         if use_explanation and st.session_state.explanation_generator is None:
-            st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
-            self.explanation_generator = st.session_state.explanation_generator
+            with st.spinner(f"Loading explanation model {self.explanation_model_name}..."):
+                st.session_state.explanation_generator = ExplanationGenerator(
+                    self.explanation_model_name,
+                    load_immediately=True
+                )
+                self.explanation_generator = st.session_state.explanation_generator
         elif use_explanation:
            self.explanation_generator = st.session_state.explanation_generator
 
@@ -186,6 +259,29 @@ class ResumeScreener:
 
     def get_embedding(self, text):
         """Generate text embedding for a given text"""
+        # Try using pipeline first if available
+        if self.embedding_pipeline:
+            try:
+                # Pipeline returns list of list of embeddings, we want just one vector
+                embeddings = self.embedding_pipeline(
+                    text,
+                    padding=True,
+                    truncation=True,
+                    max_length=512
+                )
+                # Mean pooling across token dimension for BERT-like models
+                embedding_np = np.mean(embeddings[0], axis=0)
+
+                # Set embedding size if not set
+                if self.embedding_size is None:
+                    self.embedding_size = embedding_np.shape[0]
+
+                return embedding_np
+            except Exception as e:
+                print(f"Error using embedding pipeline: {str(e)}")
+                print("Falling back to direct embedding method...")
+
+        # Fall back to original method
         if "sentence-transformers" in self.embedding_model_name:
             # For sentence-transformers models
             embedding = self.model.encode([text], convert_to_tensor=True, show_progress_bar=False)[0]
@@ -476,8 +572,13 @@ def get_huggingface_spaces_datasets():
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
 
-# Initialize the resume screener
-screener = ResumeScreener(embedding_model_name, explanation_model_name)
+# Initialize the resume screener at startup
+if st.session_state.screener is None:
+    with st.spinner("Initializing Resume Screener..."):
+        screener = ResumeScreener(embedding_model_name, explanation_model_name, load_immediately=True)
+        st.session_state.screener = screener
+else:
+    screener = st.session_state.screener
 
 # Job description input
 st.header("1. Enter Job Description")
explanation_generator.py CHANGED
@@ -6,52 +6,95 @@ using the QwQ-32B model from Hugging Face.
 """
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
 import os
 import re
 
 class ExplanationGenerator:
-    def __init__(self, model_name="Qwen/QwQ-32B"):
+    def __init__(self, model_name="Qwen/QwQ-32B", load_immediately=True):
         """Initialize the explanation generator with the specified model"""
         self.model_name = model_name
         self.model = None
         self.tokenizer = None
+        self.text_generation_pipeline = None
         self.initialized = False
 
+        # Load model immediately if requested
+        if load_immediately:
+            self.load_model()
+
     def load_model(self):
         """Load the model and tokenizer if not already loaded"""
         if not self.initialized:
             try:
-                # Check if we have enough VRAM for loading the model
-                if torch.cuda.is_available():
-                    gpu_memory = torch.cuda.get_device_properties(0).total_memory
-                    # QwQ-32B requires at least 32GB VRAM for full precision
-                    if gpu_memory >= 32 * (1024**3):  # 32 GB
-                        device = "cuda"
-                    else:
-                        device = "cpu"
-                else:
-                    device = "cpu"
+                print(f"Loading explanation model: {self.model_name}")
+
+                # Set up 4-bit quantization configuration
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4"
+                )
+
+                # Try using pipeline API for more efficient loading in Spaces
+                try:
+                    print("Attempting to load model with pipeline API...")
+                    self.text_generation_pipeline = pipeline(
+                        "text-generation",
+                        model=self.model_name,
+                        torch_dtype=torch.bfloat16,
+                        device_map="auto",
+                        trust_remote_code=True,
+                        quantization_config=quantization_config,
+                        model_kwargs={"attn_implementation": "eager"}  # Uses less memory
+                    )
+                    print(f"Successfully loaded {self.model_name} with pipeline API")
+                    # Pipeline includes both model and tokenizer
+                    self.tokenizer = self.text_generation_pipeline.tokenizer
+                    self.model = self.text_generation_pipeline.model
+                    self.initialized = True
+                    return
+                except Exception as pipe_e:
+                    print(f"Error loading with pipeline API: {str(pipe_e)}")
+                    print("Falling back to direct model loading...")
 
                 # Load tokenizer
-                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.model_name,
+                    trust_remote_code=True
+                )
 
-                # Load model based on available resources
-                if device == "cuda":
+                # Try to load model with 4-bit quantization
+                try:
                     self.model = AutoModelForCausalLM.from_pretrained(
                         self.model_name,
-                        torch_dtype=torch.bfloat16,
                         device_map="auto",
-                        trust_remote_code=True
+                        trust_remote_code=True,
+                        quantization_config=quantization_config
                     )
-                else:
-                    # Fall back to a simpler template-based solution if we can't load the model
-                    self.model = None
-                    print("Warning: Loading QwQ-32B on CPU is not recommended. Using template-based explanations instead.")
+                    print(f"Successfully loaded {self.model_name} with 4-bit quantization")
+                except Exception as quant_e:
+                    print(f"Error loading with 4-bit quantization: {str(quant_e)}")
+                    print("Trying to load model with 8-bit quantization...")
+
+                    # Fall back to 8-bit or CPU if 4-bit fails
+                    if torch.cuda.is_available():
+                        self.model = AutoModelForCausalLM.from_pretrained(
+                            self.model_name,
+                            device_map="auto",
+                            trust_remote_code=True,
+                            load_in_8bit=True
+                        )
+                        print(f"Successfully loaded {self.model_name} with 8-bit quantization")
+                    else:
+                        # Fall back to template-based solution if no GPU
+                        self.model = None
+                        print(f"Warning: Loading {self.model_name} on CPU is not recommended. Using template-based explanations instead.")
 
                 self.initialized = True
             except Exception as e:
-                print(f"Error loading QwQ-32B model: {str(e)}")
+                print(f"Error loading explanation model: {str(e)}")
                 print("Falling back to template-based explanations.")
                 self.model = None
                 self.initialized = True
@@ -68,32 +111,46 @@ class ExplanationGenerator:
                 # Prepare prompt for QwQ-32B
                 prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
 
-                # Create messages for chat format
-                messages = [
-                    {"role": "user", "content": prompt}
-                ]
-
-                # Apply chat template
-                text = self.tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-
-                # Tokenize
-                inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
-
-                # Generate response
-                output_ids = self.model.generate(
-                    **inputs,
-                    max_new_tokens=300,
-                    temperature=0.6,
-                    top_p=0.95,
-                    top_k=30
-                )
-
-                # Decode the response
-                response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+                # Use pipeline API if available
+                if self.text_generation_pipeline is not None:
+                    outputs = self.text_generation_pipeline(
+                        prompt,
+                        max_new_tokens=300,
+                        temperature=0.6,
+                        top_p=0.95,
+                        top_k=30,
+                        do_sample=True,
+                        return_full_text=False
+                    )
+                    response = outputs[0]['generated_text']
+
+                else:
+                    # Create messages for chat format
+                    messages = [
+                        {"role": "user", "content": prompt}
+                    ]
+
+                    # Apply chat template
+                    text = self.tokenizer.apply_chat_template(
+                        messages,
+                        tokenize=False,
+                        add_generation_prompt=True
+                    )
+
+                    # Tokenize
+                    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
+
+                    # Generate response
+                    output_ids = self.model.generate(
+                        **inputs,
+                        max_new_tokens=300,
+                        temperature=0.6,
+                        top_p=0.95,
+                        top_k=30
+                    )
+
+                    # Decode the response
+                    response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
                 # Clean up the response
                 cleaned_response = self._clean_response(response)
@@ -101,7 +158,7 @@ class ExplanationGenerator:
                 return cleaned_response
 
             except Exception as e:
-                print(f"Error generating explanation with QwQ-32B: {str(e)}")
+                print(f"Error generating explanation with model: {str(e)}")
                 # Fall back to template-based explanation
                 return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
         else:
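The quantized-loading path above rests on BitsAndBytesConfig plus device_map="auto". Below is a minimal sketch of that pattern under stated assumptions: the model name is a small stand-in for a smoke test (the app's default Qwen/QwQ-32B needs on the order of 16 GB of GPU memory for weights alone even in 4-bit, which is why the template fallback stays in place), and a CUDA GPU with bitsandbytes and accelerate installed is assumed. If a given transformers version rejects quantization_config as a top-level pipeline() argument, the inner except in load_model() falls through to this direct-loading route anyway.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights as 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bf16
)

model_name = "Qwen/Qwen2.5-0.5B-Instruct"   # assumption: any small causal LM works for a test
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                      # accelerate places layers on available devices
    quantization_config=quant_config,       # requires bitsandbytes + CUDA
)

prompt = "Explain in two sentences why a Python/Django resume fits a backend role:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=60, do_sample=True, temperature=0.6, top_p=0.95)
print(tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))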
requirements.txt CHANGED
@@ -16,3 +16,5 @@ numpy==1.24.3
 tqdm==4.66.1
 huggingface-hub==0.25.0
 einops
+bitsandbytes>=0.41.0
+accelerate>=0.21.0
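Both new pins back the quantization path: bitsandbytes supplies the 4-bit/8-bit CUDA kernels behind BitsAndBytesConfig and load_in_8bit, and accelerate is what transformers uses to honor device_map="auto" when placing model layers on the available devices. On CPU-only hardware the quantized branches will typically fail and the code falls back to template-based explanations, so the extra packages are harmless there.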