Spaces:

jacob-c
/

Resume_Screener_and_Skill_Extractor

Paused

App Files Files Community

root committed on May 21

Commit

eee21aa

1 Parent(s): 1d62827

ss

Browse files

Files changed (4) hide show

alt_models.py +111 -0
app.py +54 -25
explanation_generator.py +75 -51
requirements.txt +1 -0

alt_models.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""
+Alternative model loading implementation without sys.modules patching
+"""
+import torch
+from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
+    """Load the embedding model with a try-except approach instead of module patching"""
+    try:
+        print(f"Loading embedding model {model_name}...")
+        # Create a simple Replicate class that may be needed
+        class Replicate(torch.nn.Module):
+            def __init__(self, module, num_replicas=1):
+                super().__init__()
+                self.module = module
+                self.num_replicas = num_replicas
+            def forward(self, *args, **kwargs):
+                return self.module(*args, **kwargs)
+        # Try the standard loading approach
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        model = AutoModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            device_map="auto"
+        )
+        print(f"Successfully loaded {model_name}")
+        return model, tokenizer
+    except Exception as e:
+        # If the first approach fails, try with module.__dict__
+        try:
+            print(f"First loading approach failed: {str(e)}")
+            print("Trying alternative loading approach...")
+            # Import the module
+            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+            # Dynamically get the module
+            model_class = AutoModel._MODEL_MAPPING[AutoModel._model_mapping[model_name]]
+            # Add Replicate to the module's namespace
+            model_class.__module_dict__ = {}
+            model_class.__module_dict__["Replicate"] = Replicate
+            # Try loading with the augmented namespace
+            model = model_class.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                device_map="auto"
+            )
+            print(f"Successfully loaded {model_name} with alternative approach")
+            return model, tokenizer
+        except Exception as e2:
+            print(f"Alternative loading approach also failed: {str(e2)}")
+            print(f"Could not load embedding model {model_name}")
+            return None, None
+def load_explanation_model(model_name="Qwen/QwQ-32B"):
+    """Load the explanation model with a try-except approach instead of module patching"""
+    try:
+        print(f"Loading explanation model {model_name}...")
+        # Configure 4-bit quantization for better performance
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True
+        )
+        # Create a simple Replicate class that may be needed
+        class Replicate(torch.nn.Module):
+            def __init__(self, module, num_replicas=1):
+                super().__init__()
+                self.module = module
+                self.num_replicas = num_replicas
+            def forward(self, *args, **kwargs):
+                return self.module(*args, **kwargs)
+        # Try the standard loading approach
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        # Check if we have enough resources to load the model
+        if torch.cuda.is_available():
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory
+            if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    quantization_config=quantization_config,
+                    device_map="auto",
+                    trust_remote_code=True,
+                    torch_dtype=torch.float16
+                )
+                print(f"Successfully loaded {model_name}")
+                return model, tokenizer
+            else:
+                print("Not enough GPU memory, using template-based explanations")
+                return None, tokenizer
+        else:
+            print("CUDA not available, using template-based explanations")
+            return None, tokenizer
+    except Exception as e:
+        print(f"Error loading explanation model: {str(e)}")
+        print("Falling back to template-based explanations.")
+        return None, None

app.py CHANGED Viewed

@@ -20,21 +20,45 @@ from docx import Document
 import csv
 import sys
-# Add Replicate class workaround
-class Replicate(torch.nn.Module):
-    """Workaround class for missing Replicate in NV-Embed and Qwen models"""
-    def __init__(self, module, num_replicas=1):
-        super().__init__()
-        self.module = module
-        self.num_replicas = num_replicas
-    def forward(self, *args, **kwargs):
-        return self.module(*args, **kwargs)
-# Add the class to Python's built-ins
-sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
-sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
-sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
 from explanation_generator import ExplanationGenerator
@@ -46,17 +70,22 @@ except LookupError:
 # Initialize embedding model at startup
 EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
-print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
-try:
-    # Load embedding model and tokenizer
-    global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
-    global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
-    print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
-except Exception as e:
-    print(f"Error loading embedding model: {str(e)}")
-    global_embedding_tokenizer = None
-    global_embedding_model = None
 # Set page configuration
 st.set_page_config(

 import csv
 import sys
+# Prefer the loader helpers from alt_models.py; otherwise fall back to the sys.modules workaround below
+try:
+    # A successful import selects the alt_models code path (USE_ALT_MODELS = True)
+    from alt_models import load_embedding_model, load_explanation_model
+    USE_ALT_MODELS = True
+except ImportError:
+    USE_ALT_MODELS = False
+    # alt_models could not be imported, so keep the original approach
+    # Pass-through stand-in that is attached as `Replicate` to the module entries below
+    class Replicate(torch.nn.Module):
+        """Pass-through wrapper used as a workaround for missing Replicate in NV-Embed and Qwen models"""
+        def __init__(self, module, num_replicas=1):
+            super().__init__()
+            self.module = module
+            self.num_replicas = num_replicas  # stored only; forward() never reads it
+        def forward(self, *args, **kwargs):
+            return self.module(*args, **kwargs)  # delegate unchanged to the wrapped module
+    # Register placeholder entries in sys.modules for any path that is not already present
+    # NV-Embed module path
+    if "transformers.models.nvembed.modeling_nvembed" not in sys.modules:
+        # Placeholder for the parent package, created only when absent
+        if "transformers.models.nvembed" not in sys.modules:
+            sys.modules["transformers.models.nvembed"] = type('', (), {})  # NOTE(review): an empty class, not a types.ModuleType - confirm importers accept it
+        # Placeholder for the modeling module itself
+        sys.modules["transformers.models.nvembed.modeling_nvembed"] = type('', (), {})
+    # Qwen2 module path
+    if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
+        # Placeholder for the parent package, created only when absent
+        if "transformers.models.qwen2" not in sys.modules:
+            sys.modules["transformers.models.qwen2"] = type('', (), {})  # NOTE(review): same empty-class placeholder as above
+        # Placeholder for the modeling module itself
+        sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
+    # Attach Replicate as an attribute on both entries (real module or placeholder)
+    sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
+    sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
 from explanation_generator import ExplanationGenerator
 # Initialize embedding model at startup
 EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
+if USE_ALT_MODELS:
+    # alt_models loader returns (model, tokenizer); both are None when loading failed
+    global_embedding_model, global_embedding_tokenizer = load_embedding_model(EMBEDDING_MODEL_NAME)
+else:
+    # Original in-line loading path, used when alt_models could not be imported
+    print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
+    try:
+        # Tokenizer first, then the model; both opt in to the repository's remote code
+        global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
+        global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
+        print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
+    except Exception as e:
+        print(f"Error loading embedding model: {str(e)}")
+        global_embedding_tokenizer = None  # both globals are reset to None on any failure
+        global_embedding_model = None
 # Set page configuration
 st.set_page_config(

explanation_generator.py CHANGED Viewed

@@ -11,64 +11,88 @@ import os
 import re
 import sys
-# Add Replicate class workaround if not already defined
 try:
-    from transformers.models.qwen2.modeling_qwen2 import Replicate
-except (ImportError, AttributeError):
-    class Replicate(torch.nn.Module):
-        """Workaround class for missing Replicate in Qwen models"""
-        def __init__(self, module, num_replicas=1):
-            super().__init__()
-            self.module = module
-            self.num_replicas = num_replicas
-        def forward(self, *args, **kwargs):
-            return self.module(*args, **kwargs)
-    # Add the class to modules
-    if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
-        sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
-    sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
 # Load QwQ model at initialization time
 print("Loading Qwen/QwQ-32B model with 4-bit quantization...")
 QWQ_MODEL_NAME = "Qwen/QwQ-32B"
-try:
-    # Configure 4-bit quantization for better performance
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_use_double_quant=True
-    )
-    # Load QwQ model and tokenizer
-    global_qwq_tokenizer = AutoTokenizer.from_pretrained(QWQ_MODEL_NAME, trust_remote_code=True)
-    global_qwq_model = None
-    # Check if we have enough resources to load the model
-    if torch.cuda.is_available():
-        gpu_memory = torch.cuda.get_device_properties(0).total_memory
-        if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
-            global_qwq_model = AutoModelForCausalLM.from_pretrained(
-                QWQ_MODEL_NAME,
-                quantization_config=quantization_config,
-                device_map="auto",
-                trust_remote_code=True,
-                torch_dtype=torch.float16
-            )
-            print("Successfully loaded QwQ-32B with 4-bit quantization")
         else:
-            print("Not enough GPU memory, using template-based explanations")
-    else:
-        print("CUDA not available, using template-based explanations")
-except Exception as e:
-    print(f"Error loading QwQ-32B model: {str(e)}")
-    print("Falling back to template-based explanations.")
-    global_qwq_tokenizer = None
-    global_qwq_model = None
 class ExplanationGenerator:
     def __init__(self, model_name="Qwen/QwQ-32B"):

 import re
 import sys
+# Prefer load_explanation_model from alt_models.py; otherwise fall back to the Replicate workaround below
 try:
+    # A successful import selects the alt_models code path (USE_ALT_MODELS = True)
+    from alt_models import load_explanation_model
+    USE_ALT_MODELS = True
+except ImportError:
+    USE_ALT_MODELS = False
+    # alt_models could not be imported, so keep the original approach
+    # Reuse transformers' own Replicate when it exists; define a stand-in only if it does not
+    try:
+        from transformers.models.qwen2.modeling_qwen2 import Replicate
+    except (ImportError, AttributeError):
+        class Replicate(torch.nn.Module):
+            """Pass-through wrapper used as a workaround for missing Replicate in Qwen models"""
+            def __init__(self, module, num_replicas=1):
+                super().__init__()
+                self.module = module
+                self.num_replicas = num_replicas  # stored only; forward() never reads it
+            def forward(self, *args, **kwargs):
+                return self.module(*args, **kwargs)  # delegate unchanged to the wrapped module
+        # Register placeholder entries in sys.modules for any path that is not already present
+        parent_modules = [
+            "transformers.models",
+            "transformers.models.qwen2",
+        ]
+        # Insert a placeholder for each missing parent path (existing entries are left alone)
+        for module_path in parent_modules:
+            if module_path not in sys.modules:
+                sys.modules[module_path] = type('', (), {})  # NOTE(review): an empty class, not a types.ModuleType - confirm importers accept it
+        # Ensure the modeling module entry exists, then attach Replicate to it
+        if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
+            sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
+        sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
 # Load QwQ model at initialization time
 print("Loading Qwen/QwQ-32B model with 4-bit quantization...")
 QWQ_MODEL_NAME = "Qwen/QwQ-32B"
+if USE_ALT_MODELS:
+    # alt_models loader returns (model, tokenizer); the model is None when it was not loaded
+    global_qwq_model, global_qwq_tokenizer = load_explanation_model(QWQ_MODEL_NAME)
+else:
+    # Original in-line loading path, used when alt_models could not be imported
+    try:
+        # Configure 4-bit quantization for better performance
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True
+        )
+        # Tokenizer is always loaded; the model stays None unless the GPU checks below pass
+        global_qwq_tokenizer = AutoTokenizer.from_pretrained(QWQ_MODEL_NAME, trust_remote_code=True)
+        global_qwq_model = None
+        # Load the model only when CUDA is available and device 0 has enough total memory
+        if torch.cuda.is_available():
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory
+            if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+                global_qwq_model = AutoModelForCausalLM.from_pretrained(
+                    QWQ_MODEL_NAME,
+                    quantization_config=quantization_config,
+                    device_map="auto",
+                    trust_remote_code=True,
+                    torch_dtype=torch.float16
+                )
+                print("Successfully loaded QwQ-32B with 4-bit quantization")
+            else:
+                print("Not enough GPU memory, using template-based explanations")
         else:
+            print("CUDA not available, using template-based explanations")
+    except Exception as e:
+        print(f"Error loading QwQ-32B model: {str(e)}")
+        print("Falling back to template-based explanations.")
+        global_qwq_tokenizer = None  # both globals are reset to None on any failure
+        global_qwq_model = None
 class ExplanationGenerator:
     def __init__(self, model_name="Qwen/QwQ-32B"):

requirements.txt CHANGED Viewed

@@ -19,3 +19,4 @@ einops
 bitsandbytes>=0.41.0
 accelerate>=0.23.0
 optimum>=1.13.1

 bitsandbytes>=0.41.0
 accelerate>=0.23.0
 optimum>=1.13.1
+safetensors>=0.3.1