root committed on
Commit eee21aa · 1 Parent(s): 1d62827
Files changed (4)
  1. alt_models.py +111 -0
  2. app.py +54 -25
  3. explanation_generator.py +75 -51
  4. requirements.txt +1 -0
alt_models.py ADDED
@@ -0,0 +1,111 @@
+ """
+ Alternative model loading implementation without sys.modules patching
+ """
+
+ import torch
+ from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
+     """Load the embedding model with a try-except approach instead of module patching"""
+     try:
+         print(f"Loading embedding model {model_name}...")
+
+         # Create a simple Replicate class that may be needed
+         class Replicate(torch.nn.Module):
+             def __init__(self, module, num_replicas=1):
+                 super().__init__()
+                 self.module = module
+                 self.num_replicas = num_replicas
+
+             def forward(self, *args, **kwargs):
+                 return self.module(*args, **kwargs)
+
+         # Try the standard loading approach
+         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+         model = AutoModel.from_pretrained(
+             model_name,
+             trust_remote_code=True,
+             device_map="auto"
+         )
+
+         print(f"Successfully loaded {model_name}")
+         return model, tokenizer
+     except Exception as e:
+         # If the first approach fails, try with module.__dict__
+         try:
+             print(f"First loading approach failed: {str(e)}")
+             print("Trying alternative loading approach...")
+
+             # Import the module
+             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+             # Dynamically get the module
+             model_class = AutoModel._MODEL_MAPPING[AutoModel._model_mapping[model_name]]
+
+             # Add Replicate to the module's namespace
+             model_class.__module_dict__ = {}
+             model_class.__module_dict__["Replicate"] = Replicate
+
+             # Try loading with the augmented namespace
+             model = model_class.from_pretrained(
+                 model_name,
+                 trust_remote_code=True,
+                 device_map="auto"
+             )
+
+             print(f"Successfully loaded {model_name} with alternative approach")
+             return model, tokenizer
+         except Exception as e2:
+             print(f"Alternative loading approach also failed: {str(e2)}")
+             print(f"Could not load embedding model {model_name}")
+             return None, None
+
+ def load_explanation_model(model_name="Qwen/QwQ-32B"):
+     """Load the explanation model with a try-except approach instead of module patching"""
+     try:
+         print(f"Loading explanation model {model_name}...")
+
+         # Configure 4-bit quantization for better performance
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True
+         )
+
+         # Create a simple Replicate class that may be needed
+         class Replicate(torch.nn.Module):
+             def __init__(self, module, num_replicas=1):
+                 super().__init__()
+                 self.module = module
+                 self.num_replicas = num_replicas
+
+             def forward(self, *args, **kwargs):
+                 return self.module(*args, **kwargs)
+
+         # Try the standard loading approach
+         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+         # Check if we have enough resources to load the model
+         if torch.cuda.is_available():
+             gpu_memory = torch.cuda.get_device_properties(0).total_memory
+             if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+                 model = AutoModelForCausalLM.from_pretrained(
+                     model_name,
+                     quantization_config=quantization_config,
+                     device_map="auto",
+                     trust_remote_code=True,
+                     torch_dtype=torch.float16
+                 )
+                 print(f"Successfully loaded {model_name}")
+                 return model, tokenizer
+             else:
+                 print("Not enough GPU memory, using template-based explanations")
+                 return None, tokenizer
+         else:
+             print("CUDA not available, using template-based explanations")
+             return None, tokenizer
+     except Exception as e:
+         print(f"Error loading explanation model: {str(e)}")
+         print("Falling back to template-based explanations.")
+         return None, None
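For reference, a minimal usage sketch (not part of the commit) of how the two helpers above are meant to be called; the names mirror the functions added in alt_models.py, and the fallback check reflects their return values, where a None model signals the template-based path:

from alt_models import load_embedding_model, load_explanation_model

# Each helper returns (model, tokenizer); model is None when loading is not possible
embedding_model, embedding_tokenizer = load_embedding_model("nvidia/NV-Embed-v2")
qwq_model, qwq_tokenizer = load_explanation_model("Qwen/QwQ-32B")

if qwq_model is None:
    # load_explanation_model returns (None, tokenizer) when CUDA is missing or
    # GPU memory is below ~16 GB, so the caller switches to template-based explanations
    print("Using template-based explanations")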
app.py CHANGED
@@ -20,21 +20,45 @@ from docx import Document
  import csv
  import sys
 
- # Add Replicate class workaround
- class Replicate(torch.nn.Module):
-     """Workaround class for missing Replicate in NV-Embed and Qwen models"""
-     def __init__(self, module, num_replicas=1):
-         super().__init__()
-         self.module = module
-         self.num_replicas = num_replicas
-
-     def forward(self, *args, **kwargs):
-         return self.module(*args, **kwargs)
-
- # Add the class to Python's built-ins
- sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
- sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
- sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
+ # Use the alternative model loading approach
+ try:
+     # Try to import the functions from alt_models.py
+     from alt_models import load_embedding_model, load_explanation_model
+     USE_ALT_MODELS = True
+ except ImportError:
+     USE_ALT_MODELS = False
+     # If import fails, we'll use the original approach
+     # Add Replicate class workaround
+     class Replicate(torch.nn.Module):
+         """Workaround class for missing Replicate in NV-Embed and Qwen models"""
+         def __init__(self, module, num_replicas=1):
+             super().__init__()
+             self.module = module
+             self.num_replicas = num_replicas
+
+         def forward(self, *args, **kwargs):
+             return self.module(*args, **kwargs)
+
+     # Create module structure if it doesn't exist yet
+     # Handle NVIDIA module
+     if "transformers.models.nvembed.modeling_nvembed" not in sys.modules:
+         # Create parent modules if they don't exist
+         if "transformers.models.nvembed" not in sys.modules:
+             sys.modules["transformers.models.nvembed"] = type('', (), {})
+         # Create the module we need
+         sys.modules["transformers.models.nvembed.modeling_nvembed"] = type('', (), {})
+
+     # Handle Qwen module
+     if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
+         # Create parent modules if they don't exist
+         if "transformers.models.qwen2" not in sys.modules:
+             sys.modules["transformers.models.qwen2"] = type('', (), {})
+         # Create the module we need
+         sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
+
+     # Add the class to modules
+     sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
+     sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
 
  from explanation_generator import ExplanationGenerator
 
@@ -46,17 +70,22 @@ except LookupError:
 
  # Initialize embedding model at startup
  EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
- print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
 
- try:
-     # Load embedding model and tokenizer
-     global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
-     global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
-     print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
- except Exception as e:
-     print(f"Error loading embedding model: {str(e)}")
-     global_embedding_tokenizer = None
-     global_embedding_model = None
+ if USE_ALT_MODELS:
+     # Use the alternative loading approach
+     global_embedding_model, global_embedding_tokenizer = load_embedding_model(EMBEDDING_MODEL_NAME)
+ else:
+     # Use the original approach
+     print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
+     try:
+         # Load embedding model and tokenizer
+         global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
+         global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
+         print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
+     except Exception as e:
+         print(f"Error loading embedding model: {str(e)}")
+         global_embedding_tokenizer = None
+         global_embedding_model = None
 
  # Set page configuration
  st.set_page_config(
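A side note on the fallback branch above: type('', (), {}) registers a bare class object as a stand-in module so that Replicate can be attached to it before trust_remote_code loading. A small sketch of the same idea using the standard-library types.ModuleType; the helper name ensure_module is illustrative and not from the commit:

import sys
import types

def ensure_module(dotted_path):
    """Create placeholder modules for dotted_path and all of its parents if missing."""
    parts = dotted_path.split(".")
    for i in range(1, len(parts) + 1):
        name = ".".join(parts[:i])
        if name not in sys.modules:
            sys.modules[name] = types.ModuleType(name)
    return sys.modules[dotted_path]

# Example: make sure the Qwen2 modeling module exists before patching in Replicate
qwen2_modeling = ensure_module("transformers.models.qwen2.modeling_qwen2")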
explanation_generator.py CHANGED
@@ -11,64 +11,88 @@ import os
  import re
  import sys
 
- # Add Replicate class workaround if not already defined
+ # Use the alternative model loading approach
  try:
-     from transformers.models.qwen2.modeling_qwen2 import Replicate
- except (ImportError, AttributeError):
-     class Replicate(torch.nn.Module):
-         """Workaround class for missing Replicate in Qwen models"""
-         def __init__(self, module, num_replicas=1):
-             super().__init__()
-             self.module = module
-             self.num_replicas = num_replicas
-
-         def forward(self, *args, **kwargs):
-             return self.module(*args, **kwargs)
-
-     # Add the class to modules
-     if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
-         sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
-     sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
+     # Try to import the functions from alt_models.py
+     from alt_models import load_explanation_model
+     USE_ALT_MODELS = True
+ except ImportError:
+     USE_ALT_MODELS = False
+     # If import fails, we'll use the original approach
+     # Add Replicate class workaround if not already defined
+     try:
+         from transformers.models.qwen2.modeling_qwen2 import Replicate
+     except (ImportError, AttributeError):
+         class Replicate(torch.nn.Module):
+             """Workaround class for missing Replicate in Qwen models"""
+             def __init__(self, module, num_replicas=1):
+                 super().__init__()
+                 self.module = module
+                 self.num_replicas = num_replicas
+
+             def forward(self, *args, **kwargs):
+                 return self.module(*args, **kwargs)
+
+         # Create module structure if it doesn't exist yet
+         parent_modules = [
+             "transformers.models",
+             "transformers.models.qwen2",
+         ]
+
+         # Create all parent modules
+         for module_path in parent_modules:
+             if module_path not in sys.modules:
+                 sys.modules[module_path] = type('', (), {})
+
+         # Create and add the Replicate class
+         if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
+             sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
+         sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
 
  # Load QwQ model at initialization time
  print("Loading Qwen/QwQ-32B model with 4-bit quantization...")
  QWQ_MODEL_NAME = "Qwen/QwQ-32B"
 
- try:
-     # Configure 4-bit quantization for better performance
-     quantization_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.float16,
-         bnb_4bit_use_double_quant=True
-     )
-
-     # Load QwQ model and tokenizer
-     global_qwq_tokenizer = AutoTokenizer.from_pretrained(QWQ_MODEL_NAME, trust_remote_code=True)
-     global_qwq_model = None
-
-     # Check if we have enough resources to load the model
-     if torch.cuda.is_available():
-         gpu_memory = torch.cuda.get_device_properties(0).total_memory
-         if gpu_memory >= 16 * (1024**3): # 16 GB (reduced thanks to quantization)
-             global_qwq_model = AutoModelForCausalLM.from_pretrained(
-                 QWQ_MODEL_NAME,
-                 quantization_config=quantization_config,
-                 device_map="auto",
-                 trust_remote_code=True,
-                 torch_dtype=torch.float16
-             )
-             print("Successfully loaded QwQ-32B with 4-bit quantization")
+ if USE_ALT_MODELS:
+     # Use the alternative loading approach
+     global_qwq_model, global_qwq_tokenizer = load_explanation_model(QWQ_MODEL_NAME)
+ else:
+     # Use original approach
+     try:
+         # Configure 4-bit quantization for better performance
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True
+         )
+
+         # Load QwQ model and tokenizer
+         global_qwq_tokenizer = AutoTokenizer.from_pretrained(QWQ_MODEL_NAME, trust_remote_code=True)
+         global_qwq_model = None
+
+         # Check if we have enough resources to load the model
+         if torch.cuda.is_available():
+             gpu_memory = torch.cuda.get_device_properties(0).total_memory
+             if gpu_memory >= 16 * (1024**3): # 16 GB (reduced thanks to quantization)
+                 global_qwq_model = AutoModelForCausalLM.from_pretrained(
+                     QWQ_MODEL_NAME,
+                     quantization_config=quantization_config,
+                     device_map="auto",
+                     trust_remote_code=True,
+                     torch_dtype=torch.float16
+                 )
+                 print("Successfully loaded QwQ-32B with 4-bit quantization")
+             else:
+                 print("Not enough GPU memory, using template-based explanations")
          else:
-             print("Not enough GPU memory, using template-based explanations")
-     else:
-         print("CUDA not available, using template-based explanations")
-
- except Exception as e:
-     print(f"Error loading QwQ-32B model: {str(e)}")
-     print("Falling back to template-based explanations.")
-     global_qwq_tokenizer = None
-     global_qwq_model = None
+             print("CUDA not available, using template-based explanations")
+
+     except Exception as e:
+         print(f"Error loading QwQ-32B model: {str(e)}")
+         print("Falling back to template-based explanations.")
+         global_qwq_tokenizer = None
+         global_qwq_model = None
 
  class ExplanationGenerator:
      def __init__(self, model_name="Qwen/QwQ-32B"):
requirements.txt CHANGED
@@ -19,3 +19,4 @@ einops
  bitsandbytes>=0.41.0
  accelerate>=0.23.0
  optimum>=1.13.1
+ safetensors>=0.3.1