drmasad committed (verified)
Commit 50b517b · 1 Parent(s): c69da53

Update app.py

Files changed (1)
  1. app.py +20 -23
app.py CHANGED
@@ -46,29 +46,25 @@ def load_model(selected_model_name):
     st.info("Loading the model, please wait...")
     model_name = model_links[selected_model_name]
 
-    # Configure the quantization and device settings
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
-        bnb_4bit_use_double_quant=False,
-        llm_int8_enable_fp32_cpu_offload=True,
-    )
-
-    # Device map to specify where each component should reside
-    device_map = {
-        'encoder': 'cuda',  # or 'cpu' if reducing GPU load is crucial
-        'decoder': 'cpu',
-        'embed_tokens': 'cpu'
-    }
-
-    # Load the model with the specified device map and quantization config
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map=device_map,
-        trust_remote_code=True,
-    )
+    # Load the model without a device map
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    # Check the availability of CUDA
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # Manually move the model to the device
+    model = model.to(device)
+
+    # Apply quantization configuration if required
+    if device == 'cuda':  # Only apply BitsAndBytesConfig if on CUDA
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=False,
+            llm_int8_enable_fp32_cpu_offload=False,
+        )
+        # Assume quantization applies here, adjust as per actual use case
+        # model = apply_quantization(model, bnb_config)
 
     model.config.use_cache = False
     model = prepare_model_for_kbit_training(model)
@@ -93,6 +89,7 @@ def load_model(selected_model_name):
     return model, tokenizer
 
 
+
 # Load model and tokenizer
 model, tokenizer = load_model(selected_model)
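
Note (not part of the commit): the added code builds a BitsAndBytesConfig only after the model has been loaded and moved with .to(device), and the actual quantization step is left as a commented-out placeholder (apply_quantization). In the transformers library, bitsandbytes 4-bit quantization is applied while the weights are being loaded, by passing quantization_config to from_pretrained; it is not applied to an already-loaded model. A minimal sketch of that pattern, assuming the same model_name resolved from model_links (the helper name load_quantized is illustrative, not from this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_quantized(model_name: str):
    """Load a causal LM in 4-bit on GPU, or in full precision on CPU."""
    if torch.cuda.is_available():
        # The quantization config must be passed at load time; bitsandbytes
        # quantizes the weights as from_pretrained streams them in.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",  # let accelerate place the quantized weights
        )
    else:
        # No CUDA available: fall back to a plain full-precision CPU load.
        model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

With this pattern the explicit model.to(device) call is unnecessary: device_map="auto" (or the default CPU load in the else branch) already places the weights on the right device.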