drmasad committed on
Commit
8dbaa52
·
verified ·
1 Parent(s): ee59722

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -49,24 +49,30 @@ st.sidebar.write(f"You're now chatting with **{selected_model}**")
49
  st.sidebar.markdown(model_info[selected_model]["description"])
50
  st.sidebar.image(model_info[selected_model]["logo"])
51
 
52
- # Load the appropriate model
53
  def load_model():
54
  model_name = model_links["HAH-2024-v0.1"]
55
  base_model = "mistralai/Mistral-7B-Instruct-v0.2"
56
 
57
- # Load model with quantization configuration
58
  bnb_config = BitsAndBytesConfig(
59
  load_in_4bit=True,
60
  bnb_4bit_quant_type="nf4",
61
  bnb_4bit_compute_dtype=torch.bfloat16,
62
  bnb_4bit_use_double_quant=False,
 
63
  )
64
 
 
 
 
 
 
 
65
  model = AutoModelForCausalLM.from_pretrained(
66
  model_name,
67
  quantization_config=bnb_config,
68
  torch_dtype=torch.bfloat16,
69
- device_map="auto",
70
  trust_remote_code=True,
71
  )
72
 
@@ -88,7 +94,6 @@ def load_model():
88
 
89
  return model, tokenizer
90
 
91
- model, tokenizer = load_model()
92
 
93
  # Initialize chat history
94
  if "messages" not in st.session_state:
 
49
  st.sidebar.markdown(model_info[selected_model]["description"])
50
  st.sidebar.image(model_info[selected_model]["logo"])
51
 
 
52
  def load_model():
53
  model_name = model_links["HAH-2024-v0.1"]
54
  base_model = "mistralai/Mistral-7B-Instruct-v0.2"
55
 
56
+ # Load model with quantization and device map configurations
57
  bnb_config = BitsAndBytesConfig(
58
  load_in_4bit=True,
59
  bnb_4bit_quant_type="nf4",
60
  bnb_4bit_compute_dtype=torch.bfloat16,
61
  bnb_4bit_use_double_quant=False,
62
+ llm_int8_enable_fp32_cpu_offload=True # Enable CPU offloading for certain parts
63
  )
64
 
65
+ # Custom device map to manage resource utilization
66
+ device_map = {
67
+ 'encoder': 'cuda', # Keep encoder on GPU
68
+ 'decoder': 'cpu', # Offload decoder to CPU if GPU RAM is insufficient
69
+ }
70
+
71
  model = AutoModelForCausalLM.from_pretrained(
72
  model_name,
73
  quantization_config=bnb_config,
74
  torch_dtype=torch.bfloat16,
75
+ device_map=device_map, # Apply custom device map
76
  trust_remote_code=True,
77
  )
78
 
 
94
 
95
  return model, tokenizer
96
 
 
97
 
98
  # Initialize chat history
99
  if "messages" not in st.session_state: