drmasad committed (verified)
Commit 50b517b · 1 Parent(s): c69da53

Update app.py

Files changed (1)
  1. app.py +20 -23
app.py CHANGED
@@ -46,29 +46,25 @@ def load_model(selected_model_name):
     st.info("Loading the model, please wait...")
     model_name = model_links[selected_model_name]
 
-    # Configure the quantization and device settings
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
-        bnb_4bit_use_double_quant=False,
-        llm_int8_enable_fp32_cpu_offload=True,
-    )
-
-    # Device map to specify where each component should reside
-    device_map = {
-        'encoder': 'cuda',  # or 'cpu' if reducing GPU load is crucial
-        'decoder': 'cpu',
-        'embed_tokens': 'cpu'
-    }
-
-    # Load the model with the specified device map and quantization config
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map=device_map,
-        trust_remote_code=True,
-    )
+    # Load the model without a device map
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    # Check the availability of CUDA
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # Manually move the model to the device
+    model = model.to(device)
+
+    # Apply quantization configuration if required
+    if device == 'cuda':  # Only apply BitsAndBytesConfig if on CUDA
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=False,
+            llm_int8_enable_fp32_cpu_offload=False,
+        )
+        # Assume quantization applies here, adjust as per actual use case
+        # model = apply_quantization(model, bnb_config)
 
     model.config.use_cache = False
     model = prepare_model_for_kbit_training(model)
@@ -93,6 +89,7 @@ def load_model(selected_model_name):
     return model, tokenizer
 
 
+
 # Load model and tokenizer
 model, tokenizer = load_model(selected_model)
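
Note (not part of the commit): the added code builds a BitsAndBytesConfig only after the model has been loaded and moved with .to(device), and the actual quantization step is left as a commented-out placeholder (apply_quantization). In the transformers library, bitsandbytes 4-bit quantization is applied while the weights are being loaded, by passing quantization_config to from_pretrained; it is not applied to an already-loaded model. A minimal sketch of that pattern, assuming the same model_name resolved from model_links (the helper name load_quantized is illustrative, not from this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_quantized(model_name: str):
    """Load a causal LM in 4-bit on GPU, or in full precision on CPU."""
    if torch.cuda.is_available():
        # The quantization config must be passed at load time; bitsandbytes
        # quantizes the weights as from_pretrained streams them in.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",  # let accelerate place the quantized weights
        )
    else:
        # No CUDA available: fall back to a plain full-precision CPU load.
        model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

With this pattern the explicit model.to(device) call is unnecessary: device_map="auto" (or the default CPU load in the else branch) already places the weights on the right device.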