mamkkl committed (verified)
Commit 80ff3f3 · Parent(s): 5b76d0e

Update app.py

Files changed (1): app.py (+13 -3)
app.py CHANGED
@@ -1,11 +1,16 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 import transformers
-from transformers import AutoTokenizer,GenerationConfig
+from transformers import AutoTokenizer,GenerationConfig, BitsAndBytesConfig
 import torch
 from peft import PeftModel
 import spaces
 import torch
+import bitsandbytes, accelerate
+
+print(transformers.__version__)   # Should be >= 4.26.0
+print(bitsandbytes.__version__)   # Should be >= 0.37.0
+print(accelerate.__version__)     # Should be >= 0.12.0
 
 num_gpus = torch.cuda.device_count()
 print(f"Number of available GPUs: {num_gpus}")
@@ -20,6 +25,12 @@ for i in range(num_gpus):
     print(f" Allocated Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
     print(f" Cached Memory: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
     print(f" Free Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9 - torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,                      # Enable 8-bit quantization
+    llm_int8_enable_fp32_cpu_offload=True   # Enable FP32 CPU offloading
+)
+
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
@@ -88,8 +99,7 @@ def loadModel():
         torch_dtype=torch.float16,
         cache_dir=cache_dir,
         device_map="auto",
-        load_in_8bit=True,
-        load_in_8bit_fp32_cpu_offload=True
+        quantization_config=quantization_config
     )
     #model = PeftModel.from_pretrained(
     #    model,
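
The substance of the change: the old code passed load_in_8bit=True and load_in_8bit_fp32_cpu_offload=True directly to from_pretrained(), but load_in_8bit_fp32_cpu_offload is not a valid keyword there; the supported spelling is llm_int8_enable_fp32_cpu_offload on a BitsAndBytesConfig, which is then handed over via quantization_config. Below is a minimal sketch of the resulting load path, assuming the call is transformers' AutoModelForCausalLM.from_pretrained and using hypothetical base_model and cache_dir values (the real ones sit outside the hunks shown above):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hypothetical placeholders -- the actual ids/paths are defined elsewhere in app.py.
base_model = "your-org/your-base-model"
cache_dir = "./cache"

# 8-bit weights on GPU; any modules offloaded to CPU are kept in FP32
# instead of raising an error during loading.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

def loadModel():
    tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir=cache_dir)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        cache_dir=cache_dir,
        device_map="auto",   # let accelerate place layers across available GPUs/CPU
        quantization_config=quantization_config,
    )
    return model, tokenizer

Note that llm_int8_enable_fp32_cpu_offload only takes effect when the computed device_map actually places some modules on the CPU; it allows those modules to stay in FP32 rather than aborting the load. The version prints added at the top of the file match the minimum requirements the comments state for this API: transformers >= 4.26.0, bitsandbytes >= 0.37.0, accelerate >= 0.12.0.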