Update app.py
app.py CHANGED

@@ -1,11 +1,16 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 import transformers
-from transformers import AutoTokenizer,GenerationConfig
+from transformers import AutoTokenizer,GenerationConfig, BitsAndBytesConfig
 import torch
 from peft import PeftModel
 import spaces
 import torch
+import bitsandbytes, accelerate
+
+print(transformers.__version__)   # Should be >= 4.26.0
+print(bitsandbytes.__version__)   # Should be >= 0.37.0
+print(accelerate.__version__)     # Should be >= 0.12.0
 
 num_gpus = torch.cuda.device_count()
 print(f"Number of available GPUs: {num_gpus}")
@@ -20,6 +25,12 @@ for i in range(num_gpus):
     print(f" Allocated Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
     print(f" Cached Memory: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
     print(f" Free Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9 - torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,                    # Enable 8-bit quantization
+    llm_int8_enable_fp32_cpu_offload=True # Enable FP32 CPU offloading
+)
+
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
@@ -88,8 +99,7 @@ def loadModel():
         torch_dtype=torch.float16,
         cache_dir=cache_dir,
         device_map="auto",
-
-        load_in_8bit_fp32_cpu_offload=True
+        quantization_config=quantization_config
     )
     #model = PeftModel.from_pretrained(
     #    model,